6 hari lalu · e8cef4a441
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -134,23 +134,32 @@ class ClusterPoller:
 
				             return self.stats
			
 
				 
			
 
				         # 3b. Seen-articles filter: drop articles whose URL was already
			
 
				-        # processed in any previous cycle.  This is the strongest dedup —
			
 
				-        # an article key (derived from URL) in seen_articles means it has
			
 
				-        # already been clustered and enriched, so re-processing is pure waste.
			
 
				-        new_articles, already_seen = self.store.filter_already_seen(articles)
			
 
				-        if already_seen:
			
 
				-            # Attribute seen articles to their feeds for stats
			
 
				-            for a in already_seen:
			
 
				+        # processed with the same content.  Three outcomes:
			
 
				+        #   new → never seen, full clustering + enrichment
			
 
				+        #   seen_unchanged → same URL, same content hash → skip entirely
			
 
				+        #   seen_changed → same URL, different content → re-cluster to update
			
 
				+        #                  the existing cluster (triggers re-enrichment)
			
 
				+        new_articles, seen_unchanged, seen_changed = self.store.filter_already_seen(articles)
			
 
				+        changed_cluster_ids: set[str] = set()
			
 
				+        if seen_unchanged or seen_changed:
			
 
				+            for a in seen_unchanged:
			
 
				                 fu = a.get("feed_url", "")
			
 
				                 for fs in feed_stats:
			
 
				                     if fs.feed_url == fu:
			
 
				                         fs.seen += 1
			
 
				                         break
			
 
				+            for a in seen_changed:
			
 
				+                fu = a.get("feed_url", "")
			
 
				+                for fs in feed_stats:
			
 
				+                    if fs.feed_url == fu:
			
 
				+                        fs.seen += 1  # still "seen" (by URL), but content changed
			
 
				+                        break
			
 
				             self.logger.info(
			
 
				-                "seen_articles: total=%d new=%d already_seen=%d",
			
 
				-                len(articles), len(new_articles), len(already_seen),
			
 
				+                "seen_articles: total=%d new=%d unchanged=%d changed=%d",
			
 
				+                len(articles), len(new_articles), len(seen_unchanged), len(seen_changed),
			
 
				             )
			
 
				-        articles = new_articles
			
 
				+        # Merge changed articles with new ones for clustering
			
 
				+        articles = new_articles + seen_changed
			
 
				 
			
 
				         if not articles:
			
 
				             self.logger.info("poll: all articles already seen (nothing new to cluster)")
			
@@ -159,6 +168,26 @@ class ClusterPoller:
 
				             self._prune_and_finalize(enabled_urls, feed_map)
			
 
				             return self.stats
			
 
				 
			
 
				+        # 3c. For changed-content articles, clear enriched_at on their existing
			
 
				+        # clusters so the next enrichment cycle re-processes them with the
			
 
				+        # updated article data.
			
 
				+        if seen_changed:
			
 
				+            from news_mcp.article_identity import article_key as _ak
			
 
				+            changed_keys = {_ak(a) for a in seen_changed}
			
 
				+            with self.store._conn() as conn:
			
 
				+                for ak in changed_keys:
			
 
				+                    row = conn.execute(
			
 
				+                        "SELECT cluster_id FROM seen_articles WHERE article_key=?", (ak,)
			
 
				+                    ).fetchone()
			
 
				+                    if row:
			
 
				+                        changed_cluster_ids.add(row[0])
			
 
				+                        conn.execute(
			
 
				+                            "UPDATE clusters SET enriched_at=NULL WHERE cluster_id=?",
			
 
				+                            (row[0],),
			
 
				+                        )
			
 
				+            if changed_cluster_ids:
			
 
				+                self.logger.info("content_changed: clusters=%d will re-enrich", len(changed_cluster_ids))
			
 
				+
			
 
				         # 4. Pre-seed existing clusters for cross-cycle merging
			
 
				         existing_clusters = self._preseed_clusters()
			
 
				 
			
--- a/news_mcp/storage/sqlite_store.py
+++ b/news_mcp/storage/sqlite_store.py
@@ -261,13 +261,19 @@ class SQLiteClusterStore:
 
				             conn.execute(
			
 
				                 """
			
 
				                 CREATE TABLE IF NOT EXISTS seen_articles (
			
 
				-                    article_key TEXT PRIMARY KEY,
			
 
				-                    cluster_id  TEXT NOT NULL,
			
 
				-                    first_seen  TEXT NOT NULL,
			
 
				-                    url         TEXT NOT NULL DEFAULT ''
			
 
				+                    article_key  TEXT PRIMARY KEY,
			
 
				+                    cluster_id   TEXT NOT NULL,
			
 
				+                    first_seen   TEXT NOT NULL,
			
 
				+                    url          TEXT NOT NULL DEFAULT '',
			
 
				+                    content_hash TEXT NOT NULL DEFAULT ''
			
 
				                 )
			
 
				                 """
			
 
				             )
			
 
				+            # Migration: add content_hash column if missing (existing DBs)
			
 
				+            try:
			
 
				+                conn.execute("ALTER TABLE seen_articles ADD COLUMN content_hash TEXT NOT NULL DEFAULT ''")
			
 
				+            except sqlite3.OperationalError:
			
 
				+                pass  # expected case: column already exists. NOTE(review): any other OperationalError from the ALTER is silenced here too
			
 
				 
			
 
				             try:
			
 
				                 cur = conn.execute("PRAGMA table_info(entity_metadata)")
			
@@ -361,9 +367,12 @@ class SQLiteClusterStore:
 
				                     akey = _article_key(art)
			
 
				                     if akey:
			
 
				                         art_url = str(art.get("url") or "").strip()
			
 
				+                        from news_mcp.article_identity import article_content_hash as _chash
			
 
				+                        ahash = _chash(art)
			
 
				                         conn.execute(
			
 
				-                            "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
			
 
				-                            (akey, cluster_id, now.isoformat(), art_url),
			
 
				+                            "INSERT INTO seen_articles(article_key, cluster_id, first_seen, url, content_hash) VALUES(?,?,?,?,?) "
			
 
				+                            "ON CONFLICT(article_key) DO UPDATE SET cluster_id=excluded.cluster_id, url=excluded.url, content_hash=excluded.content_hash",
			
 
				+                            (akey, cluster_id, now.isoformat(), art_url, ahash),
			
 
				                         )
			
 
				 
			
 
				     def upsert_cluster_summary(
			
@@ -519,31 +528,40 @@ class SQLiteClusterStore:
 
				     #  Seen-articles: skip already-processed articles at ingestion
			
 
				     # ------------------------------------------------------------------ #
			
 
				 
			
 
				-    def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict]]:
			
 
				-        """Split articles into (new, already_seen) based on seen_articles table.
			
 
				+    def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict], list[dict]]:
			
 
				+        """Split articles into (new, seen_unchanged, seen_changed) based on seen_articles.
			
 
				 
			
 
				-        Uses _article_key (derived from URL) as the identity check.
			
 
				-        Returns two lists: articles never seen before, and articles already
			
 
				-        processed in a previous cycle.
			
 
				+        Uses _article_key (URL) as identity and article_content_hash to detect
			
 
				+        in-place content updates (e.g. a stub that gets fleshed out).
			
 
				+
			
 
				+        Returns:
			
 
				+            new_articles: never seen before → full clustering + enrichment
			
 
				+            seen_unchanged: same key, same content hash → skip entirely
			
 
				+            seen_changed: same key, different content hash → re-cluster to update
			
 
				+                       the existing cluster payload (will trigger re-enrichment)
			
 
				         """
			
 
				+        from news_mcp.article_identity import article_content_hash as _content_hash
			
 
				         keys = [_article_key(a) for a in articles]
			
 
				         if not keys:
			
 
				-            return [], []
			
 
				+            return [], [], []
			
 
				         with self._conn() as conn:
			
 
				             placeholders = ",".join("?" for _ in keys)
			
 
				             cur = conn.execute(
			
 
				-                f"SELECT article_key FROM seen_articles WHERE article_key IN ({placeholders})",
			
 
				+                f"SELECT article_key, content_hash FROM seen_articles WHERE article_key IN ({placeholders})",
			
 
				                 keys,
			
 
				             )
			
 
				-            seen_set = {row[0] for row in cur.fetchall()}
			
 
				+            seen_map = {row[0]: row[1] for row in cur.fetchall()}
			
 
				         new_articles = []
			
 
				-        seen_articles = []
			
 
				+        seen_unchanged = []
			
 
				+        seen_changed = []
			
 
				         for art, key in zip(articles, keys):
			
 
				-            if key in seen_set:
			
 
				-                seen_articles.append(art)
			
 
				-            else:
			
 
				+            if key not in seen_map:
			
 
				                 new_articles.append(art)
			
 
				-        return new_articles, seen_articles
			
 
				+            elif seen_map[key] == _content_hash(art):
			
 
				+                seen_unchanged.append(art)
			
 
				+            else:
			
 
				+                seen_changed.append(art)
			
 
				+        return new_articles, seen_unchanged, seen_changed
			
 
				 
			
 
				     def get_seen_article_count(self) -> int:
			
 
				         """Total rows in seen_articles (for diagnostics)."""
			
--- a/test_news_mcp.py
+++ b/test_news_mcp.py
@@ -377,7 +377,7 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
 
				             return []
			
 
				 
			
 
				         def filter_already_seen(self, articles):
			
 
				-            return articles, []
			
 
				+            return articles, [], []
			
 
				 
			
 
				         def set_meta(self, key, value):
			
 
				             self.meta[key] = value
			
@@ -638,7 +638,7 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
				             return []
			
 
				 
			
 
				         def filter_already_seen(self, articles):
			
 
				-            return articles, []
			
 
				+            return articles, [], []
			
 
				 
			
 
				         def set_meta(self, key, value):
			
 
				             pass