Jelajahi Sumber

feat: detect in-place article content updates via content hash

Problem: FT (and other publishers) update article content in place at the
same URL. Because the seen_articles filter was keyed on URL alone, those
updated articles were skipped entirely, so stubs like 'More to come...' were
never refreshed.

Solution:
- content_hash column in seen_articles (SHA-1 of title+summary)
- filter_already_seen() now returns 3 lists: new, seen_unchanged, seen_changed
- changed-content articles are re-clustered into their existing cluster
- enriched_at is cleared on the clusters that contain changed articles, so they get re-enriched
- content_hash is updated in seen_articles on every upsert
- migration: ALTER TABLE adds the column on existing DBs
Lukas Goldschmidt 6 hari lalu
induk
melakukan
e8cef4a441
3 mengubah file dengan 78 tambahan dan 31 penghapusan
  1. 39 10
      news_mcp/jobs/poller.py
  2. 37 19
      news_mcp/storage/sqlite_store.py
  3. 2 2
      test_news_mcp.py

+ 39 - 10
news_mcp/jobs/poller.py

@@ -134,23 +134,32 @@ class ClusterPoller:
             return self.stats
 
         # 3b. Seen-articles filter: drop articles whose URL was already
-        # processed in any previous cycle.  This is the strongest dedup —
-        # an article key (derived from URL) in seen_articles means it has
-        # already been clustered and enriched, so re-processing is pure waste.
-        new_articles, already_seen = self.store.filter_already_seen(articles)
-        if already_seen:
-            # Attribute seen articles to their feeds for stats
-            for a in already_seen:
+        # processed with the same content.  Three outcomes:
+        #   new → never seen, full clustering + enrichment
+        #   seen_unchanged → same URL, same content hash → skip entirely
+        #   seen_changed → same URL, different content → re-cluster to update
+        #                  the existing cluster (triggers re-enrichment)
+        new_articles, seen_unchanged, seen_changed = self.store.filter_already_seen(articles)
+        changed_cluster_ids: set[str] = set()
+        if seen_unchanged or seen_changed:
+            for a in seen_unchanged:
                 fu = a.get("feed_url", "")
                 for fs in feed_stats:
                     if fs.feed_url == fu:
                         fs.seen += 1
                         break
+            for a in seen_changed:
+                fu = a.get("feed_url", "")
+                for fs in feed_stats:
+                    if fs.feed_url == fu:
+                        fs.seen += 1  # still "seen" (by URL), but content changed
+                        break
             self.logger.info(
-                "seen_articles: total=%d new=%d already_seen=%d",
-                len(articles), len(new_articles), len(already_seen),
+                "seen_articles: total=%d new=%d unchanged=%d changed=%d",
+                len(articles), len(new_articles), len(seen_unchanged), len(seen_changed),
             )
-        articles = new_articles
+        # Merge changed articles with new ones for clustering
+        articles = new_articles + seen_changed
 
         if not articles:
             self.logger.info("poll: all articles already seen (nothing new to cluster)")
@@ -159,6 +168,26 @@ class ClusterPoller:
             self._prune_and_finalize(enabled_urls, feed_map)
             return self.stats
 
+        # 3c. For changed-content articles, clear enriched_at on their existing
+        # clusters so the next enrichment cycle re-processes them with the
+        # updated article data.
+        if seen_changed:
+            from news_mcp.article_identity import article_key as _ak
+            changed_keys = {_ak(a) for a in seen_changed}
+            with self.store._conn() as conn:
+                for ak in changed_keys:
+                    row = conn.execute(
+                        "SELECT cluster_id FROM seen_articles WHERE article_key=?", (ak,)
+                    ).fetchone()
+                    if row:
+                        changed_cluster_ids.add(row[0])
+                        conn.execute(
+                            "UPDATE clusters SET enriched_at=NULL WHERE cluster_id=?",
+                            (row[0],),
+                        )
+            if changed_cluster_ids:
+                self.logger.info("content_changed: clusters=%d will re-enrich", len(changed_cluster_ids))
+
         # 4. Pre-seed existing clusters for cross-cycle merging
         existing_clusters = self._preseed_clusters()
 

+ 37 - 19
news_mcp/storage/sqlite_store.py

@@ -261,13 +261,19 @@ class SQLiteClusterStore:
             conn.execute(
                 """
                 CREATE TABLE IF NOT EXISTS seen_articles (
-                    article_key TEXT PRIMARY KEY,
-                    cluster_id  TEXT NOT NULL,
-                    first_seen  TEXT NOT NULL,
-                    url         TEXT NOT NULL DEFAULT ''
+                    article_key  TEXT PRIMARY KEY,
+                    cluster_id   TEXT NOT NULL,
+                    first_seen   TEXT NOT NULL,
+                    url          TEXT NOT NULL DEFAULT '',
+                    content_hash TEXT NOT NULL DEFAULT ''
                 )
                 """
             )
+            # Migration: add content_hash column if missing (existing DBs)
+            try:
+                conn.execute("ALTER TABLE seen_articles ADD COLUMN content_hash TEXT NOT NULL DEFAULT ''")
+            except sqlite3.OperationalError:
+                pass  # column already exists
 
             try:
                 cur = conn.execute("PRAGMA table_info(entity_metadata)")
@@ -361,9 +367,12 @@ class SQLiteClusterStore:
                     akey = _article_key(art)
                     if akey:
                         art_url = str(art.get("url") or "").strip()
+                        from news_mcp.article_identity import article_content_hash as _chash
+                        ahash = _chash(art)
                         conn.execute(
-                            "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
-                            (akey, cluster_id, now.isoformat(), art_url),
+                            "INSERT INTO seen_articles(article_key, cluster_id, first_seen, url, content_hash) VALUES(?,?,?,?,?) "
+                            "ON CONFLICT(article_key) DO UPDATE SET cluster_id=excluded.cluster_id, url=excluded.url, content_hash=excluded.content_hash",
+                            (akey, cluster_id, now.isoformat(), art_url, ahash),
                         )
 
     def upsert_cluster_summary(
@@ -519,31 +528,40 @@ class SQLiteClusterStore:
     #  Seen-articles: skip already-processed articles at ingestion
     # ------------------------------------------------------------------ #
 
-    def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict]]:
-        """Split articles into (new, already_seen) based on seen_articles table.
+    def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict], list[dict]]:
+        """Split articles into (new, seen_unchanged, seen_changed) based on seen_articles.
 
-        Uses _article_key (derived from URL) as the identity check.
-        Returns two lists: articles never seen before, and articles already
-        processed in a previous cycle.
+        Uses _article_key (URL) as identity and article_content_hash to detect
+        in-place content updates (e.g. a stub that gets fleshed out).
+
+        Returns:
+            new_articles: never seen before → full clustering + enrichment
+            seen_unchanged: same key, same content hash → skip entirely
+            seen_changed: same key, different content hash → re-cluster to update
+                       the existing cluster payload (will trigger re-enrichment)
         """
+        from news_mcp.article_identity import article_content_hash as _content_hash
         keys = [_article_key(a) for a in articles]
         if not keys:
-            return [], []
+            return [], [], []
         with self._conn() as conn:
             placeholders = ",".join("?" for _ in keys)
             cur = conn.execute(
-                f"SELECT article_key FROM seen_articles WHERE article_key IN ({placeholders})",
+                f"SELECT article_key, content_hash FROM seen_articles WHERE article_key IN ({placeholders})",
                 keys,
             )
-            seen_set = {row[0] for row in cur.fetchall()}
+            seen_map = {row[0]: row[1] for row in cur.fetchall()}
         new_articles = []
-        seen_articles = []
+        seen_unchanged = []
+        seen_changed = []
         for art, key in zip(articles, keys):
-            if key in seen_set:
-                seen_articles.append(art)
-            else:
+            if key not in seen_map:
                 new_articles.append(art)
-        return new_articles, seen_articles
+            elif seen_map[key] == _content_hash(art):
+                seen_unchanged.append(art)
+            else:
+                seen_changed.append(art)
+        return new_articles, seen_unchanged, seen_changed
 
     def get_seen_article_count(self) -> int:
         """Total rows in seen_articles (for diagnostics)."""

+ 2 - 2
test_news_mcp.py

@@ -377,7 +377,7 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
             return []
 
         def filter_already_seen(self, articles):
-            return articles, []
+            return articles, [], []
 
         def set_meta(self, key, value):
             self.meta[key] = value
@@ -638,7 +638,7 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
             return []
 
         def filter_already_seen(self, articles):
-            return articles, []
+            return articles, [], []
 
         def set_meta(self, key, value):
             pass