Bladeren bron

fix: handle empty content_hash in seen_articles (pre-migration rows)

Existing seen_articles rows have content_hash='' from the migration.
Comparing '' against a real hash made every article appear 'changed',
causing 19 of 20 articles to be re-clustered on the first poll after deploy.

Fix: treat empty stored hash as 'unchanged' — the next upsert will
populate the real content_hash for future comparisons.

Also fix: enriched_at is stored in the JSON payload, not in a SQL column.
The previous code ran UPDATE clusters SET enriched_at=NULL, which
failed with 'no such column'. The code now removes the enriched_at key
from the cluster's JSON payload instead.
Lukas Goldschmidt 6 dagen geleden
bovenliggende
commit
46e9b02ecc
2 gewijzigde bestanden met toevoegingen van 17 en 6 verwijderingen
  1. 13 6
      news_mcp/jobs/poller.py
  2. 4 0
      news_mcp/storage/sqlite_store.py

+ 13 - 6
news_mcp/jobs/poller.py

@@ -168,22 +168,29 @@ class ClusterPoller:
             self._prune_and_finalize(enabled_urls, feed_map)
             return self.stats
 
-        # 3c. For changed-content articles, clear enriched_at on their existing
-        # clusters so the next enrichment cycle re-processes them with the
-        # updated article data.
+        # 3c. For changed-content articles, clear enriched_at in the cluster
+        # payload JSON so the next enrichment cycle re-processes them with
+        # the updated article data.  (enriched_at is stored inside the JSON
+        # payload, not as a separate SQL column.)
         if seen_changed:
             from news_mcp.article_identity import article_key as _ak
+            import json as _json
             changed_keys = {_ak(a) for a in seen_changed}
             with self.store._conn() as conn:
                 for ak in changed_keys:
                     row = conn.execute(
-                        "SELECT cluster_id FROM seen_articles WHERE article_key=?", (ak,)
+                        "SELECT cluster_id, payload FROM seen_articles sa "
+                        "JOIN clusters c ON c.cluster_id = sa.cluster_id "
+                        "WHERE sa.article_key=?",
+                        (ak,),
                     ).fetchone()
                     if row:
                         changed_cluster_ids.add(row[0])
+                        payload = _json.loads(row[1])
+                        payload.pop("enriched_at", None)
                         conn.execute(
-                            "UPDATE clusters SET enriched_at=NULL WHERE cluster_id=?",
-                            (row[0],),
+                            "UPDATE clusters SET payload=? WHERE cluster_id=?",
+                            (_json.dumps(payload, ensure_ascii=False), row[0]),
                         )
             if changed_cluster_ids:
                 self.logger.info("content_changed: clusters=%d will re-enrich", len(changed_cluster_ids))

+ 4 - 0
news_mcp/storage/sqlite_store.py

@@ -559,6 +559,10 @@ class SQLiteClusterStore:
                 new_articles.append(art)
             elif seen_map[key] == _content_hash(art):
                 seen_unchanged.append(art)
+            elif not seen_map[key]:
+                # Stored hash is empty (pre-migration row).  Treat as unchanged
+                # and let the next upsert populate the real hash.
+                seen_unchanged.append(art)
             else:
                 seen_changed.append(art)
         return new_articles, seen_unchanged, seen_changed