Sfoglia il codice sorgente

feat: per-feed re-enrichment toggle for changed articles

Add re_enrich column to feed_state table (default 0 = disabled).
When a seen article's content hash changes, the poller now checks
the feed's re_enrich flag before stripping enriched_at. Feeds with
re-enrich disabled skip changed articles silently (no re-enrichment).

Changes:
- sqlite_store.py: re_enrich column migration, set_feed_re_enrich(),
  is_re_enrich_enabled(), updated SELECTs
- poller.py: per-feed re_enrich check in seen_changed block
- mcp_server_fastmcp.py: POST /api/v1/feeds/set-re-enrich endpoint,
  updated startup seed
- dashboard.js: re-enrich checkbox per feed row, toggleReEnrich()
Lukas Goldschmidt 5 giorni fa
parent
commit
a7db1d1d20

+ 28 - 0
dashboard/dashboard.js

@@ -154,11 +154,16 @@ function renderFeedsList() {
     var lastItems = f.last_item_count != null ? f.last_item_count + ' items' : '—';
     var lastSeen = f.updated_at ? ' · ' + new Date(f.updated_at).toLocaleString() : '';
     var isEnabled = f.enabled !== false;
+    var isReEnrich = f.re_enrich === true;
     html += '<div class="feed-toggle-row">' +
       '<input type="checkbox" id="feed-' + esc(String(i)) + '"' + (isEnabled ? ' checked' : '') +
       ' onchange="toggleFeed(\'' + esc(f.feed_key) + '\', this.checked)" />' +
       '<div class="feed-url">' + esc(domain) + '</div>' +
       '<span class="feed-toggle-hint">' + lastItems + lastSeen + '</span>' +
+      '<label class="feed-re-enrich-label" style="font-size:.7rem;color:var(--text-dim);display:flex;align-items:center;gap:.2rem;margin-left:auto">' +
+      '<input type="checkbox" id="re-enrich-' + esc(String(i)) + '"' + (isReEnrich ? ' checked' : '') +
+      ' onchange="toggleReEnrich(\'' + esc(f.feed_key) + '\', this.checked)" />' +
+      're-enrich</label>' +
       '</div>';
   }
   html += '</div>';
@@ -192,6 +197,29 @@ async function toggleFeed(feedUrl, enabled) {
   }
 }
 
+/**
+ * Persist the per-feed "re-enrich" flag on the server and refresh the feed list.
+ *
+ * Sends a form-encoded POST to API + '/feeds/set-re-enrich'. On success the
+ * matching entry in _feedsData is updated, the list is re-rendered and a toast
+ * is shown. On any failure the error is logged, an error toast is shown and
+ * the checkbox is put back to its previous state.
+ *
+ * @param {string} feedUrl  Value compared against each entry's `feed_key` and
+ *                          sent to the server as the `feed_url` form field.
+ * @param {boolean} reEnrich Desired new state of the flag.
+ */
+async function toggleReEnrich(feedUrl, reEnrich) {
+  try {
+    var res = await fetch(API + '/feeds/set-re-enrich', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/x-www-form-urlencoded' },
+      body: 'feed_url=' + encodeURIComponent(feedUrl) + '&re_enrich=' + (reEnrich ? 'true' : 'false')
+    });
+    // The HTTP status is not inspected; a JSON body without a truthy `ok`
+    // is what counts as failure. A non-JSON body makes res.json() reject,
+    // which also lands in the catch block below.
+    var d = await res.json();
+    if (!d.ok) throw new Error(d.error || 'Toggle failed');
+    // Update local state
+    for (var i = 0; i < _feedsData.length; i++) {
+      if (_feedsData[i].feed_key === feedUrl) { _feedsData[i].re_enrich = reEnrich; break; }
+    }
+    renderFeedsList();
+    // Strip the leading http:// or https:// so the toast shows a shorter label.
+    showToast('Re-enrich ' + (reEnrich ? 'enabled' : 'disabled') + ': ' + feedUrl.replace(/^https?:\/\//, ''));
+  } catch(e) {
+    console.error('Re-enrich toggle error:', e);
+    // NOTE(review): second argument presumably marks the toast as an error — confirm against showToast.
+    showToast('Error toggling re-enrich: ' + e.message, true);
+    // Revert the checkbox. The element id is built from the feed's position in
+    // _feedsData — presumably the same index renderFeedsList uses for
+    // 're-enrich-<i>'; verify. If the feed is not found, findIndex yields -1,
+    // no element matches, and the guard below skips the revert.
+    var cb = document.getElementById('re-enrich-' + _feedsData.findIndex(function(f){return f.feed_key === feedUrl}));
+    if (cb) cb.checked = !reEnrich;
+  }
+}
+
 // ── Clusters (ALWAYS date descending) ──────────────────────
 async function reloadClusters() {
   var topic = $('cluster-topic').value;

+ 60 - 23
news_mcp/jobs/poller.py

@@ -168,32 +168,69 @@ class ClusterPoller:
             self._prune_and_finalize(enabled_urls, feed_map)
             return self.stats
 
-        # 3c. For changed-content articles, clear enriched_at in the cluster
-        # payload JSON so the next enrichment cycle re-processes them with
-        # the updated article data.  (enriched_at is stored inside the JSON
-        # payload, not as a separate SQL column.)
+        # 3c. For changed-content articles whose feed has re-enrichment enabled,
+        # clear enriched_at in the cluster payload JSON so the next enrichment
+        # cycle re-processes them with the updated article data.
+        # Feeds with re-enrich disabled are left alone — their clusters keep
+        # existing enrichment and the changed articles are silently skipped.
         if seen_changed:
             from news_mcp.article_identity import article_key as _ak
             import json as _json
-            changed_keys = {_ak(a) for a in seen_changed}
-            with self.store._conn() as conn:
-                for ak in changed_keys:
-                    row = conn.execute(
-                        "SELECT sa.cluster_id, c.payload FROM seen_articles sa "
-                        "JOIN clusters c ON c.cluster_id = sa.cluster_id "
-                        "WHERE sa.article_key=?",
-                        (ak,),
-                    ).fetchone()
-                    if row:
-                        changed_cluster_ids.add(row[0])
-                        payload = _json.loads(row[1])
-                        payload.pop("enriched_at", None)
-                        conn.execute(
-                            "UPDATE clusters SET payload=? WHERE cluster_id=?",
-                            (_json.dumps(payload, ensure_ascii=False), row[0]),
-                        )
-            if changed_cluster_ids:
-                self.logger.info("content_changed: clusters=%d will re-enrich", len(changed_cluster_ids))
+
+            # Group changed articles by feed_url
+            from collections import defaultdict as _dd
+            changed_by_feed: dict[str, list] = _dd(list)
+            for a in seen_changed:
+                fu = a.get("feed_url", "")
+                changed_by_feed[fu].append(a)
+
+            # Only process feeds that have re-enrichment enabled
+            re_enrich_feeds = set()
+            for fu in changed_by_feed:
+                if self.store.is_re_enrich_enabled(fu):
+                    re_enrich_feeds.add(fu)
+
+            # Move disabled-feed articles from seen_changed → seen_unchanged
+            truly_changed = []
+            for a in seen_changed:
+                fu = a.get("feed_url", "")
+                if fu in re_enrich_feeds:
+                    truly_changed.append(a)
+                else:
+                    # Re-enrich disabled → treat as unchanged (skip silently)
+                    seen_unchanged.append(a)
+                    fu_stats = [fs for fs in feed_stats if fs.feed_url == fu]
+                    if fu_stats:
+                        fu_stats[0].seen += 1
+
+            seen_changed = truly_changed
+
+            if seen_changed:
+                changed_keys = {_ak(a) for a in seen_changed}
+                with self.store._conn() as conn:
+                    for ak in changed_keys:
+                        row = conn.execute(
+                            "SELECT sa.cluster_id, c.payload FROM seen_articles sa "
+                            "JOIN clusters c ON c.cluster_id = sa.cluster_id "
+                            "WHERE sa.article_key=?",
+                            (ak,),
+                        ).fetchone()
+                        if row:
+                            changed_cluster_ids.add(row[0])
+                            payload = _json.loads(row[1])
+                            payload.pop("enriched_at", None)
+                            conn.execute(
+                                "UPDATE clusters SET payload=? WHERE cluster_id=?",
+                                (_json.dumps(payload, ensure_ascii=False), row[0]),
+                            )
+                if changed_cluster_ids:
+                    self.logger.info(
+                        "content_changed: clusters=%d will re-enrich (feeds=%s)",
+                        len(changed_cluster_ids),
+                        ",".join(sorted(re_enrich_feeds)),
+                    )
+            else:
+                self.logger.info("content_changed: all %d changed articles on re-enrich disabled feeds → skipped", len(changed_by_feed) and sum(len(v) for v in changed_by_feed.values()))
 
         # 4. Pre-seed existing clusters for cross-cycle merging
         existing_clusters = self._preseed_clusters()

+ 17 - 1
news_mcp/mcp_server_fastmcp.py

@@ -1248,7 +1248,7 @@ async def _background_refresh_loop():
     with _shared_store._conn() as conn:
         for url in feed_urls:
             conn.execute(
-                "INSERT OR IGNORE INTO feed_state(feed_key, last_hash, last_item_count, enabled, updated_at) VALUES(?, '', 0, 1, '')",
+                "INSERT OR IGNORE INTO feed_state(feed_key, last_hash, last_item_count, enabled, re_enrich, updated_at) VALUES(?, '', 0, 1, 0, '')",
                 (url,),
             )
     logger.info("startup seeded %d feeds into feed_state", len(feed_urls))
@@ -1473,6 +1473,22 @@ async def api_feed_toggle(feed_url: str = Form(), enabled: bool = Form()):
         return _api_err(e, f"toggle({feed_url})")
 
 
+@app.post("/api/v1/feeds/set-re-enrich")
+async def api_feed_set_re_enrich(feed_url: str = Form(), re_enrich: bool = Form()):
+    """Toggle per-feed re-enrichment on content change.
+
+    Form fields:
+        feed_url: Key of the feed to update; surrounding whitespace is
+            stripped before it is passed to the store.
+        re_enrich: New value of the flag.
+
+    Returns:
+        On success, a dict with ``ok`` set to True, the stripped ``feed_url``
+        and the ``re_enrich`` value that was applied.
+        If the store reports that no feed was updated, a 404 JSONResponse
+        whose body contains only an ``error`` message (no ``ok`` key; the
+        message echoes ``feed_url`` as received, i.e. unstripped).
+        Any exception is handed to ``_api_err`` and its result is returned.
+    """
+    try:
+        # A store object is created for every request.
+        store = SQLiteClusterStore(DB_PATH)
+        ok = store.set_feed_re_enrich(feed_url.strip(), re_enrich)
+        if not ok:
+            return JSONResponse(
+                status_code=404,
+                content={"error": f"Feed not found: {feed_url}"},
+            )
+        return {"ok": True, "feed_url": feed_url.strip(), "re_enrich": re_enrich}
+    except Exception as e:
+        # NOTE(review): _api_err presumably logs and builds the error response — confirm.
+        return _api_err(e, f"set-re-enrich({feed_url})")
+
+
 # ------------------------------------------------------------------ #
 #  Site config (dashboard-tuneable parameters)
 # ------------------------------------------------------------------ #

+ 30 - 5
news_mcp/storage/sqlite_store.py

@@ -314,6 +314,10 @@ class SQLiteClusterStore:
                 conn.execute("ALTER TABLE feed_state ADD COLUMN enabled INTEGER DEFAULT 1")
             except sqlite3.OperationalError:
                 pass
+            try:
+                conn.execute("ALTER TABLE feed_state ADD COLUMN re_enrich INTEGER DEFAULT 0")
+            except sqlite3.OperationalError:
+                pass
 
             conn.execute(
                 """
@@ -470,7 +474,7 @@ class SQLiteClusterStore:
         """All feed_state rows."""
         with self._conn() as conn:
             cur = conn.execute(
-                "SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state ORDER BY updated_at DESC"
+                "SELECT feed_key, last_hash, last_item_count, enabled, re_enrich, updated_at FROM feed_state ORDER BY updated_at DESC"
             )
             return [
                 {
@@ -478,7 +482,8 @@ class SQLiteClusterStore:
                     "last_hash": row[1],
                     "last_item_count": row[2],
                     "enabled": bool(row[3]),
-                    "updated_at": row[4],
+                    "re_enrich": bool(row[4]),
+                    "updated_at": row[5],
                 }
                 for row in cur.fetchall()
             ]
@@ -490,7 +495,7 @@ class SQLiteClusterStore:
         with self._conn() as conn:
             for url in feed_urls:
                 conn.execute(
-                    "INSERT OR IGNORE INTO feed_state(feed_key, last_hash, last_item_count, enabled, updated_at) VALUES(?, '', 0, 1, '')",
+                    "INSERT OR IGNORE INTO feed_state(feed_key, last_hash, last_item_count, enabled, re_enrich, updated_at) VALUES(?, '', 0, 1, 0, '')",
                     (url,),
                 )
 
@@ -507,6 +512,26 @@ class SQLiteClusterStore:
             )
             return cur.rowcount > 0
 
+    def set_feed_re_enrich(self, feed_url: str, re_enrich: bool) -> bool:
+        """Toggle per-feed re-enrichment. Returns True if the feed existed and was updated.
+
+        Args:
+            feed_url: Value matched against ``feed_state.feed_key``.
+            re_enrich: New flag value; written to the ``re_enrich`` column
+                as 1 (truthy) or 0 (falsy).
+
+        Returns:
+            True when the UPDATE affected at least one row (cursor rowcount
+            greater than zero), False when no row has that ``feed_key``.
+            No row is inserted for an unknown feed.
+        """
+        with self._conn() as conn:
+            cur = conn.execute(
+                "UPDATE feed_state SET re_enrich = ? WHERE feed_key = ?",
+                (1 if re_enrich else 0, feed_url),
+            )
+            return cur.rowcount > 0
+
+    def is_re_enrich_enabled(self, feed_url: str) -> bool:
+        """Return True if re-enrichment is enabled for the given feed_url.
+
+        Args:
+            feed_url: Value matched against ``feed_state.feed_key``.
+
+        Returns:
+            The truthiness of the stored ``re_enrich`` value for that feed.
+            False when there is no ``feed_state`` row for ``feed_url``; a
+            NULL or 0 column value also yields False.
+        """
+        with self._conn() as conn:
+            cur = conn.execute(
+                "SELECT re_enrich FROM feed_state WHERE feed_key = ?", (feed_url,)
+            )
+            row = cur.fetchone()
+            if row is None:
+                return False  # unknown feed → disabled
+            return bool(row[0])
+
     def get_enabled_feed_urls(self, feed_urls: list[str]) -> list[str]:
         """From a list of configured feed URLs, return only those that are enabled in feed_state.
 
@@ -788,11 +813,11 @@ class SQLiteClusterStore:
             last_refresh = self.get_meta("last_refresh_at")
             feeds = {}
             for row in conn.execute(
-                "SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state ORDER BY updated_at DESC"
+                "SELECT feed_key, last_hash, last_item_count, enabled, re_enrich, updated_at FROM feed_state ORDER BY updated_at DESC"
             ):
                 feeds[row[0]] = {
                     "last_hash": row[1], "last_item_count": row[2],
-                    "enabled": bool(row[3]), "updated_at": row[4],
+                    "enabled": bool(row[3]), "re_enrich": bool(row[4]), "updated_at": row[5],
                 }
             # Freshness: did a refresh happen recently? (within 2x the configured interval)
             fresh = False