|
|
@@ -266,6 +266,20 @@ class SQLiteClusterStore:
|
|
|
"CREATE INDEX IF NOT EXISTS idx_cluster_keywords_keyword ON cluster_keywords(keyword)"
|
|
|
)
|
|
|
|
|
|
+ # Seen-articles table: tracks every article_key that has been
|
|
|
+ # clustered, so the poller can skip already-processed articles
|
|
|
+ # entirely (no re-clustering, no re-enrichment).
|
|
|
+ conn.execute(
|
|
|
+ """
|
|
|
+ CREATE TABLE IF NOT EXISTS seen_articles (
|
|
|
+ article_key TEXT PRIMARY KEY,
|
|
|
+ cluster_id TEXT NOT NULL,
|
|
|
+ first_seen TEXT NOT NULL,
|
|
|
+ url TEXT NOT NULL DEFAULT ''
|
|
|
+ )
|
|
|
+ """
|
|
|
+ )
|
|
|
+
|
|
|
try:
|
|
|
cur = conn.execute("PRAGMA table_info(entity_metadata)")
|
|
|
cols = [row[1] for row in cur.fetchall()]
|
|
|
@@ -345,6 +359,16 @@ class SQLiteClusterStore:
|
|
|
"INSERT OR IGNORE INTO cluster_keywords(cluster_id, keyword) VALUES(?, ?)",
|
|
|
(cluster_id, kw_norm),
|
|
|
)
|
|
|
+ # Record every article in seen_articles so the poller can
|
|
|
+ # skip already-processed articles on future cycles.
|
|
|
+ for art in c.get("articles", []):
|
|
|
+ akey = _article_key(art)
|
|
|
+ if akey:
|
|
|
+ art_url = str(art.get("url") or "").strip()
|
|
|
+ conn.execute(
|
|
|
+ "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
|
|
|
+ (akey, cluster_id, now.isoformat(), art_url),
|
|
|
+ )
|
|
|
|
|
|
def upsert_cluster_summary(
|
|
|
self,
|
|
|
@@ -495,6 +519,41 @@ class SQLiteClusterStore:
|
|
|
cur = conn.execute("SELECT feed_key FROM feed_state WHERE enabled = 1")
|
|
|
return [row[0] for row in cur.fetchall()]
|
|
|
|
|
|
+ # ------------------------------------------------------------------ #
|
|
|
+ # Seen-articles: skip already-processed articles at ingestion
|
|
|
+ # ------------------------------------------------------------------ #
|
|
|
+
|
|
|
def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict]]:
    """Split *articles* into ``(new, already_seen)`` using the seen_articles table.

    Identity is the value returned by ``_article_key`` for each article.
    An article is "already seen" when its key is present in the
    ``seen_articles.article_key`` column; every other article, including
    one whose key is empty or ``None``, is reported as new.

    Args:
        articles: Article dicts to classify. Input order is preserved
            within each returned list.

    Returns:
        A ``(new_articles, seen_articles)`` tuple of lists.
    """
    if not articles:
        return [], []

    keys = [_article_key(art) for art in articles]

    # Only distinct, non-empty keys are worth sending to SQLite: duplicates
    # add nothing to an IN (...) test and a falsy key is never looked up.
    # dict.fromkeys dedupes while keeping a deterministic order.
    lookup_keys = list(dict.fromkeys(key for key in keys if key))

    # SQLite caps the number of bound parameters per statement (999 by
    # default before 3.32.0), so one IN (...) over a large batch can fail
    # with "too many SQL variables". Query in fixed-size batches instead.
    batch_size = 500
    seen_keys: set = set()
    if lookup_keys:
        with self._conn() as conn:
            for start in range(0, len(lookup_keys), batch_size):
                batch = lookup_keys[start:start + batch_size]
                placeholders = ",".join("?" * len(batch))
                rows = conn.execute(
                    f"SELECT article_key FROM seen_articles WHERE article_key IN ({placeholders})",
                    batch,
                )
                seen_keys.update(row[0] for row in rows)

    new_articles: list[dict] = []
    already_seen: list[dict] = []
    for art, key in zip(articles, keys):
        if key in seen_keys:
            already_seen.append(art)
        else:
            new_articles.append(art)
    return new_articles, already_seen
|
|
|
+
|
|
|
def get_seen_article_count(self) -> int:
    """Return how many rows the ``seen_articles`` table currently holds.

    Diagnostic helper: runs a single ``count(*)`` query on a connection
    obtained from ``self._conn()`` and returns the resulting number.
    """
    query = "SELECT count(*) FROM seen_articles"
    with self._conn() as conn:
        # count(*) always yields exactly one single-column row.
        (total,) = conn.execute(query).fetchone()
        return total
|
|
|
+
|
|
|
def get_meta(self, key: str) -> str | None:
|
|
|
with self._conn() as conn:
|
|
|
cur = conn.execute("SELECT value FROM meta WHERE key=?", (key,))
|
|
|
@@ -618,8 +677,21 @@ class SQLiteClusterStore:
|
|
|
# Use payload_ts (event time from payload.timestamp) not updated_at
|
|
|
# (row write time). updated_at is refreshed on every upsert, which
|
|
|
# would keep re-ingested old articles alive forever.
|
|
|
+ # Collect cluster_ids being pruned so we can clean seen_articles.
|
|
|
+ pruned_ids = [
|
|
|
+ row[0] for row in conn.execute(
|
|
|
+ "SELECT cluster_id FROM clusters WHERE payload_ts < ?", (cutoff_iso,)
|
|
|
+ ).fetchall()
|
|
|
+ ]
|
|
|
cur = conn.execute("DELETE FROM clusters WHERE payload_ts < ?", (cutoff_iso,))
|
|
|
deleted = int(cur.rowcount or 0)
|
|
|
+ # Clean up seen_articles rows pointing to pruned clusters
|
|
|
+ if pruned_ids:
|
|
|
+ placeholders = ",".join("?" for _ in pruned_ids)
|
|
|
+ conn.execute(
|
|
|
+ f"DELETE FROM seen_articles WHERE cluster_id IN ({placeholders})",
|
|
|
+ pruned_ids,
|
|
|
+ )
|
|
|
conn.execute(
|
|
|
"INSERT INTO meta(key, value) VALUES(?, ?) "
|
|
|
"ON CONFLICT(key) DO UPDATE SET value=excluded.value",
|
|
|
@@ -732,6 +804,7 @@ class SQLiteClusterStore:
|
|
|
"data_fresh": fresh,
|
|
|
"feeds": feeds,
|
|
|
"feed_count": len(feeds),
|
|
|
+ "seen_article_count": self.get_seen_article_count(),
|
|
|
"prune_state": prune_state,
|
|
|
}
|
|
|
|