|
|
@@ -261,13 +261,19 @@ class SQLiteClusterStore:
|
|
|
conn.execute(
|
|
|
"""
|
|
|
CREATE TABLE IF NOT EXISTS seen_articles (
|
|
|
- article_key TEXT PRIMARY KEY,
|
|
|
- cluster_id TEXT NOT NULL,
|
|
|
- first_seen TEXT NOT NULL,
|
|
|
- url TEXT NOT NULL DEFAULT ''
|
|
|
+ article_key TEXT PRIMARY KEY,
|
|
|
+ cluster_id TEXT NOT NULL,
|
|
|
+ first_seen TEXT NOT NULL,
|
|
|
+ url TEXT NOT NULL DEFAULT '',
|
|
|
+ content_hash TEXT NOT NULL DEFAULT ''
|
|
|
)
|
|
|
"""
|
|
|
)
|
|
|
+ # Migration: add content_hash column if missing (existing DBs)
|
|
|
+ try:
|
|
|
+ conn.execute("ALTER TABLE seen_articles ADD COLUMN content_hash TEXT NOT NULL DEFAULT ''")
|
|
|
+ except sqlite3.OperationalError:
|
|
|
+ pass # column already exists
|
|
|
|
|
|
try:
|
|
|
cur = conn.execute("PRAGMA table_info(entity_metadata)")
|
|
|
@@ -361,9 +367,12 @@ class SQLiteClusterStore:
|
|
|
akey = _article_key(art)
|
|
|
if akey:
|
|
|
art_url = str(art.get("url") or "").strip()
|
|
|
+ from news_mcp.article_identity import article_content_hash as _chash
|
|
|
+ ahash = _chash(art)
|
|
|
conn.execute(
|
|
|
- "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
|
|
|
- (akey, cluster_id, now.isoformat(), art_url),
|
|
|
+ "INSERT INTO seen_articles(article_key, cluster_id, first_seen, url, content_hash) VALUES(?,?,?,?,?) "
|
|
|
+ "ON CONFLICT(article_key) DO UPDATE SET cluster_id=excluded.cluster_id, url=excluded.url, content_hash=excluded.content_hash",
|
|
|
+ (akey, cluster_id, now.isoformat(), art_url, ahash),
|
|
|
)
|
|
|
|
|
|
def upsert_cluster_summary(
|
|
|
@@ -519,31 +528,40 @@ class SQLiteClusterStore:
|
|
|
# Seen-articles: skip already-processed articles at ingestion
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
- def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict]]:
|
|
|
- """Split articles into (new, already_seen) based on seen_articles table.
|
|
|
+ def filter_already_seen(self, articles: list[dict]) -> tuple[list[dict], list[dict], list[dict]]:
|
|
|
+ """Split articles into (new, seen_unchanged, seen_changed) based on seen_articles.
|
|
|
|
|
|
- Uses _article_key (derived from URL) as the identity check.
|
|
|
- Returns two lists: articles never seen before, and articles already
|
|
|
- processed in a previous cycle.
|
|
|
+ Uses _article_key (URL) as identity and article_content_hash to detect
|
|
|
+ in-place content updates (e.g. a stub that gets fleshed out).
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ new_articles: never seen before → full clustering + enrichment
|
|
|
+ seen_unchanged: same key, same content hash → skip entirely
|
|
|
+ seen_changed: same key, different content hash → re-cluster to update
|
|
|
+ the existing cluster payload (will trigger re-enrichment)
|
|
|
"""
|
|
|
+ from news_mcp.article_identity import article_content_hash as _content_hash
|
|
|
keys = [_article_key(a) for a in articles]
|
|
|
if not keys:
|
|
|
- return [], []
|
|
|
+ return [], [], []
|
|
|
with self._conn() as conn:
|
|
|
placeholders = ",".join("?" for _ in keys)
|
|
|
cur = conn.execute(
|
|
|
- f"SELECT article_key FROM seen_articles WHERE article_key IN ({placeholders})",
|
|
|
+ f"SELECT article_key, content_hash FROM seen_articles WHERE article_key IN ({placeholders})",
|
|
|
keys,
|
|
|
)
|
|
|
- seen_set = {row[0] for row in cur.fetchall()}
|
|
|
+ seen_map = {row[0]: row[1] for row in cur.fetchall()}
|
|
|
new_articles = []
|
|
|
- seen_articles = []
|
|
|
+ seen_unchanged = []
|
|
|
+ seen_changed = []
|
|
|
for art, key in zip(articles, keys):
|
|
|
- if key in seen_set:
|
|
|
- seen_articles.append(art)
|
|
|
- else:
|
|
|
+ if key not in seen_map:
|
|
|
new_articles.append(art)
|
|
|
- return new_articles, seen_articles
|
|
|
+ elif seen_map[key] == _content_hash(art):
|
|
|
+ seen_unchanged.append(art)
|
|
|
+ else:
|
|
|
+ seen_changed.append(art)
|
|
|
+ return new_articles, seen_unchanged, seen_changed
|
|
|
|
|
|
def get_seen_article_count(self) -> int:
|
|
|
"""Total rows in seen_articles (for diagnostics)."""
|