Fix entityResolutions drift and harden article dedup

Lukas Goldschmidt · 1 month ago
Parent
Commit
89e141466f
5 changed files with 149 additions and 62 deletions
  1. README.md (+2, -0)
  2. news_mcp/jobs/poller.py (+11, -0)
  3. news_mcp/storage/sqlite_store.py (+70, -0)
  4. scripts/backfill_news_entities.py (+60, -28)
  5. scripts/dedup_articles_in_clusters.py (+6, -34)

+ 2 - 0
README.md

@@ -218,4 +218,6 @@ underlying article id / URL path. To clean existing rows:
 ```
 
 The live clustering path also deduplicates article entries when new data comes in.
+
+As of the latest hardening, the server/storage write path also self-heals `payload.articles` by deduplicating before persisting (so historical rows can be fixed via the cleanup script, and future writes won’t reintroduce duplicates). 
 ```
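
For reference, a standalone sketch of the URL-path heuristic the dedup relies on; the names here are illustrative, the real logic lives in `_article_key` in `news_mcp/storage/sqlite_store.py`:

```
from urllib.parse import urlparse

def article_key(url: str) -> str:
    # Two article URLs count as "the same article" when the last
    # non-empty path segment matches; fall back to the raw URL.
    parts = [p for p in urlparse(url).path.split("/") if p]
    return parts[-1] if parts else url

# Hypothetical syndicated copies of one story resolve to the same key.
print(article_key("https://example.com/news/2024/ai-chip-deal"))          # -> ai-chip-deal
print(article_key("https://mirror.example.org/syndicated/ai-chip-deal"))  # -> ai-chip-deal
```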

+ 11 - 0
news_mcp/jobs/poller.py

@@ -7,6 +7,7 @@ from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEE
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.enrichment.enrich import enrich_cluster
 from news_mcp.enrichment.llm_enrich import classify_cluster_groq
+from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.sources.news_feeds import fetch_news_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
@@ -57,6 +58,16 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
                     c2 = dict(c2)
                     # Keep existing enriched fields.
                     c2["entities"] = existing.get("entities", [])
+
+                    # IMPORTANT: entityResolutions must stay consistent with entities.
+                    # Older rows may have entities but missing/malformed resolutions.
+                    existing_resolutions = existing.get("entityResolutions", None)
+                    if isinstance(existing_resolutions, list) and existing_resolutions:
+                        c2["entityResolutions"] = existing_resolutions
+                    else:
+                        # Recompute resolutions deterministically from the stored entities.
+                        c2["entityResolutions"] = [resolve_entity_via_trends(e) for e in c2["entities"]]
+
                     if existing.get("sentiment"):
                         c2["sentiment"] = existing.get("sentiment")
                     if existing.get("sentimentScore") is not None:

+ 70 - 0
news_mcp/storage/sqlite_store.py

@@ -6,6 +6,10 @@ from dataclasses import dataclass
 from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from typing import Any
+from urllib.parse import urlparse
+
+from news_mcp.entity_normalize import normalize_entities
+from news_mcp.trends_resolution import resolve_entity_via_trends
 
 
 @dataclass
@@ -16,6 +20,71 @@ class ClusterRow:
     updated_at: datetime
 
 
+def _article_key(article: dict[str, Any]) -> str:
+    url = str(article.get("url") or "").strip()
+    if not url:
+        return str(article.get("title") or "")
+    try:
+        parsed = urlparse(url)
+        parts = [p for p in parsed.path.split("/") if p]
+        if parts:
+            return parts[-1]
+    except Exception:
+        pass
+    return url
+
+
+def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    seen: set[str] = set()
+    out: list[dict[str, Any]] = []
+    for article in articles:
+        key = _article_key(article)
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(article)
+    return out
+
+
+def _has_valid_entity_resolutions(resolutions: Any, entities: list[str]) -> bool:
+    if not isinstance(resolutions, list):
+        return False
+    if len(resolutions) != len(entities):
+        return False
+    for res in resolutions:
+        if not isinstance(res, dict):
+            return False
+        if not res.get("normalized") or not res.get("canonical_label"):
+            return False
+    return True
+
+
+def sanitize_cluster_payload(cluster: dict[str, Any], *, include_resolutions: bool = True) -> dict[str, Any]:
+    """Normalize cluster payload so every stored payload is internally consistent."""
+    out = dict(cluster)
+
+    raw_articles = out.get("articles", []) or []
+    articles = [a for a in raw_articles if isinstance(a, dict)]
+    out["articles"] = _dedup_articles(articles)
+
+    raw_entities = out.get("entities", []) or []
+    entities = normalize_entities(raw_entities)
+    out["entities"] = entities
+
+    if not include_resolutions:
+        return out
+
+    resolutions = out.get("entityResolutions", None)
+    if entities:
+        if not _has_valid_entity_resolutions(resolutions, entities):
+            out["entityResolutions"] = [resolve_entity_via_trends(e) for e in entities]
+    else:
+        # Keep the empty case explicit and stable.
+        out["entityResolutions"] = []
+
+    return out
+
+
 class SQLiteClusterStore:
     def __init__(self, db_path: str | Path):
         self.db_path = str(db_path)
@@ -69,6 +138,7 @@ class SQLiteClusterStore:
         now = datetime.now(timezone.utc)
         with self._conn() as conn:
             for c in clusters:
+                c = sanitize_cluster_payload(c)
                 cluster_id = c["cluster_id"]
                 payload = json.dumps(c, ensure_ascii=False)
                 conn.execute(
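
Since `upsert_clusters` now runs the sanitizer on every write, a rough usage sketch of `sanitize_cluster_payload` on its own (assuming the `news_mcp` package is importable and that `normalize_entities` / `resolve_entity_via_trends` behave as the validity checks above imply; all cluster values below are made up):

```
from news_mcp.storage.sqlite_store import sanitize_cluster_payload

cluster = {
    "cluster_id": "example-cluster",  # hypothetical id
    "articles": [
        {"url": "https://example.com/news/story-1", "title": "Story 1"},
        {"url": "https://mirror.example.org/feed/story-1", "title": "Story 1"},  # same path tail -> dropped
    ],
    "entities": ["OpenAI"],
    "entityResolutions": "garbage",   # wrong shape -> recomputed
}

clean = sanitize_cluster_payload(cluster)
# clean["articles"] keeps only the first story-1 entry,
# clean["entities"] comes from normalize_entities(), and
# clean["entityResolutions"] is rebuilt so it lines up 1:1 with entities.
```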

+ 60 - 28
scripts/backfill_news_entities.py

@@ -12,7 +12,6 @@ import json
 import sys
 import logging
 from pathlib import Path
-from typing import Any
 
 
 ROOT = Path(__file__).resolve().parents[1]
@@ -21,32 +20,38 @@ if str(ROOT) not in sys.path:
 
 from news_mcp.config import DB_PATH
 from news_mcp.entity_normalize import normalize_entities
-from news_mcp.storage.sqlite_store import SQLiteClusterStore
-from news_mcp.trends_resolution import resolve_entity_via_trends
+from news_mcp.storage.sqlite_store import SQLiteClusterStore, sanitize_cluster_payload
 
 
-def _compute_entity_resolutions(entities: list[str]) -> list[dict[str, Any]]:
-    return [resolve_entity_via_trends(ent) for ent in entities]
-
-
-def _needs_backfill(cluster: dict) -> bool:
+def _backfill_reason(cluster: dict) -> str | None:
+    """Return a reason string if the cluster needs backfill, else None."""
     raw_entities = cluster.get("entities", []) or []
     normalized_entities = normalize_entities(raw_entities)
-    resolutions = cluster.get("entityResolutions", []) or []
+    resolutions = cluster.get("entityResolutions", None)
 
-    # Clearly missing or stale metadata.
-    if not resolutions:
-        return bool(normalized_entities or raw_entities)
     if normalized_entities != raw_entities:
-        return True
-    if len(resolutions) != len(normalized_entities):
-        return True
-    for res in resolutions:
+        return "entities_not_normalized"
+
+    if normalized_entities and not resolutions:
+        return "missing_entityResolutions"
+
+    if resolutions is not None and not isinstance(resolutions, list):
+        return "invalid_resolution_shape"
+
+    if isinstance(resolutions, list) and len(resolutions) != len(normalized_entities):
+        return "resolution_length_mismatch"
+
+    for res in resolutions or []:
         if not isinstance(res, dict):
-            return True
+            return "invalid_resolution_shape"
         if not res.get("normalized") or not res.get("canonical_label"):
-            return True
-    return False
+            return "resolution_missing_fields"
+
+    sanitized = sanitize_cluster_payload(cluster)
+    if sanitized != cluster:
+        return "payload_sanitized"
+
+    return None
 
 
 async def backfill(
@@ -55,6 +60,7 @@ async def backfill(
     dry_run: bool = False,
     scan_only: bool = False,
     progress_every: int = 20,
+    write_batch_size: int = 100,
 ) -> dict[str, int]:
     store = SQLiteClusterStore(db_path)
 
@@ -65,6 +71,10 @@ async def backfill(
     total = 0
     updated = 0
     skipped = 0
+    reason_counts: dict[str, int] = {}
+    first_samples: dict[str, list[str]] = {}
+    sample_limit_per_reason = 5
+    pending_writes: list[tuple[dict, str]] = []
     logger = logging.getLogger("news_mcp.backfill")
 
     print(f"starting backfill: total_rows={len(rows)} limit={limit or 'all'} dry_run={dry_run} scan_only={scan_only}", flush=True)
@@ -77,20 +87,28 @@ async def backfill(
             skipped += 1
             continue
 
-        if not _needs_backfill(cluster):
+        reason = _backfill_reason(cluster)
+        if reason is None:
             continue
+        reason_counts[reason] = reason_counts.get(reason, 0) + 1
 
-        raw_entities = cluster.get("entities", []) or []
-        normalized_entities = normalize_entities(raw_entities)
-        entity_resolutions = _compute_entity_resolutions(normalized_entities) if not scan_only else []
+        # Record a few example cluster_ids per reason (helpful for root cause).
+        if sample_limit_per_reason and reason in reason_counts and reason not in first_samples:
+            first_samples[reason] = []
+        if reason in first_samples and len(first_samples[reason]) < sample_limit_per_reason:
+            first_samples[reason].append(cluster.get("cluster_id") or cluster_id)
 
-        cluster = dict(cluster)
-        cluster["entities"] = normalized_entities
-        if not scan_only:
-            cluster["entityResolutions"] = entity_resolutions
+        sanitized_cluster = sanitize_cluster_payload(cluster, include_resolutions=not scan_only)
 
         if not dry_run:
-            store.upsert_clusters([cluster], topic=topic or cluster.get("topic", "other"))
+            pending_writes.append((sanitized_cluster, topic or sanitized_cluster.get("topic", "other")))
+            if write_batch_size > 0 and len(pending_writes) >= write_batch_size:
+                topic_groups: dict[str, list[dict]] = {}
+                for item_cluster, item_topic in pending_writes:
+                    topic_groups.setdefault(item_topic, []).append(item_cluster)
+                for batch_topic, batch_clusters in topic_groups.items():
+                    store.upsert_clusters(batch_clusters, topic=batch_topic)
+                pending_writes.clear()
         updated += 1
 
         if limit is not None and updated >= limit:
@@ -103,9 +121,21 @@ async def backfill(
         if progress_every and total % progress_every == 0:
             print(flush=True)
 
+    if not dry_run and pending_writes:
+        topic_groups: dict[str, list[dict]] = {}
+        for item_cluster, item_topic in pending_writes:
+            topic_groups.setdefault(item_topic, []).append(item_cluster)
+        for batch_topic, batch_clusters in topic_groups.items():
+            store.upsert_clusters(batch_clusters, topic=batch_topic)
+
     if total % max(1, progress_every) != 0:
         print(flush=True)
 
+    # Print debug summary by reason.
+    if reason_counts:
+        print("reason_counts=" + json.dumps(reason_counts, ensure_ascii=False), flush=True)
+        for reason, samples in first_samples.items():
+            print(f"sample_{reason}=" + ",".join(samples), flush=True)
     return {"total": total, "updated": updated, "skipped": skipped}
 
 
@@ -116,6 +146,7 @@ def main() -> None:
     parser.add_argument("--dry-run", action="store_true", help="Compute changes without writing them")
     parser.add_argument("--scan-only", action="store_true", help="Only detect stale rows; skip trends lookups")
     parser.add_argument("--progress-every", type=int, default=20, help="Emit a progress line every N processed rows")
+    parser.add_argument("--write-batch-size", type=int, default=100, help="Flush writes every N updated clusters")
     args = parser.parse_args()
 
     logging.basicConfig(level=logging.INFO, format="%(message)s")
@@ -126,6 +157,7 @@ def main() -> None:
             dry_run=args.dry_run,
             scan_only=args.scan_only,
             progress_every=max(0, int(args.progress_every)),
+            write_batch_size=max(0, int(args.write_batch_size)),
         )
     )
     mode = "DRY RUN" if args.dry_run else "DONE"

+ 6 - 34
scripts/dedup_articles_in_clusters.py

@@ -14,40 +14,12 @@ import argparse
 import json
 import sys
 from pathlib import Path
-from typing import Any
-from urllib.parse import urlparse
 
 ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(ROOT))
 
 from news_mcp.config import DB_PATH
-from news_mcp.storage.sqlite_store import SQLiteClusterStore
-
-
-def _article_key(article: dict[str, Any]) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
-
-
-def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    seen = set()
-    out = []
-    for article in articles:
-        key = _article_key(article)
-        if key in seen:
-            continue
-        seen.add(key)
-        out.append(article)
-    return out
+from news_mcp.storage.sqlite_store import SQLiteClusterStore, sanitize_cluster_payload
 
 
 def main() -> None:
@@ -76,13 +48,13 @@ def main() -> None:
         except Exception:
             continue
 
-        articles = cluster.get("articles", []) or []
-        deduped = _dedup_articles([a for a in articles if isinstance(a, dict)])
-        if len(deduped) == len(articles):
+        sanitized = sanitize_cluster_payload(cluster)
+        original_articles = cluster.get("articles", []) or []
+        deduped = sanitized.get("articles", []) or []
+        if deduped == original_articles:
             continue
 
-        cluster = dict(cluster)
-        cluster["articles"] = deduped
+        cluster = sanitized
         if not args.dry_run:
             store.upsert_clusters([cluster], topic=topic or cluster.get("topic", "other"))
         updated += 1