from __future__ import annotations

import json
import sqlite3
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

from news_mcp.entity_normalize import normalize_entities
from news_mcp.trends_resolution import resolve_entity_via_trends


@dataclass
class ClusterRow:
    """A typed view of one row in the `clusters` table."""

    cluster_id: str
    topic: str
    payload: dict
    updated_at: datetime


META_LAST_PRUNE_AT = "last_prune_at"


def _article_key(article: dict[str, Any]) -> str:
    """Dedup key for an article: the last URL path segment, falling back to
    the title (no URL) or the raw URL (unparseable URL)."""
    url = str(article.get("url") or "").strip()
    if not url:
        return str(article.get("title") or "")
    try:
        parsed = urlparse(url)
        parts = [p for p in parsed.path.split("/") if p]
        if parts:
            return parts[-1]
    except Exception:
        pass
    return url


def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Drop duplicate articles while preserving first-seen order."""
    seen: set[str] = set()
    out: list[dict[str, Any]] = []
    for article in articles:
        key = _article_key(article)
        if key in seen:
            continue
        seen.add(key)
        out.append(article)
    return out


def _has_valid_entity_resolutions(resolutions: Any, entities: list[str]) -> bool:
    """True if `resolutions` is a list aligned 1:1 with `entities` and every
    entry carries both `normalized` and `canonical_label`."""
    if not isinstance(resolutions, list):
        return False
    if len(resolutions) != len(entities):
        return False
    for res in resolutions:
        if not isinstance(res, dict):
            return False
        if not res.get("normalized") or not res.get("canonical_label"):
            return False
    return True


def sanitize_cluster_payload(cluster: dict[str, Any], *, include_resolutions: bool = True) -> dict[str, Any]:
    """Normalize cluster payload so every stored payload is internally consistent."""
    out = dict(cluster)
    raw_articles = out.get("articles", []) or []
    articles = [a for a in raw_articles if isinstance(a, dict)]
    out["articles"] = _dedup_articles(articles)
    raw_entities = out.get("entities", []) or []
    entities = normalize_entities(raw_entities)
    out["entities"] = entities
    if not include_resolutions:
        return out
    resolutions = out.get("entityResolutions", None)
    if entities:
        if not _has_valid_entity_resolutions(resolutions, entities):
            out["entityResolutions"] = [resolve_entity_via_trends(e) for e in entities]
    else:
        # Keep the empty case explicit and stable.
        out["entityResolutions"] = []
    return out
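
# Usage sketch (field names per the payload shape above; values illustrative):
#   cluster = {
#       "cluster_id": "c-1",
#       "articles": [
#           {"url": "https://example.com/news/story-1", "title": "Story 1"},
#           {"url": "https://mirror.example.org/a/story-1", "title": "Story 1 (mirror)"},
#       ],
#       "entities": ["OpenAI", "openai"],
#   }
#   clean = sanitize_cluster_payload(cluster)
#   # -> both articles share the path segment "story-1", so the mirror is
#   #    dropped; entities are normalized and each normalized entity gets an
#   #    entityResolutions entry (resolved via resolve_entity_via_trends).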


class SQLiteClusterStore:
    """SQLite-backed store for clusters, per-feed state, and misc metadata."""

    def __init__(self, db_path: str | Path):
        self.db_path = str(db_path)
        self._init_db()

    def _conn(self) -> sqlite3.Connection:
        conn = sqlite3.connect(self.db_path)
        # busy_timeout and synchronous are per-connection settings, so they
        # must be applied to every new connection, not only during _init_db().
        conn.execute("PRAGMA busy_timeout=5000")
        conn.execute("PRAGMA synchronous=NORMAL")
        return conn

    def _init_db(self) -> None:
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
        with self._conn() as conn:
            # WAL is persistent: once set, it sticks to the database file.
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS clusters (
                    cluster_id TEXT PRIMARY KEY,
                    topic TEXT NOT NULL,
                    payload TEXT NOT NULL,
                    updated_at TEXT NOT NULL,
                    summary_payload TEXT,
                    summary_updated_at TEXT
                )
                """
            )
            # If the table already exists without the summary columns,
            # add them (SQLite-friendly incremental migrations).
            for col_def in [
                "summary_payload TEXT",
                "summary_updated_at TEXT",
            ]:
                try:
                    conn.execute(f"ALTER TABLE clusters ADD COLUMN {col_def}")
                except sqlite3.OperationalError:
                    pass  # column already exists
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_clusters_topic ON clusters(topic)"
            )
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_clusters_updated_at ON clusters(updated_at)"
            )
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS feed_state (
                    feed_key TEXT PRIMARY KEY,
                    last_hash TEXT NOT NULL,
                    updated_at TEXT NOT NULL
                )
                """
            )
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS meta (
                    key TEXT PRIMARY KEY,
                    value TEXT NOT NULL
                )
                """
            )
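
    # Resulting schema (summary of the DDL above):
    #   clusters(cluster_id PK, topic, payload JSON-text, updated_at ISO-8601,
    #            summary_payload JSON-text NULL, summary_updated_at ISO-8601 NULL)
    #   feed_state(feed_key PK, last_hash, updated_at)
    #   meta(key PK, value)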

    def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
        """Insert or refresh clusters under `topic`, sanitizing each payload."""
        now = datetime.now(timezone.utc).isoformat()
        with self._conn() as conn:
            for c in clusters:
                c = sanitize_cluster_payload(c)
                cluster_id = c["cluster_id"]
                payload = json.dumps(c, ensure_ascii=False)
                conn.execute(
                    "INSERT INTO clusters(cluster_id, topic, payload, updated_at) VALUES(?,?,?,?) "
                    "ON CONFLICT(cluster_id) DO UPDATE SET topic=excluded.topic, payload=excluded.payload, updated_at=excluded.updated_at",
                    (cluster_id, topic, payload, now),
                )

    def upsert_cluster_summary(
        self,
        cluster_id: str,
        summary_payload: dict,
    ) -> None:
        """Attach a summary to a cluster. If the cluster row does not exist
        yet, a placeholder row (empty topic/payload) is created to hold it."""
        now = datetime.now(timezone.utc).isoformat()
        with self._conn() as conn:
            conn.execute(
                "INSERT INTO clusters(cluster_id, topic, payload, updated_at, summary_payload, summary_updated_at) "
                "VALUES(?,?,?,?,?,?) "
                "ON CONFLICT(cluster_id) DO UPDATE SET "
                "summary_payload=excluded.summary_payload, summary_updated_at=excluded.summary_updated_at",
                (
                    cluster_id,
                    "",  # placeholder topic; not touched on update
                    json.dumps({}, ensure_ascii=False),
                    now,
                    json.dumps(summary_payload, ensure_ascii=False),
                    now,
                ),
            )

    def get_cluster_summary(self, cluster_id: str, ttl_hours: float) -> dict | None:
        """Return the cached summary if it is newer than `ttl_hours`, else None."""
        cutoff = datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))
        cutoff_iso = cutoff.isoformat()
        with self._conn() as conn:
            # Lexicographic comparison is safe here: all timestamps are
            # written by this module in the same ISO-8601 UTC format.
            cur = conn.execute(
                "SELECT summary_payload, summary_updated_at FROM clusters "
                "WHERE cluster_id=? AND summary_updated_at >= ?",
                (cluster_id, cutoff_iso),
            )
            row = cur.fetchone()
        if not row or not row[0]:
            return None
        return json.loads(row[0])

    @staticmethod
    def _parse_payload_ts(ts: Any) -> float | None:
        """Parse a payload timestamp (epoch number, ISO 8601, or RFC 2822
        feed date) into epoch seconds, or None if it cannot be parsed."""
        if not ts:
            return None
        if isinstance(ts, (int, float)):
            return float(ts)
        text = str(ts).strip()
        try:
            dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc).timestamp()
        except Exception:
            pass
        try:
            dt = parsedate_to_datetime(text)
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc).timestamp()
        except Exception:
            return None

    def get_latest_clusters(self, topic: str, ttl_hours: float, limit: int) -> list[dict]:
        """Return newest clusters by *their own* timestamp.

        Filtering/sorting by the DB row's `updated_at` can drift away from
        the actual event time in `payload.timestamp`, so both the TTL filter
        and the sort use the payload's timestamp.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))
        cutoff_ts = cutoff.timestamp()
        # Pull a wider candidate set (newest rows first, so recent clusters
        # cannot be crowded out), then filter by payload.timestamp.
        with self._conn() as conn:
            cur = conn.execute(
                "SELECT payload FROM clusters WHERE topic=? "
                "ORDER BY updated_at DESC LIMIT ?",
                (topic, int(max(200, limit) * 10)),
            )
            candidates = [json.loads(r[0]) for r in cur.fetchall()]
        filtered: list[dict] = []
        for c in candidates:
            ts = self._parse_payload_ts(c.get("timestamp"))
            if ts is not None and ts >= cutoff_ts:
                filtered.append(c)
        filtered.sort(key=lambda c: self._parse_payload_ts(c.get("timestamp")) or 0.0, reverse=True)
        return filtered[: int(limit)]
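
    # Example (sketch): all of these parse to the same instant, 1714564800.0:
    #   SQLiteClusterStore._parse_payload_ts("2024-05-01T12:00:00Z")
    #   SQLiteClusterStore._parse_payload_ts("2024-05-01T12:00:00+00:00")
    #   SQLiteClusterStore._parse_payload_ts("Wed, 1 May 2024 12:00:00 GMT")
    #   SQLiteClusterStore._parse_payload_ts(1714564800)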

    def get_latest_clusters_all_topics(self, ttl_hours: float, limit: int) -> list[dict]:
        """Like get_latest_clusters, but across every topic."""
        cutoff = datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))
        cutoff_ts = cutoff.timestamp()
        with self._conn() as conn:
            cur = conn.execute(
                "SELECT payload FROM clusters ORDER BY updated_at DESC LIMIT ?",
                (int(max(500, limit) * 10),),
            )
            candidates = [json.loads(r[0]) for r in cur.fetchall()]
        filtered: list[dict] = []
        for c in candidates:
            ts = self._parse_payload_ts(c.get("timestamp"))
            if ts is not None and ts >= cutoff_ts:
                filtered.append(c)
        filtered.sort(key=lambda c: self._parse_payload_ts(c.get("timestamp")) or 0.0, reverse=True)
        return filtered[: int(limit)]

    def get_cluster_by_id(self, cluster_id: str) -> dict | None:
        with self._conn() as conn:
            cur = conn.execute(
                "SELECT payload FROM clusters WHERE cluster_id=?",
                (cluster_id,),
            )
            row = cur.fetchone()
        return json.loads(row[0]) if row else None

    def get_feed_hash(self, feed_key: str) -> str | None:
        with self._conn() as conn:
            cur = conn.execute(
                "SELECT last_hash FROM feed_state WHERE feed_key=?",
                (feed_key,),
            )
            row = cur.fetchone()
        return row[0] if row else None

    def set_feed_hash(self, feed_key: str, last_hash: str) -> None:
        now = datetime.now(timezone.utc).isoformat()
        with self._conn() as conn:
            conn.execute(
                "INSERT INTO feed_state(feed_key, last_hash, updated_at) VALUES(?,?,?) "
                "ON CONFLICT(feed_key) DO UPDATE SET last_hash=excluded.last_hash, updated_at=excluded.updated_at",
                (feed_key, last_hash, now),
            )
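
    # Usage sketch (hypothetical caller; `digest` is a hash of the fetched
    # feed body): skip re-processing when nothing changed.
    #   if store.get_feed_hash(key) != digest:
    #       process_feed(...)          # hypothetical helper
    #       store.set_feed_hash(key, digest)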

    def get_feed_state(self, feed_key: str) -> dict | None:
        with self._conn() as conn:
            cur = conn.execute(
                "SELECT last_hash, updated_at FROM feed_state WHERE feed_key=?",
                (feed_key,),
            )
            row = cur.fetchone()
        if not row:
            return None
        return {"last_hash": row[0], "updated_at": row[1]}

    def get_meta(self, key: str) -> str | None:
        with self._conn() as conn:
            cur = conn.execute("SELECT value FROM meta WHERE key=?", (key,))
            row = cur.fetchone()
        return row[0] if row else None

    def set_meta(self, key: str, value: str) -> None:
        with self._conn() as conn:
            conn.execute(
                "INSERT INTO meta(key, value) VALUES(?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
                (key, value),
            )

    def prune_clusters(self, retention_days: float) -> int:
        """Delete clusters whose row `updated_at` is older than the retention
        window and return the number of rows deleted. Records the prune time
        in the meta table even when nothing was deleted."""
        retention_days = float(retention_days)
        if retention_days <= 0:
            return 0
        cutoff = datetime.now(timezone.utc) - timedelta(days=retention_days)
        cutoff_iso = cutoff.isoformat()
        pruned_at = datetime.now(timezone.utc).isoformat()
        with self._conn() as conn:
            cur = conn.execute("DELETE FROM clusters WHERE updated_at < ?", (cutoff_iso,))
            deleted = int(cur.rowcount or 0)
            conn.execute(
                "INSERT INTO meta(key, value) VALUES(?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
                (META_LAST_PRUNE_AT, pruned_at),
            )
        return deleted

    def prune_if_due(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
        """Run prune_clusters at most once per `interval_hours` (min 1h) and
        return a status dict describing what happened."""
        retention_days = float(retention_days)
        interval_hours = float(interval_hours)
        if (not pruning_enabled) or retention_days <= 0:
            return {
                "enabled": bool(pruning_enabled),
                "deleted": 0,
                "due": False,
                "retention_days": retention_days,
                "interval_hours": interval_hours,
                "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
            }
        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
        now = datetime.now(timezone.utc)
        due = True
        if last_prune_at:
            try:
                last_dt = datetime.fromisoformat(last_prune_at)
                due = now - last_dt >= timedelta(hours=max(1.0, interval_hours))
            except Exception:
                # An unparseable marker should not block pruning forever.
                due = True
        if not due:
            return {
                "enabled": True,
                "deleted": 0,
                "due": False,
                "retention_days": retention_days,
                "interval_hours": interval_hours,
                "last_prune_at": last_prune_at,
            }
        deleted = self.prune_clusters(retention_days)
        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
        return {
            "enabled": True,
            "deleted": deleted,
            "due": True,
            "retention_days": retention_days,
            "interval_hours": interval_hours,
            "last_prune_at": last_prune_at,
        }

    def get_prune_state(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
        return {
            "enabled": bool(pruning_enabled),
            "retention_days": float(retention_days),
            "interval_hours": float(interval_hours),
            "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
        }
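

if __name__ == "__main__":
    # Smoke-test sketch, not part of the library surface. Assumes the
    # news_mcp helpers imported above are available; the payload values are
    # illustrative. Writes to a throwaway database and prints what it reads back.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        store = SQLiteClusterStore(Path(tmp) / "clusters.db")
        store.upsert_clusters(
            [
                {
                    "cluster_id": "demo-1",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                    "articles": [{"url": "https://example.com/news/story-1", "title": "Story 1"}],
                    "entities": [],
                }
            ],
            topic="demo",
        )
        print(store.get_latest_clusters("demo", ttl_hours=1.0, limit=10))
        print(store.prune_if_due(pruning_enabled=True, retention_days=7.0))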