|
@@ -7,6 +7,7 @@ from datetime import datetime, timezone, timedelta
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
from typing import Any
|
|
from typing import Any
|
|
|
from urllib.parse import urlparse
|
|
from urllib.parse import urlparse
|
|
|
|
|
+from email.utils import parsedate_to_datetime
|
|
|
|
|
|
|
|
from news_mcp.entity_normalize import normalize_entities
|
|
from news_mcp.entity_normalize import normalize_entities
|
|
|
from news_mcp.trends_resolution import resolve_entity_via_trends
|
|
from news_mcp.trends_resolution import resolve_entity_via_trends
|
|
@@ -202,25 +203,97 @@ class SQLiteClusterStore:
|
|
|
return json.loads(row[0])
|
|
return json.loads(row[0])
|
|
|
|
|
|
|
|
def get_latest_clusters(self, topic: str, ttl_hours: float, limit: int) -> list[dict]:
|
|
def get_latest_clusters(self, topic: str, ttl_hours: float, limit: int) -> list[dict]:
|
|
|
- cutoff = datetime.now(timezone.utc) - timedelta(hours=ttl_hours)
|
|
|
|
|
- cutoff_iso = cutoff.isoformat()
|
|
|
|
|
|
|
+ """Return newest clusters by *their own* timestamp.
|
|
|
|
|
+
|
|
|
|
|
+ Filtering/sorting by the DB row's `updated_at` can drift away from the
|
|
|
|
|
+ actual event time in `payload.timestamp`.
|
|
|
|
|
+ """
|
|
|
|
|
+
|
|
|
|
|
+ cutoff = datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))
|
|
|
|
|
+ cutoff_ts = cutoff.timestamp()
|
|
|
|
|
+
|
|
|
|
|
+ def _parse_payload_ts(ts: Any) -> float | None:
|
|
|
|
|
+ if not ts:
|
|
|
|
|
+ return None
|
|
|
|
|
+ if isinstance(ts, (int, float)):
|
|
|
|
|
+ return float(ts)
|
|
|
|
|
+ text = str(ts).strip()
|
|
|
|
|
+ try:
|
|
|
|
|
+ dt = datetime.fromisoformat(text.replace('Z', '+00:00'))
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return dt.astimezone(timezone.utc).timestamp()
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+ try:
|
|
|
|
|
+ dt = parsedate_to_datetime(text)
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return dt.astimezone(timezone.utc).timestamp()
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ # Pull a wider candidate set, then filter by payload.timestamp.
|
|
|
with self._conn() as conn:
|
|
with self._conn() as conn:
|
|
|
cur = conn.execute(
|
|
cur = conn.execute(
|
|
|
- "SELECT payload FROM clusters WHERE topic=? AND updated_at >= ? ORDER BY updated_at DESC LIMIT ?",
|
|
|
|
|
- (topic, cutoff_iso, int(limit)),
|
|
|
|
|
|
|
+ "SELECT payload FROM clusters WHERE topic=? LIMIT ?",
|
|
|
|
|
+ (topic, int(max(200, limit) * 10)),
|
|
|
)
|
|
)
|
|
|
- rows = [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
- return rows
|
|
|
|
|
|
|
+ candidates = [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
+
|
|
|
|
|
+ filtered: list[dict] = []
|
|
|
|
|
+ for c in candidates:
|
|
|
|
|
+ ts = _parse_payload_ts(c.get("timestamp"))
|
|
|
|
|
+ if ts is None:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if ts >= cutoff_ts:
|
|
|
|
|
+ filtered.append(c)
|
|
|
|
|
+
|
|
|
|
|
+ filtered.sort(key=lambda c: _parse_payload_ts(c.get("timestamp")) or 0.0, reverse=True)
|
|
|
|
|
+ return filtered[: int(limit)]
|
|
|
|
|
|
|
|
def get_latest_clusters_all_topics(self, ttl_hours: float, limit: int) -> list[dict]:
|
|
def get_latest_clusters_all_topics(self, ttl_hours: float, limit: int) -> list[dict]:
|
|
|
- cutoff = datetime.now(timezone.utc) - timedelta(hours=ttl_hours)
|
|
|
|
|
- cutoff_iso = cutoff.isoformat()
|
|
|
|
|
|
|
+ cutoff = datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))
|
|
|
|
|
+ cutoff_ts = cutoff.timestamp()
|
|
|
|
|
+
|
|
|
|
|
+ def _parse_payload_ts(ts: Any) -> float | None:
|
|
|
|
|
+ if not ts:
|
|
|
|
|
+ return None
|
|
|
|
|
+ if isinstance(ts, (int, float)):
|
|
|
|
|
+ return float(ts)
|
|
|
|
|
+ text = str(ts).strip()
|
|
|
|
|
+ try:
|
|
|
|
|
+ dt = datetime.fromisoformat(text.replace('Z', '+00:00'))
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return dt.astimezone(timezone.utc).timestamp()
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+ try:
|
|
|
|
|
+ dt = parsedate_to_datetime(text)
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return dt.astimezone(timezone.utc).timestamp()
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
with self._conn() as conn:
|
|
with self._conn() as conn:
|
|
|
cur = conn.execute(
|
|
cur = conn.execute(
|
|
|
- "SELECT payload FROM clusters WHERE updated_at >= ? ORDER BY updated_at DESC LIMIT ?",
|
|
|
|
|
- (cutoff_iso, int(limit)),
|
|
|
|
|
|
|
+ "SELECT payload FROM clusters LIMIT ?",
|
|
|
|
|
+ (int(max(500, limit) * 10),),
|
|
|
)
|
|
)
|
|
|
- return [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
|
|
+ candidates = [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
+
|
|
|
|
|
+ filtered: list[dict] = []
|
|
|
|
|
+ for c in candidates:
|
|
|
|
|
+ ts = _parse_payload_ts(c.get("timestamp"))
|
|
|
|
|
+ if ts is None:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if ts >= cutoff_ts:
|
|
|
|
|
+ filtered.append(c)
|
|
|
|
|
+
|
|
|
|
|
+ filtered.sort(key=lambda c: _parse_payload_ts(c.get("timestamp")) or 0.0, reverse=True)
|
|
|
|
|
+ return filtered[: int(limit)]
|
|
|
|
|
|
|
|
def get_cluster_by_id(self, cluster_id: str) -> dict | None:
|
|
def get_cluster_by_id(self, cluster_id: str) -> dict | None:
|
|
|
with self._conn() as conn:
|
|
with self._conn() as conn:
|