|
@@ -0,0 +1,81 @@
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+from dataclasses import dataclass
|
|
|
|
|
+from datetime import datetime, timezone, timedelta
|
|
|
|
|
+from math import sqrt
|
|
|
|
|
+from typing import Any
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
@dataclass(frozen=True)
class CandidateRules:
    """Inexpensive pre-filters applied before any embedding comparison.

    Instances are immutable (``frozen=True``).
    """

    # When True (and the caller supplies an article topic), the cluster's
    # topic must equal it, ignoring case and surrounding whitespace.
    require_topic_match: bool = True

    # Minimum number of shared entities; only enforced when both the article
    # and the cluster list at least one entity.
    require_entity_overlap: int = 1

    # Largest allowed gap, in hours, between the article timestamp and the
    # cluster's last update.
    max_age_hours: int = 72
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of *a* and *b*, bounded to [-1.0, 1.0].

    Args:
        a: First vector.
        b: Second vector; must have the same length as *a* to be comparable.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. ``0.0`` is returned when either vector
        is empty, the lengths differ, or either vector has zero magnitude.
    """
    if not a or not b or len(a) != len(b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    na = sqrt(sum(x * x for x in a))
    nb = sqrt(sum(y * y for y in b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    similarity = dot / (na * nb)
    # Floating-point rounding can push the ratio a hair outside the
    # mathematical range (e.g. [1, 1, 1] vs itself gives 1.0000000000000002),
    # which breaks callers that feed the value to acos() or compare against
    # 1.0. Explicit comparisons (rather than min/max) leave NaN untouched.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _to_dt(value: Any) -> datetime | None:
    """Coerce *value* to a timezone-aware ``datetime``, or return ``None``.

    Args:
        value: A ``datetime`` instance, or any object whose ``str()`` form is
            an ISO-8601 timestamp. ``Z`` is accepted as a UTC designator.

    Returns:
        A timezone-aware ``datetime``. Values without timezone information
        (naive ``datetime`` objects as well as naive parsed strings) are
        taken to be UTC, so any two results can be subtracted without a
        naive/aware ``TypeError``. ``None`` is returned for falsy input and
        for text that is not a valid ISO-8601 timestamp.
    """
    if not value:
        return None
    if isinstance(value, datetime):
        dt = value
    else:
        try:
            # fromisoformat() only understands "Z" natively on newer Python
            # versions, so spell the UTC offset out explicitly.
            dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
        except (TypeError, ValueError):
            return None
    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def cluster_is_candidate(
    article: dict[str, Any],
    cluster: dict[str, Any],
    *,
    rules: CandidateRules | None = None,
    article_topic: str | None = None,
) -> bool:
    """Decide whether *cluster* is worth a vector comparison with *article*.

    Three inexpensive gates are applied in order; the first one that fails
    rejects the cluster.

    Args:
        article: Article record; the ``"entities"`` and ``"timestamp"`` keys
            are read.
        cluster: Cluster record; the ``"topic"``, ``"entities"``,
            ``"last_updated"`` and ``"timestamp"`` keys are read.
        rules: Thresholds to apply; a default ``CandidateRules()`` is used
            when omitted.
        article_topic: Topic of the article. The topic gate is skipped when
            this is ``None``.

    Returns:
        ``True`` if every applicable gate passes, ``False`` otherwise.
    """
    active = rules or CandidateRules()

    def _normalized(raw: Any) -> set[str]:
        # Lower-cased, whitespace-trimmed entity names; blanks are dropped.
        names = set()
        for item in raw or []:
            text = str(item).strip()
            if text:
                names.add(text.lower())
        return names

    # Gate 1: topic equality, ignoring case and surrounding whitespace.
    if active.require_topic_match and article_topic is not None:
        wanted = str(article_topic).strip().lower()
        found = str(cluster.get("topic", "")).strip().lower()
        if wanted != found:
            return False

    # Gate 2: entity overlap, enforced only when both sides have entities.
    ours = _normalized(article.get("entities"))
    theirs = _normalized(cluster.get("entities"))
    if ours and theirs:
        shared = ours & theirs
        if len(shared) < active.require_entity_overlap:
            return False

    # Gate 3: recency, enforced only when both timestamps are usable.
    seen_at = _to_dt(article.get("timestamp"))
    updated_at = _to_dt(cluster.get("last_updated") or cluster.get("timestamp"))
    if not (seen_at and updated_at):
        return True
    window = timedelta(hours=active.max_age_hours)
    return abs(seen_at - updated_at) <= window
|