from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
from math import sqrt
from typing import Any


@dataclass(frozen=True)
class CandidateRules:
    """Cheap, non-embedding filters before we compare vectors."""

    # When True (and the caller supplies an article topic), the cluster's
    # topic must equal it after trimming and lower-casing.
    require_topic_match: bool = True
    # Minimum number of shared entities; only enforced when both the article
    # and the cluster carry at least one non-blank entity.
    require_entity_overlap: int = 1
    # Largest allowed gap, in hours, between the article and cluster
    # timestamps; only enforced when both timestamps can be parsed.
    max_age_hours: int = 72


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Returns 0.0 when either vector is empty, when the lengths differ, or
    when either vector has zero magnitude (the ratio is undefined there).
    """
    if not a or not b or len(a) != len(b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sqrt(sum(x * x for x in a))
    norm_b = sqrt(sum(y * y for y in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)


def _to_dt(value: Any) -> datetime | None:
    """Coerce a timestamp-like value to a timezone-aware datetime.

    Accepts a ``datetime`` instance or anything whose ``str()`` is an ISO-8601
    string (a ``Z`` suffix is rewritten to ``+00:00``). Naive results are
    assumed to be UTC. Returns ``None`` for falsy or unparseable input.
    """
    if not value:
        return None
    if isinstance(value, datetime):
        dt = value
    else:
        try:
            dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
        except (ValueError, TypeError, OverflowError):
            return None
    # Always hand back an aware datetime: subtracting a naive datetime from
    # an aware one raises TypeError, so both input paths must be normalized.
    if dt.tzinfo is None or dt.utcoffset() is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt


def _entity_set(record: dict[str, Any]) -> set[str]:
    """Return the record's entities trimmed, lower-cased, and de-duplicated."""
    trimmed = (str(entity).strip() for entity in (record.get("entities") or []))
    return {name.lower() for name in trimmed if name}


def cluster_is_candidate(
    article: dict[str, Any],
    cluster: dict[str, Any],
    *,
    rules: CandidateRules | None = None,
    article_topic: str | None = None,
) -> bool:
    """Decide whether ``cluster`` is worth comparing against ``article``.

    Three cheap gates are applied in order; failing any one returns False:

    1. Topic: if ``rules.require_topic_match`` is set and ``article_topic`` is
       given, it must equal ``cluster["topic"]`` (trimmed, case-insensitive).
    2. Entities: if both sides have entities, they must share at least
       ``rules.require_entity_overlap`` of them.
    3. Age: if both timestamps parse, they must be no more than
       ``rules.max_age_hours`` apart. The cluster timestamp is read from
       ``last_updated``, falling back to ``timestamp``.

    A gate whose inputs are missing is skipped rather than failed.

    Args:
        article: Mapping read for the ``entities`` and ``timestamp`` keys.
        cluster: Mapping read for ``topic``, ``entities``, ``last_updated``
            and ``timestamp``.
        rules: Filter thresholds; defaults to ``CandidateRules()``.
        article_topic: Topic of the article, if known.

    Returns:
        True when no gate rejects the pair.
    """
    rules = rules or CandidateRules()

    if rules.require_topic_match and article_topic is not None:
        wanted = str(article_topic).strip().lower()
        actual = str(cluster.get("topic", "")).strip().lower()
        if wanted != actual:
            return False

    # Require some overlap in extracted entities if both sides have them.
    article_entities = _entity_set(article)
    cluster_entities = _entity_set(cluster)
    if article_entities and cluster_entities:
        shared = len(article_entities & cluster_entities)
        if shared < rules.require_entity_overlap:
            return False

    # Age gate: keep comparisons within a recent window.
    article_dt = _to_dt(article.get("timestamp"))
    cluster_dt = _to_dt(cluster.get("last_updated") or cluster.get("timestamp"))
    if article_dt and cluster_dt:
        gap = abs(article_dt - cluster_dt)
        if gap > timedelta(hours=rules.max_age_hours):
            return False

    return True