| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- from __future__ import annotations
- from dataclasses import dataclass
- from datetime import datetime, timezone, timedelta
- from math import sqrt
- from typing import Any
@dataclass(frozen=True)
class CandidateRules:
    """Thresholds for the inexpensive pre-filter applied before vector comparison.

    Instances are immutable (frozen dataclass), so one rules object can be
    shared freely between calls.
    """

    # When True, a caller-supplied article topic must equal the cluster's
    # topic (compared case-insensitively after stripping whitespace).
    require_topic_match: bool = True

    # Minimum number of shared entities. Only enforced when both the article
    # and the cluster actually list entities.
    require_entity_overlap: int = 1

    # Largest allowed gap, in hours, between the article timestamp and the
    # cluster timestamp. Only enforced when both timestamps can be parsed.
    max_age_hours: int = 72
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``, limited to the
        closed interval [-1.0, 1.0]. Returns 0.0 when either vector is
        empty, the lengths differ, or either vector has zero magnitude.
    """
    if not a or not b or len(a) != len(b):
        return 0.0

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sqrt(sum(x * x for x in a))
    norm_b = sqrt(sum(y * y for y in b))

    # Test the product rather than each norm: it is zero whenever either norm
    # is zero, and it also covers the product itself underflowing to zero.
    denominator = norm_a * norm_b
    if denominator == 0.0:
        return 0.0

    similarity = dot / denominator

    # Floating-point rounding can yield values such as 1.0000000000000002 for
    # parallel vectors, which breaks callers that feed the result to acos() or
    # compare against 1.0. Explicit comparisons (instead of min/max) let a NaN
    # pass through unchanged rather than being silently turned into 1.0.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
- def _to_dt(value: Any) -> datetime | None:
- if not value:
- return None
- if isinstance(value, datetime):
- return value
- try:
- s = str(value).replace("Z", "+00:00")
- dt = datetime.fromisoformat(s)
- if dt.tzinfo is None:
- return dt.replace(tzinfo=timezone.utc)
- return dt
- except Exception:
- return None
def _entity_keys(record: dict[str, Any]) -> set[str]:
    """Return the record's ``entities`` as stripped, lower-cased, non-empty strings."""
    normalized = (str(entity).strip().lower() for entity in record.get("entities") or ())
    return {key for key in normalized if key}


def _as_aware_utc(moment: datetime) -> datetime:
    """Return ``moment`` unchanged if timezone-aware, else the same wall time tagged UTC."""
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment


def cluster_is_candidate(
    article: dict[str, Any],
    cluster: dict[str, Any],
    *,
    rules: CandidateRules | None = None,
    article_topic: str | None = None,
) -> bool:
    """Decide whether ``cluster`` is worth comparing against ``article``.

    Three inexpensive gates are applied in order. A gate is skipped, not
    failed, when the data it needs is absent.

    1. Topic: if ``rules.require_topic_match`` is set and ``article_topic`` is
       given, it must equal ``cluster["topic"]`` ignoring case and surrounding
       whitespace. The article dict itself is not consulted for a topic.
    2. Entities: if both records list entities, they must share at least
       ``rules.require_entity_overlap`` of them (case-insensitive).
    3. Age: if both timestamps parse, they must lie within
       ``rules.max_age_hours`` of each other. The cluster side uses
       ``last_updated`` and falls back to ``timestamp``.

    Args:
        article: Article record; reads the ``entities`` and ``timestamp`` keys.
        cluster: Cluster record; reads ``topic``, ``entities``,
            ``last_updated`` and ``timestamp``.
        rules: Thresholds to apply; defaults to ``CandidateRules()``.
        article_topic: Topic label for the article, if known.

    Returns:
        ``True`` when no gate rejects the pair, ``False`` otherwise.
    """
    if rules is None:
        rules = CandidateRules()

    if rules.require_topic_match and article_topic is not None:
        wanted_topic = str(article_topic).strip().lower()
        raw_cluster_topic = cluster.get("topic")
        # A topic stored as None must behave like a missing topic; passing it
        # through str() would yield "none" and could match by accident.
        cluster_topic = "" if raw_cluster_topic is None else str(raw_cluster_topic).strip().lower()
        if wanted_topic != cluster_topic:
            return False

    # Require some overlap in extracted entities if both sides have them.
    article_entities = _entity_keys(article)
    cluster_entities = _entity_keys(cluster)
    if article_entities and cluster_entities:
        shared = len(article_entities & cluster_entities)
        if shared < rules.require_entity_overlap:
            return False

    # Age gate: keep comparisons within a recent window.
    article_dt = _to_dt(article.get("timestamp"))
    cluster_dt = _to_dt(cluster.get("last_updated") or cluster.get("timestamp"))
    if article_dt is not None and cluster_dt is not None:
        # Subtracting a naive datetime from an aware one raises TypeError, so
        # put both on the same footing before measuring the gap.
        gap = abs(_as_aware_utc(article_dt) - _as_aware_utc(cluster_dt))
        if gap > timedelta(hours=rules.max_age_hours):
            return False

    return True
|