Ver código fonte

tests: add embedding support guards for clustering

Lukas Goldschmidt 1 mês atrás
pai
commit
c984d1f589
2 arquivos alterados com 120 adições e 0 exclusões
  1. 81 0
      news_mcp/dedup/embedding_support.py
  2. 39 0
      test_embedding_support.py

+ 81 - 0
news_mcp/dedup/embedding_support.py

@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from datetime import datetime, timezone, timedelta
+from math import sqrt
+from typing import Any
+
+
+@dataclass(frozen=True)
+class CandidateRules:
+    """Cheap, non-embedding filters applied before vectors are compared.
+
+    Instances are immutable (``frozen=True``). ``cluster_is_candidate``
+    builds one with these defaults when the caller passes no rules.
+    """
+
+    # When true, a supplied article topic must equal the cluster's topic.
+    require_topic_match: bool = True
+    # Minimum number of shared entities; only enforced when both the article
+    # and the cluster actually list entities.
+    require_entity_overlap: int = 1
+    # Largest allowed gap, in hours, between article and cluster timestamps.
+    max_age_hours: int = 72
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    if not a or not b or len(a) != len(b):
+        return 0.0
+    dot = sum(x * y for x, y in zip(a, b))
+    na = sqrt(sum(x * x for x in a))
+    nb = sqrt(sum(y * y for y in b))
+    if na == 0.0 or nb == 0.0:
+        return 0.0
+    return dot / (na * nb)
+
+
+def _to_dt(value: Any) -> datetime | None:
+    if not value:
+        return None
+    if isinstance(value, datetime):
+        return value
+    try:
+        s = str(value).replace("Z", "+00:00")
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            return dt.replace(tzinfo=timezone.utc)
+        return dt
+    except Exception:
+        return None
+
+
+def cluster_is_candidate(
+    article: dict[str, Any],
+    cluster: dict[str, Any],
+    *,
+    rules: CandidateRules | None = None,
+    article_topic: str | None = None,
+) -> bool:
+    rules = rules or CandidateRules()
+
+    if rules.require_topic_match and article_topic is not None:
+        if str(article_topic).strip().lower() != str(cluster.get("topic", "")).strip().lower():
+            return False
+
+    # Require some overlap in extracted entities if both sides have them.
+    article_entities = {
+        str(e).strip().lower()
+        for e in (article.get("entities", []) or [])
+        if str(e).strip()
+    }
+    cluster_entities = {
+        str(e).strip().lower()
+        for e in (cluster.get("entities", []) or [])
+        if str(e).strip()
+    }
+    if article_entities and cluster_entities:
+        overlap = len(article_entities & cluster_entities)
+        if overlap < rules.require_entity_overlap:
+            return False
+
+    # Age gate: keep comparisons within a recent window.
+    article_dt = _to_dt(article.get("timestamp"))
+    cluster_dt = _to_dt(cluster.get("last_updated") or cluster.get("timestamp"))
+    if article_dt and cluster_dt:
+        age = abs(article_dt - cluster_dt)
+        if age > timedelta(hours=rules.max_age_hours):
+            return False
+
+    return True

+ 39 - 0
test_embedding_support.py

@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity
+
+
+def test_cosine_similarity_identical_vectors_is_one():
+    assert cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) == 1.0
+
+
+def test_cosine_similarity_orthogonal_vectors_is_zero():
+    assert cosine_similarity([1.0, 0.0], [0.0, 1.0]) == 0.0
+
+
+def test_cosine_similarity_rejects_shape_mismatch():
+    assert cosine_similarity([1.0, 2.0], [1.0]) == 0.0
+
+
+def test_cluster_candidate_requires_topic_match_when_enabled():
+    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
+    cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
+    assert not cluster_is_candidate(article, cluster, article_topic="crypto", rules=CandidateRules(require_topic_match=True))
+
+
+def test_cluster_candidate_allows_topic_mismatch_when_disabled():
+    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
+    cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
+    assert cluster_is_candidate(article, cluster, article_topic="crypto", rules=CandidateRules(require_topic_match=False))
+
+
+def test_cluster_candidate_requires_entity_overlap():
+    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
+    cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Israel"]}
+    assert not cluster_is_candidate(article, cluster, article_topic="macro", rules=CandidateRules(require_topic_match=False, require_entity_overlap=1))
+
+
+def test_cluster_candidate_accepts_overlap_and_recency():
+    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran", "US"]}
+    cluster = {"topic": "macro", "timestamp": "2026-03-31T23:00:00Z", "entities": ["Iran"]}
+    assert cluster_is_candidate(article, cluster, article_topic="macro", rules=CandidateRules(require_topic_match=False, require_entity_overlap=1, max_age_hours=24))