test_embedding_support.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. from __future__ import annotations
  2. from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity
  3. def test_cosine_similarity_identical_vectors_is_one():
  4. assert cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]) == 1.0
  5. def test_cosine_similarity_orthogonal_vectors_is_zero():
  6. assert cosine_similarity([1.0, 0.0], [0.0, 1.0]) == 0.0
  7. def test_cosine_similarity_rejects_shape_mismatch():
  8. assert cosine_similarity([1.0, 2.0], [1.0]) == 0.0
  9. def test_cluster_candidate_requires_topic_match_when_enabled():
  10. article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
  11. cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
  12. assert not cluster_is_candidate(article, cluster, article_topic="crypto", rules=CandidateRules(require_topic_match=True))
  13. def test_cluster_candidate_allows_topic_mismatch_when_disabled():
  14. article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
  15. cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
  16. assert cluster_is_candidate(article, cluster, article_topic="crypto", rules=CandidateRules(require_topic_match=False))
  17. def test_cluster_candidate_requires_entity_overlap():
  18. article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
  19. cluster = {"topic": "macro", "timestamp": "2026-04-01T00:00:00Z", "entities": ["Israel"]}
  20. assert not cluster_is_candidate(article, cluster, article_topic="macro", rules=CandidateRules(require_topic_match=False, require_entity_overlap=1))
  21. def test_cluster_candidate_accepts_overlap_and_recency():
  22. article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran", "US"]}
  23. cluster = {"topic": "macro", "timestamp": "2026-03-31T23:00:00Z", "entities": ["Iran"]}
  24. assert cluster_is_candidate(article, cluster, article_topic="macro", rules=CandidateRules(require_topic_match=False, require_entity_overlap=1, max_age_hours=24))