| 123456789101112131415161718192021222324252627282930313233343536373839 |
- from __future__ import annotations
- from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity
def test_cosine_similarity_identical_vectors_is_one():
    """Check that a vector compared with an equal vector scores a similarity of 1."""
    import math

    left = [1.0, 2.0, 3.0]
    right = [1.0, 2.0, 3.0]
    similarity = cosine_similarity(left, right)
    # NOTE(review): cosine_similarity is defined in another module; if it divides
    # by square-rooted norms the result can land a rounding error away from 1.0,
    # so compare with a tight tolerance instead of exact float equality.
    assert math.isclose(similarity, 1.0, rel_tol=1e-9)
def test_cosine_similarity_orthogonal_vectors_is_zero():
    """Check that two perpendicular axis vectors score a similarity of exactly 0.0."""
    x_axis = [1.0, 0.0]
    y_axis = [0.0, 1.0]
    similarity = cosine_similarity(x_axis, y_axis)
    assert similarity == 0.0
def test_cosine_similarity_rejects_shape_mismatch():
    """Check that vectors of different lengths score 0.0."""
    two_dims, one_dim = [1.0, 2.0], [1.0]
    score = cosine_similarity(two_dims, one_dim)
    assert score == 0.0
def test_cluster_candidate_requires_topic_match_when_enabled():
    """Check that a topic mismatch disqualifies the cluster when require_topic_match=True."""
    strict_rules = CandidateRules(require_topic_match=True)
    incoming = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    existing = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Iran"],
    }
    # Timestamps and entities are identical; only the topic ("crypto" vs "macro") differs.
    is_candidate = cluster_is_candidate(
        incoming, existing, article_topic="crypto", rules=strict_rules
    )
    assert not is_candidate
def test_cluster_candidate_allows_topic_mismatch_when_disabled():
    """Check that a topic mismatch is tolerated when require_topic_match=False."""
    lenient_rules = CandidateRules(require_topic_match=False)
    incoming = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    existing = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Iran"],
    }
    # Same fixture as the strict-topic case; only the rule flag is flipped.
    is_candidate = cluster_is_candidate(
        incoming, existing, article_topic="crypto", rules=lenient_rules
    )
    assert is_candidate
def test_cluster_candidate_requires_entity_overlap():
    """Check that a cluster sharing no entities is rejected when require_entity_overlap=1."""
    overlap_rules = CandidateRules(require_topic_match=False, require_entity_overlap=1)
    incoming = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    existing = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Israel"],
    }
    # Topic and timestamp agree; the entity lists ("Iran" vs "Israel") are disjoint.
    is_candidate = cluster_is_candidate(
        incoming, existing, article_topic="macro", rules=overlap_rules
    )
    assert not is_candidate
def test_cluster_candidate_accepts_overlap_and_recency():
    """Check that a cluster with a shared entity, one hour older, is accepted within 24 hours."""
    rules = CandidateRules(
        require_topic_match=False,
        require_entity_overlap=1,
        max_age_hours=24,
    )
    incoming = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran", "US"]}
    existing = {
        "topic": "macro",
        "timestamp": "2026-03-31T23:00:00Z",
        "entities": ["Iran"],
    }
    # "Iran" appears in both entity lists and the timestamps are one hour apart.
    is_candidate = cluster_is_candidate(
        incoming, existing, article_topic="macro", rules=rules
    )
    assert is_candidate
|