"""Tests for ``cosine_similarity`` and ``cluster_is_candidate``."""

from __future__ import annotations

import math

from news_mcp.dedup.embedding_support import (
    CandidateRules,
    cluster_is_candidate,
    cosine_similarity,
)


def test_cosine_similarity_identical_vectors_is_one():
    """Identical vectors score 1.0, compared within floating-point tolerance."""
    score = cosine_similarity([1.0, 2.0, 3.0], [1.0, 2.0, 3.0])
    # The norm/dot-product arithmetic is not guaranteed to round to exactly
    # 1.0, so an ``==`` comparison could fail on a correct implementation.
    assert math.isclose(score, 1.0, rel_tol=1e-9)


def test_cosine_similarity_orthogonal_vectors_is_zero():
    """Orthogonal vectors score 0.0, compared within floating-point tolerance."""
    score = cosine_similarity([1.0, 0.0], [0.0, 1.0])
    # A relative tolerance is meaningless against 0, so use an absolute one.
    assert math.isclose(score, 0.0, abs_tol=1e-12)


def test_cosine_similarity_rejects_shape_mismatch():
    """Vectors of different lengths yield 0.0."""
    # Exact comparison is kept here: the expected value is a sentinel for
    # mismatched input, not the result of floating-point arithmetic.
    assert cosine_similarity([1.0, 2.0], [1.0]) == 0.0


def test_cluster_candidate_requires_topic_match_when_enabled():
    """A differing topic disqualifies the cluster when topic match is required."""
    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    cluster = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Iran"],
    }
    rules = CandidateRules(require_topic_match=True)

    assert not cluster_is_candidate(
        article, cluster, article_topic="crypto", rules=rules
    )


def test_cluster_candidate_allows_topic_mismatch_when_disabled():
    """A differing topic is accepted when topic match is not required."""
    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    cluster = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Iran"],
    }
    rules = CandidateRules(require_topic_match=False)

    assert cluster_is_candidate(
        article, cluster, article_topic="crypto", rules=rules
    )


def test_cluster_candidate_requires_entity_overlap():
    """Disjoint entity lists disqualify the cluster when overlap of 1 is required."""
    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran"]}
    cluster = {
        "topic": "macro",
        "timestamp": "2026-04-01T00:00:00Z",
        "entities": ["Israel"],
    }
    rules = CandidateRules(require_topic_match=False, require_entity_overlap=1)

    assert not cluster_is_candidate(
        article, cluster, article_topic="macro", rules=rules
    )


def test_cluster_candidate_accepts_overlap_and_recency():
    """A cluster sharing one entity, stamped one hour earlier, is accepted.

    The rules used are ``require_entity_overlap=1`` and ``max_age_hours=24``.
    """
    article = {"timestamp": "2026-04-01T00:00:00Z", "entities": ["Iran", "US"]}
    cluster = {
        "topic": "macro",
        "timestamp": "2026-03-31T23:00:00Z",
        "entities": ["Iran"],
    }
    rules = CandidateRules(
        require_topic_match=False,
        require_entity_overlap=1,
        max_age_hours=24,
    )

    assert cluster_is_candidate(
        article, cluster, article_topic="macro", rules=rules
    )