|
@@ -92,33 +92,74 @@ DEFAULT_JACCARD_THRESHOLD = 0.55
|
|
|
|
|
|
|
|
|
|
|
|
|
def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
    """Per-pair similarity signals (title, jaccard, embedding cosine).

    The article is compared against every dict entry in ``cluster["articles"]``
    and the highest title and jaccard scores seen are kept, so a match against
    any single cluster member counts. When the cluster has no ``articles``,
    its ``headline`` (if non-empty) stands in as a one-member list.

    The cosine signal is computed once, from the article's ``_embedding`` and
    the cluster-level ``embedding``; it is 0.0 when either one is missing or
    empty.

    Scanning stops as soon as the running title score reaches 0.95 and the
    running jaccard score reaches 0.80. For such clusters the returned values
    are therefore "already very high" rather than guaranteed maxima over all
    members.

    Args:
        article: Candidate article; reads ``title`` and ``_embedding`` and is
            passed to ``_cluster_text``.
        cluster: Existing cluster; reads ``headline``, ``embedding`` and
            ``articles``.

    Returns:
        A dict with the scores under the keys "title", "jaccard" and "cosine".
    """
    a_title = str(article.get("title") or "")
    c_title = str(cluster.get("headline") or "")

    a_emb = article.get("_embedding")
    c_emb = cluster.get("embedding")
    cosine = cosine_similarity(a_emb, c_emb) if a_emb and c_emb else 0.0

    best_title = 0.0
    best_jaccard = 0.0
    a_text = _cluster_text(article)
    # Tokenise the article once; it is reused for every cluster member.
    a_toks = _tokens(a_text) if a_text else set()

    # Scores at which further scanning cannot change the merge decision much.
    title_saturated = 0.95
    jaccard_saturated = 0.80

    # Compare against every article in the cluster, take the best scores.
    # Fall back to a synthetic member built from the headline when the
    # cluster carries no article list.
    cluster_articles = cluster.get("articles") or ([{"title": c_title}] if c_title else [])
    for ca in cluster_articles:
        if not isinstance(ca, dict):
            continue

        # title signal
        ca_title = str(ca.get("title") or "")
        if a_title and ca_title:
            best_title = max(best_title, _title_similarity(a_title, ca_title))

        # jaccard signal
        ca_text = _cluster_text(ca)
        if a_text and ca_text:
            best_jaccard = max(best_jaccard, _jaccard(a_toks, _tokens(ca_text)))

        # early exit: if both title and jaccard are already very high
        if best_title >= title_saturated and best_jaccard >= jaccard_saturated:
            break

    return {"title": best_title, "jaccard": best_jaccard, "cosine": cosine}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _is_match(
    signals: dict,
    *,
    embeddings_enabled: bool,
    title_threshold: float = DEFAULT_TITLE_THRESHOLD,
    jaccard_threshold: float = DEFAULT_JACCARD_THRESHOLD,
    consensus_cosine_floor: float = 0.80,
    consensus_jaccard_floor: float = 0.30,
    consensus_title_floor: float = 0.55,
) -> tuple[bool, str, float]:
    """Decide whether two items should merge based on the strongest signal.

    Cascade: cosine (if embeddings enabled) -> title -> jaccard -> consensus.
    The first rule that fires wins; later rules are not evaluated.

    Args:
        signals: Mapping with "title", "jaccard" and "cosine" scores.
        embeddings_enabled: When False, the cosine rule and the consensus
            rule are both skipped.
        title_threshold: Minimum title score for a title-only match.
        jaccard_threshold: Minimum jaccard score for a jaccard-only match.
        consensus_cosine_floor: Minimum cosine score for the consensus rule.
        consensus_jaccard_floor: Jaccard score that, together with the cosine
            floor, satisfies the consensus rule.
        consensus_title_floor: Title score that, together with the cosine
            floor, satisfies the consensus rule.

    Returns:
        (matched, signal_name, signal_value). ``signal_name`` is one of
        "cosine", "title", "jaccard", "consensus" or "none". For "consensus"
        the value is the mean of the cosine score and the larger of the
        jaccard/title scores; for "none" it is 0.0.
    """
    cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
    if embeddings_enabled and signals["cosine"] >= cosine_threshold:
        return True, "cosine", signals["cosine"]
    if signals["title"] >= title_threshold:
        return True, "title", signals["title"]
    if signals["jaccard"] >= jaccard_threshold:
        return True, "jaccard", signals["jaccard"]
    # Consensus: no single signal clears its own threshold, but a fairly high
    # cosine is corroborated by a moderate lexical signal.
    if (
        embeddings_enabled
        and signals["cosine"] >= consensus_cosine_floor
        and (
            signals["jaccard"] >= consensus_jaccard_floor
            or signals["title"] >= consensus_title_floor
        )
    ):
        val = (signals["cosine"] + max(signals["jaccard"], signals["title"])) / 2.0
        return True, "consensus", val
    return False, "none", 0.0
|
|
|
|
|
|
|
|
|
|
|
|
@@ -388,28 +429,12 @@ def dedup_and_cluster_articles(
|
|
|
best_signal_value = 0.0
|
|
best_signal_value = 0.0
|
|
|
for idx, c in enumerate(clusters):
|
|
for idx, c in enumerate(clusters):
|
|
|
sigs = _signals(a_with_emb, c)
|
|
sigs = _signals(a_with_emb, c)
|
|
|
- local_match = False
|
|
|
|
|
- if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
|
|
|
|
|
- local_match = True
|
|
|
|
|
- signal_name, signal_value = "cosine", sigs["cosine"]
|
|
|
|
|
- elif sigs["title"] >= title_threshold:
|
|
|
|
|
- local_match = True
|
|
|
|
|
- signal_name, signal_value = "title", sigs["title"]
|
|
|
|
|
- elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
|
|
|
|
|
- local_match = True
|
|
|
|
|
- signal_name, signal_value = "jaccard", sigs["jaccard"]
|
|
|
|
|
- elif (
|
|
|
|
|
- NEWS_EMBEDDINGS_ENABLED
|
|
|
|
|
- and sigs["cosine"] >= 0.80
|
|
|
|
|
- and (sigs["jaccard"] >= 0.30 or sigs["title"] >= 0.55)
|
|
|
|
|
- ):
|
|
|
|
|
- local_match = True
|
|
|
|
|
- signal_name = "consensus"
|
|
|
|
|
- signal_value = (sigs["cosine"] + max(sigs["jaccard"], sigs["title"])) / 2.0
|
|
|
|
|
- else:
|
|
|
|
|
- signal_name, signal_value = "none", max(sigs["title"], sigs["jaccard"], sigs["cosine"])
|
|
|
|
|
-
|
|
|
|
|
- if local_match and signal_value > best_signal_value:
|
|
|
|
|
|
|
+ matched, signal_name, signal_value = _is_match(
|
|
|
|
|
+ sigs,
|
|
|
|
|
+ embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
|
|
|
|
|
+ title_threshold=title_threshold,
|
|
|
|
|
+ )
|
|
|
|
|
+ if matched and signal_value > best_signal_value:
|
|
|
best_idx = idx
|
|
best_idx = idx
|
|
|
best_signal_name = signal_name
|
|
best_signal_name = signal_name
|
|
|
best_signal_value = signal_value
|
|
best_signal_value = signal_value
|
|
@@ -422,6 +447,11 @@ def dedup_and_cluster_articles(
|
|
|
if a.get("source") and a["source"] not in c["sources"]:
|
|
if a.get("source") and a["source"] not in c["sources"]:
|
|
|
c["sources"].append(a["source"])
|
|
c["sources"].append(a["source"])
|
|
|
c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
|
|
c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
|
|
|
|
|
+ # Update cluster embedding to the new article's embedding so later
|
|
|
|
|
+ # comparisons can match against the most recently added content.
|
|
|
|
|
+ if NEWS_EMBEDDINGS_ENABLED and article_embedding is not None:
|
|
|
|
|
+ c["embedding"] = article_embedding
|
|
|
|
|
+ c["embedding_model"] = "ollama:nomic-embed-text"
|
|
|
c.setdefault("_merge_signals", []).append(
|
|
c.setdefault("_merge_signals", []).append(
|
|
|
{"signal": best_signal_name, "value": round(best_signal_value, 3)}
|
|
{"signal": best_signal_name, "value": round(best_signal_value, 3)}
|
|
|
)
|
|
)
|