|
@@ -1,5 +1,6 @@
|
|
|
from __future__ import annotations
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
+import asyncio
|
|
|
import hashlib
|
|
import hashlib
|
|
|
import re
|
|
import re
|
|
|
from difflib import SequenceMatcher
|
|
from difflib import SequenceMatcher
|
|
@@ -18,7 +19,6 @@ from news_mcp.sources.news_feeds import normalize_topic_from_title
|
|
|
|
|
|
|
|
def _normalize_title(title: str) -> str:
|
|
def _normalize_title(title: str) -> str:
|
|
|
t = title.lower().strip()
|
|
t = title.lower().strip()
|
|
|
- # Remove punctuation-ish characters for similarity scoring.
|
|
|
|
|
t = re.sub(r"[^a-z0-9\s]", " ", t)
|
|
t = re.sub(r"[^a-z0-9\s]", " ", t)
|
|
|
t = re.sub(r"\s+", " ", t).strip()
|
|
t = re.sub(r"\s+", " ", t).strip()
|
|
|
return t
|
|
return t
|
|
@@ -48,12 +48,9 @@ def _cluster_text(a: Dict[str, Any]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
|
|
|
-# Token / Jaccard signal (used as a fallback alongside title similarity when
|
|
|
|
|
-# embeddings are unavailable, and as a soft signal even when they are).
|
|
|
|
|
|
|
+# Token / Jaccard signal
|
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
-# Tiny stop-word set — we keep it small on purpose because the corpus is news
|
|
|
|
|
-# headlines, where every additional removal risks losing genuine signal.
|
|
|
|
|
_STOPWORDS = frozenset(
|
|
_STOPWORDS = frozenset(
|
|
|
{
|
|
{
|
|
|
"a", "an", "the", "of", "to", "in", "on", "at", "for", "by", "with",
|
|
"a", "an", "the", "of", "to", "in", "on", "at", "for", "by", "with",
|
|
@@ -68,7 +65,6 @@ _STOPWORDS = frozenset(
|
|
|
|
|
|
|
|
|
|
|
|
|
def _tokens(text: str) -> set[str]:
    """Collect the distinct content tokens found in *text*.

    *text* is lowercased first. A token is an alphanumeric character followed
    by one or more alphanumerics or hyphens; tokens shorter than three
    characters and tokens listed in ``_STOPWORDS`` are discarded.
    """
    kept: set[str] = set()
    for word in re.findall(r"[a-z0-9][a-z0-9\-]+", text.lower()):
        if len(word) < 3 or word in _STOPWORDS:
            continue
        kept.add(word)
    return kept
|
|
|
|
|
|
|
@@ -86,22 +82,12 @@ def _jaccard(a: set, b: set) -> float:
|
|
|
# Composite similarity
|
|
# Composite similarity
|
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
-
|
|
|
|
|
-# Each signal has its own threshold. We accept a merge if ANY signal clears its
|
|
|
|
|
-# threshold, which makes clustering robust when one signal happens to be weak
|
|
|
|
|
-# (short headlines kill SequenceMatcher; single-word stories kill Jaccard;
|
|
|
|
|
-# Ollama outages kill cosine similarity).
|
|
|
|
|
# Default per-signal cut-offs used when deciding whether an article joins an
# existing cluster. The title threshold applies when the caller supplies no
# explicit similarity threshold; the Jaccard threshold is the minimum token
# overlap that counts as a match on its own.
DEFAULT_TITLE_THRESHOLD = 0.87
DEFAULT_JACCARD_THRESHOLD = 0.55
|
|
|
|
|
|
|
|
|
|
|
|
|
def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
|
|
def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
|
|
|
- """Per-pair similarity signals (title, jaccard, embedding cosine).
|
|
|
|
|
-
|
|
|
|
|
- Embedding cosine is only computed when both sides have a vector; we never
|
|
|
|
|
- block on a fresh Ollama request here — that's the caller's job, so this
|
|
|
|
|
- function stays pure and easy to test.
|
|
|
|
|
- """
|
|
|
|
|
|
|
+ """Per-pair similarity signals (title, jaccard, embedding cosine)."""
|
|
|
a_title = str(article.get("title") or "")
|
|
a_title = str(article.get("title") or "")
|
|
|
c_title = str(cluster.get("headline") or "")
|
|
c_title = str(cluster.get("headline") or "")
|
|
|
|
|
|
|
@@ -120,11 +106,7 @@ def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, float]:
|
|
def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, float]:
|
|
|
- """Decide whether two items should merge based on the strongest signal.
|
|
|
|
|
-
|
|
|
|
|
- Returns (matched, signal_name, signal_value). The signal_name lets callers
|
|
|
|
|
- log *why* something merged, which is huge for debugging clustering quality.
|
|
|
|
|
- """
|
|
|
|
|
|
|
+ """Decide whether two items should merge based on the strongest signal."""
|
|
|
cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
|
|
cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
|
|
|
if embeddings_enabled and signals["cosine"] >= cosine_threshold:
|
|
if embeddings_enabled and signals["cosine"] >= cosine_threshold:
|
|
|
return True, "cosine", signals["cosine"]
|
|
return True, "cosine", signals["cosine"]
|
|
@@ -136,7 +118,65 @@ def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, fl
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
|
|
|
-# Public API
|
|
|
|
|
|
|
+# Embedding pre-computation (async internally)
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+async def _compute_embeddings_concurrently(
|
|
|
|
|
+ articles: List[Dict[str, Any]],
|
|
|
|
|
+) -> Dict[str, list[float] | None]:
|
|
|
|
|
+ """Compute embeddings for unique article texts concurrently.
|
|
|
|
|
+
|
|
|
|
|
+ Returns a cache dict: text -> embedding vector or None.
|
|
|
|
|
+ """
|
|
|
|
|
+ unique_texts: list[str] = []
|
|
|
|
|
+ seen: set[str] = set()
|
|
|
|
|
+ for a in articles:
|
|
|
|
|
+ text = _cluster_text(a)
|
|
|
|
|
+ if text and text not in seen:
|
|
|
|
|
+ seen.add(text)
|
|
|
|
|
+ unique_texts.append(text)
|
|
|
|
|
+
|
|
|
|
|
+ emb_tasks = [ollama_embed(text) for text in unique_texts]
|
|
|
|
|
+ emb_results = await asyncio.gather(*emb_tasks, return_exceptions=True)
|
|
|
|
|
+
|
|
|
|
|
+ cache: Dict[str, list[float] | None] = {}
|
|
|
|
|
+ for text, result in zip(unique_texts, emb_results):
|
|
|
|
|
+ if isinstance(result, list):
|
|
|
|
|
+ cache[text] = result
|
|
|
|
|
+ else:
|
|
|
|
|
+ cache[text] = None
|
|
|
|
|
+ return cache
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _compute_embeddings_sync(
|
|
|
|
|
+ articles: List[Dict[str, Any]],
|
|
|
|
|
+) -> Dict[str, list[float] | None]:
|
|
|
|
|
+ """Synchronous wrapper that runs the async embedding computation.
|
|
|
|
|
+
|
|
|
|
|
+ Handles three cases:
|
|
|
|
|
+ 1. Already inside an async event loop (called from poller) -> schedule
|
|
|
|
|
+ as a task and run it to completion on the running loop.
|
|
|
|
|
+ 2. No event loop at all (plain sync caller) -> use asyncio.run().
|
|
|
|
|
+ """
|
|
|
|
|
+ try:
|
|
|
|
|
+ loop = asyncio.get_running_loop()
|
|
|
|
|
+ except RuntimeError:
|
|
|
|
|
+ # No running loop — safe to use asyncio.run()
|
|
|
|
|
+ return asyncio.run(_compute_embeddings_concurrently(articles))
|
|
|
|
|
+
|
|
|
|
|
+ # We're inside a running event loop (e.g. the poller). Create a new loop
|
|
|
|
|
+ # in a thread to avoid blocking.
|
|
|
|
|
+ import concurrent.futures
|
|
|
|
|
+ with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
|
|
|
+ future = pool.submit(
|
|
|
|
|
+ asyncio.run, _compute_embeddings_concurrently(articles)
|
|
|
|
|
+ )
|
|
|
|
|
+ return future.result()
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+# Public API (sync — backward compatible with tests)
|
|
|
# ---------------------------------------------------------------------------
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
@@ -146,36 +186,23 @@ def dedup_and_cluster_articles(
|
|
|
) -> Dict[str, List[Dict[str, Any]]]:
|
|
) -> Dict[str, List[Dict[str, Any]]]:
|
|
|
"""Deduplicate raw articles into clusters keyed by topic.
|
|
"""Deduplicate raw articles into clusters keyed by topic.
|
|
|
|
|
|
|
|
- v1.1 strategy: composite similarity.
|
|
|
|
|
|
|
+ v1.2: embedding pre-computation is async/concurrent under the hood, but
|
|
|
|
|
+ this public function remains synchronous for backward compatibility.
|
|
|
|
|
+
|
|
|
|
|
+ A pair merges if ANY signal clears its threshold:
|
|
|
* title fuzzy ratio
|
|
* title fuzzy ratio
|
|
|
- * token Jaccard over headline+summary (cheap, surprisingly resilient
|
|
|
|
|
- when titles are reworded heavily across outlets)
|
|
|
|
|
|
|
+ * token Jaccard over headline+summary
|
|
|
* Ollama embedding cosine when available
|
|
* Ollama embedding cosine when available
|
|
|
-
|
|
|
|
|
- A pair merges if ANY signal clears its threshold. Falling back through
|
|
|
|
|
- multiple signals means a transient Ollama outage doesn't collapse the
|
|
|
|
|
- server back into title-only clustering, and a heavily-reworded headline
|
|
|
|
|
- can still merge via Jaccard or embeddings.
|
|
|
|
|
-
|
|
|
|
|
- The ``similarity_threshold`` argument is kept for backward compatibility
|
|
|
|
|
- with the test suite. When provided, it overrides the title threshold.
|
|
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
|
|
title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
|
|
|
|
|
|
|
|
- by_topic: Dict[str, List[Dict[str, Any]]] = {}
|
|
|
|
|
|
|
+ # Pre-compute embeddings concurrently (sync boundary handles async internally)
|
|
|
embedding_cache: Dict[str, list[float] | None] = {}
|
|
embedding_cache: Dict[str, list[float] | None] = {}
|
|
|
|
|
+ if NEWS_EMBEDDINGS_ENABLED:
|
|
|
|
|
+ embedding_cache = _compute_embeddings_sync(articles)
|
|
|
|
|
|
|
|
- def _embedding_for_text(text: str) -> list[float] | None:
|
|
|
|
|
- if not NEWS_EMBEDDINGS_ENABLED or not text:
|
|
|
|
|
- return None
|
|
|
|
|
- if text in embedding_cache:
|
|
|
|
|
- return embedding_cache[text]
|
|
|
|
|
- emb = ollama_embed(text)
|
|
|
|
|
- # Cache None too so a single failure doesn't trigger repeated retries
|
|
|
|
|
- # within one ingestion cycle. The next refresh call clears this map.
|
|
|
|
|
- embedding_cache[text] = emb
|
|
|
|
|
- return emb
|
|
|
|
|
|
|
+ by_topic: Dict[str, List[Dict[str, Any]]] = {}
|
|
|
|
|
|
|
|
for a in articles:
|
|
for a in articles:
|
|
|
title = a.get("title") or ""
|
|
title = a.get("title") or ""
|
|
@@ -183,10 +210,8 @@ def dedup_and_cluster_articles(
|
|
|
continue
|
|
continue
|
|
|
topic = normalize_topic_from_title(title)
|
|
topic = normalize_topic_from_title(title)
|
|
|
article_text = _cluster_text(a)
|
|
article_text = _cluster_text(a)
|
|
|
- article_embedding = _embedding_for_text(article_text)
|
|
|
|
|
|
|
|
|
|
- # Attach embedding on the article dict so _signals() can read it
|
|
|
|
|
- # without re-computing.
|
|
|
|
|
|
|
+ article_embedding = embedding_cache.get(article_text) if NEWS_EMBEDDINGS_ENABLED else None
|
|
|
a_with_emb = dict(a)
|
|
a_with_emb = dict(a)
|
|
|
if article_embedding is not None:
|
|
if article_embedding is not None:
|
|
|
a_with_emb["_embedding"] = article_embedding
|
|
a_with_emb["_embedding"] = article_embedding
|
|
@@ -199,8 +224,6 @@ def dedup_and_cluster_articles(
|
|
|
best_signal_value = 0.0
|
|
best_signal_value = 0.0
|
|
|
for idx, c in enumerate(clusters):
|
|
for idx, c in enumerate(clusters):
|
|
|
sigs = _signals(a_with_emb, c)
|
|
sigs = _signals(a_with_emb, c)
|
|
|
- # Use the title threshold the caller explicitly passed (test override)
|
|
|
|
|
- # but otherwise rely on the module defaults.
|
|
|
|
|
local_match = False
|
|
local_match = False
|
|
|
if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
|
|
if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
|
|
|
local_match = True
|
|
local_match = True
|
|
@@ -211,11 +234,6 @@ def dedup_and_cluster_articles(
|
|
|
elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
|
|
elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
|
|
|
local_match = True
|
|
local_match = True
|
|
|
signal_name, signal_value = "jaccard", sigs["jaccard"]
|
|
signal_name, signal_value = "jaccard", sigs["jaccard"]
|
|
|
- # Consensus rule: when no single signal clears its strict threshold
|
|
|
|
|
- # but two of them are simultaneously "strong-ish", treat that as a
|
|
|
|
|
- # match. This catches reworded headlines whose embedding is just
|
|
|
|
|
- # below the strict cosine cutoff. Numbers are intentionally
|
|
|
|
|
- # conservative — both signals must be clearly above noise.
|
|
|
|
|
elif (
|
|
elif (
|
|
|
NEWS_EMBEDDINGS_ENABLED
|
|
NEWS_EMBEDDINGS_ENABLED
|
|
|
and sigs["cosine"] >= 0.80
|
|
and sigs["cosine"] >= 0.80
|
|
@@ -240,13 +258,10 @@ def dedup_and_cluster_articles(
|
|
|
if a.get("source") and a["source"] not in c["sources"]:
|
|
if a.get("source") and a["source"] not in c["sources"]:
|
|
|
c["sources"].append(a["source"])
|
|
c["sources"].append(a["source"])
|
|
|
c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
|
|
c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
|
|
|
- # Keep a tiny audit trail per cluster on which signal grew it last.
|
|
|
|
|
- # Not surfaced through tools — lives in the payload only for debug.
|
|
|
|
|
c.setdefault("_merge_signals", []).append(
|
|
c.setdefault("_merge_signals", []).append(
|
|
|
{"signal": best_signal_name, "value": round(best_signal_value, 3)}
|
|
{"signal": best_signal_name, "value": round(best_signal_value, 3)}
|
|
|
)
|
|
)
|
|
|
else:
|
|
else:
|
|
|
- # Stable cluster id: based on topic + normalized canonical title.
|
|
|
|
|
key = f"{topic}|{_normalize_title(title)}"
|
|
key = f"{topic}|{_normalize_title(title)}"
|
|
|
cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
|
|
cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
|
|
|
cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
|
|
cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
|
|
@@ -269,8 +284,7 @@ def dedup_and_cluster_articles(
|
|
|
}
|
|
}
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # Strip the internal merge audit trail before returning so it does not
|
|
|
|
|
- # accidentally bloat the SQLite payload. Storage layer doesn't filter it.
|
|
|
|
|
|
|
+ # Strip the internal merge audit trail before returning
|
|
|
for clusters in by_topic.values():
|
|
for clusters in by_topic.values():
|
|
|
for c in clusters:
|
|
for c in clusters:
|
|
|
c.pop("_merge_signals", None)
|
|
c.pop("_merge_signals", None)
|