1 hete · 1724023a6e
--- a/news_mcp/dashboard/dashboard_store.py
+++ b/news_mcp/dashboard/dashboard_store.py
@@ -10,6 +10,7 @@ from news_mcp.config import (
 
				     NEWS_PRUNING_ENABLED,
			
 
				     NEWS_REFRESH_INTERVAL_SECONDS,
			
 
				     NEWS_RETENTION_DAYS,
			
 
				+    DEFAULT_TOPICS,
			
 
				 )
			
 
				 from news_mcp.storage.sqlite_store import SQLiteClusterStore
			
 
				 
			
@@ -320,6 +321,7 @@ class DashboardStore:
 
				             return dt.astimezone(timezone.utc)
			
 
				 
			
 
				         counter: dict[str, int] = {}
			
 
				+        _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
			
 
				         for (payload_text,) in rows:
			
 
				             c = json.loads(payload_text)
			
 
				             dt = _parse_ts(c.get("timestamp"))
			
@@ -333,6 +335,10 @@ class DashboardStore:
 
				                 kw_str = str(kw).strip()
			
 
				                 if not kw_str:
			
 
				                     continue
			
 
				+                # Skip topic labels from DEFAULT_TOPICS (e.g. crypto, macro, regulation, ai)
			
 
				+                # that the LLM sometimes returns as keywords.
			
 
				+                if kw_str.lower() in _topic_labels:
			
 
				+                    continue
			
 
				                 # Skip keywords that are already entities in this cluster
			
 
				                 if kw_str.lower() in ents_in_cluster:
			
 
				                     continue
			
--- a/news_mcp/enrichment/llm_enrich.py
+++ b/news_mcp/enrichment/llm_enrich.py
@@ -48,6 +48,14 @@ async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
 
				     entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
			
 
				     keywords = _filter_entities(normalize_entities(parsed.get("keywords", [])))
			
 
				 
			
 
				+    # Filter out topic labels from keywords. The LLM often returns the
			
 
				+    # topic (e.g. "crypto", "macro", "regulation", "ai") as a keyword
			
 
				+    # since the prompt asks for "keywords that justify the classification".
			
 
				+    # These are already captured by the cluster topic field and should not
			
 
				+    # pollute keyword search/scoring/frequencies.
			
 
				+    _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
			
 
				+    keywords = [k for k in keywords if k.lower() not in _topic_labels]
			
 
				+
			
 
				     out.update({
			
 
				         "topic": topic,
			
 
				         "entities": entities,