1 vecka sedan · 1724023a6e
--- a/news_mcp/dashboard/dashboard_store.py
+++ b/news_mcp/dashboard/dashboard_store.py
@@ -10,6 +10,7 @@ from news_mcp.config import (
 
															     NEWS_PRUNING_ENABLED,
														
 
															     NEWS_REFRESH_INTERVAL_SECONDS,
														
 
															     NEWS_RETENTION_DAYS,
														
 
															+    DEFAULT_TOPICS,
														
 
															 )
														
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
@@ -320,6 +321,7 @@ class DashboardStore:
 
															             return dt.astimezone(timezone.utc)
														
 
															         counter: dict[str, int] = {}
														
 
															+        _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
														
 
															         for (payload_text,) in rows:
														
 
															             c = json.loads(payload_text)
														
 
															             dt = _parse_ts(c.get("timestamp"))
														
@@ -333,6 +335,10 @@ class DashboardStore:
 
															                 kw_str = str(kw).strip()
														
 
															                 if not kw_str:
														
 
															                     continue
														
 
															+                # Skip topic labels (crypto, macro, regulation, ai, other)
														
 
															+                # that the LLM sometimes returns as keywords.
														
 
															+                if kw_str.lower() in _topic_labels:
														
 
															+                    continue
														
 
															                 # Skip keywords that are already entities in this cluster
														
 
															                 if kw_str.lower() in ents_in_cluster:
														
 
															                     continue
														
--- a/news_mcp/enrichment/llm_enrich.py
+++ b/news_mcp/enrichment/llm_enrich.py
@@ -48,6 +48,14 @@ async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
 
															     entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
														
 
															     keywords = _filter_entities(normalize_entities(parsed.get("keywords", [])))
														
 
															+    # Filter out topic labels from keywords. The LLM often returns the
														
 
															+    # topic (e.g. "crypto", "macro", "regulation", "ai") as a keyword
														
 
															+    # since the prompt asks for "keywords that justify the classification".
														
 
															+    # These are already captured by the cluster topic field and should not
														
 
															+    # pollute keyword search/scoring/frequencies.
														
 
															+    _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
														
 
															+    keywords = [k for k in keywords if k.lower() not in _topic_labels]
														
 
															+
														
 
															     out.update({
														
 
															         "topic": topic,
														
 
															         "entities": entities,