Forráskód Böngészése

fix: filter topic labels from keywords at extraction and in dashboard

The LLM sometimes returns topic labels (crypto, macro, regulation, ai,
other) as keywords since the prompt asks for "keywords that justify the
classification". These pollute the keyword frequency view and are
redundant with the cluster topic field.

- classify_cluster_llm() filters out keywords that match DEFAULT_TOPICS
  (case-insensitive) at extraction time
- get_keyword_frequencies() repeats the same case-insensitive filter as a
  safety net for clusters that were enriched before the extraction-side fix
Lukas Goldschmidt 1 hete
szülő
commit
1724023a6e

+ 6 - 0
news_mcp/dashboard/dashboard_store.py

@@ -10,6 +10,7 @@ from news_mcp.config import (
     NEWS_PRUNING_ENABLED,
     NEWS_REFRESH_INTERVAL_SECONDS,
     NEWS_RETENTION_DAYS,
+    DEFAULT_TOPICS,
 )
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
@@ -320,6 +321,7 @@ class DashboardStore:
             return dt.astimezone(timezone.utc)
 
         counter: dict[str, int] = {}
+        _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
         for (payload_text,) in rows:
             c = json.loads(payload_text)
             dt = _parse_ts(c.get("timestamp"))
@@ -333,6 +335,10 @@ class DashboardStore:
                 kw_str = str(kw).strip()
                 if not kw_str:
                     continue
+                # Skip topic labels (crypto, macro, regulation, ai, other)
+                # that the LLM sometimes returns as keywords.
+                if kw_str.lower() in _topic_labels:
+                    continue
                 # Skip keywords that are already entities in this cluster
                 if kw_str.lower() in ents_in_cluster:
                     continue

+ 8 - 0
news_mcp/enrichment/llm_enrich.py

@@ -48,6 +48,14 @@ async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
     keywords = _filter_entities(normalize_entities(parsed.get("keywords", [])))
 
+    # Filter out topic labels from keywords. The LLM often returns the
+    # topic (e.g. "crypto", "macro", "regulation", "ai") as a keyword
+    # since the prompt asks for "keywords that justify the classification".
+    # These are already captured by the cluster topic field and should not
+    # pollute keyword search/scoring/frequencies.
+    _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
+    keywords = [k for k in keywords if k.lower() not in _topic_labels]
+
     out.update({
         "topic": topic,
         "entities": entities,