Bläddra i källkod

fix: filter topic labels from keywords at extraction and in dashboard

The LLM sometimes returns topic labels (crypto, macro, regulation, ai,
other) as keywords, because the prompt asks for "keywords that justify
the classification". These labels pollute the keyword frequency view and
are redundant with the cluster topic field.

- classify_cluster_llm() filters out DEFAULT_TOPICS from keywords
- get_keyword_frequencies() repeats the filter as a safety net for
  clusters that were enriched before the extraction-side fix
Lukas Goldschmidt 1 vecka sedan
förälder
incheckning
1724023a6e
2 ändrade filer med 14 tillägg och 0 borttagningar
  1. 6 0
      news_mcp/dashboard/dashboard_store.py
  2. 8 0
      news_mcp/enrichment/llm_enrich.py

+ 6 - 0
news_mcp/dashboard/dashboard_store.py

@@ -10,6 +10,7 @@ from news_mcp.config import (
     NEWS_PRUNING_ENABLED,
     NEWS_PRUNING_ENABLED,
     NEWS_REFRESH_INTERVAL_SECONDS,
     NEWS_REFRESH_INTERVAL_SECONDS,
     NEWS_RETENTION_DAYS,
     NEWS_RETENTION_DAYS,
+    DEFAULT_TOPICS,
 )
 )
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
 
@@ -320,6 +321,7 @@ class DashboardStore:
             return dt.astimezone(timezone.utc)
             return dt.astimezone(timezone.utc)
 
 
         counter: dict[str, int] = {}
         counter: dict[str, int] = {}
+        _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
         for (payload_text,) in rows:
         for (payload_text,) in rows:
             c = json.loads(payload_text)
             c = json.loads(payload_text)
             dt = _parse_ts(c.get("timestamp"))
             dt = _parse_ts(c.get("timestamp"))
@@ -333,6 +335,10 @@ class DashboardStore:
                 kw_str = str(kw).strip()
                 kw_str = str(kw).strip()
                 if not kw_str:
                 if not kw_str:
                     continue
                     continue
+                # Skip topic labels (crypto, macro, regulation, ai, other)
+                # that the LLM sometimes returns as keywords.
+                if kw_str.lower() in _topic_labels:
+                    continue
                 # Skip keywords that are already entities in this cluster
                 # Skip keywords that are already entities in this cluster
                 if kw_str.lower() in ents_in_cluster:
                 if kw_str.lower() in ents_in_cluster:
                     continue
                     continue

+ 8 - 0
news_mcp/enrichment/llm_enrich.py

@@ -48,6 +48,14 @@ async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
     entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
     keywords = _filter_entities(normalize_entities(parsed.get("keywords", [])))
     keywords = _filter_entities(normalize_entities(parsed.get("keywords", [])))
 
 
+    # Filter out topic labels from keywords. The LLM often returns the
+    # topic (e.g. "crypto", "macro", "regulation", "ai") as a keyword
+    # since the prompt asks for "keywords that justify the classification".
+    # These are already captured by the cluster topic field and should not
+    # pollute keyword search/scoring/frequencies.
+    _topic_labels = {t.lower() for t in DEFAULT_TOPICS}
+    keywords = [k for k in keywords if k.lower() not in _topic_labels]
+
     out.update({
     out.update({
         "topic": topic,
         "topic": topic,
         "entities": entities,
         "entities": entities,