Bladeren bron

Clean stored news summaries of HTML to match ingestion

Lukas Goldschmidt 1 maand geleden
bovenliggende
commit
1e5f9c6936
5 gewijzigde bestanden met toevoegingen van 174 en 15 verwijderingen
  1. 26 0
      OUTLOOK.md
  2. 7 2
      news_mcp/enrichment/llm_enrich.py
  3. 66 12
      news_mcp/mcp_server_fastmcp.py
  4. 11 1
      news_mcp/sources/news_feeds.py
  5. 64 0
      news_mcp/trends_resolution.py

+ 26 - 0
OUTLOOK.md

@@ -392,16 +392,31 @@ But only if you:
 
 # ✅ Completed since this outlook was written
 
+* v0.1.0 released and tagged
 * provider-agnostic LLM extraction/summarization layer added
 * prompts moved into separate files for easier updates
 * entity blacklist implemented and made case-insensitive
+* wildcard blacklist support added for entities/topics/keywords
 * live extraction smoke test added
+* JSON-backed alias map added for query normalization
+* query normalization added so shorthand like `btc` and `trump` still works
 * docs updated with the new env vars and workflow
 
 ---
 
 # 🔭 Next high-level steps
 
+## What is left of v0.1.0
+
+The first version is now effectively a usable baseline. The remaining work for v0.1.x is mostly polish:
+
+* stabilize extraction quality across a few more real-world samples
+* expand the alias map only where usage demands it
+* tune emerging-topic noise so repeated source names do not dominate
+* keep sentiment labels aligned with scores as the model improves
+
+## Where v0.2.0 should lead
+
 1. **Normalization layer**
 
    * canonicalize acronyms and entity variants before storage / querying
@@ -422,3 +437,14 @@ But only if you:
    * track how important entities evolve over time
    * allow replay of when entities first appeared, how topics shifted, and how sentiment changed
    * useful later for narrative reconstruction and trend timelines
+
+## Longer-term direction
+
+The endgame is not just “news search”, but a lightweight narrative memory system:
+
+* entity histories over time
+* topic shifts and turning points
+* sentiment arcs
+* replayable timelines for a person, company, or event
+
+That should be kept in mind while keeping the current implementation simple.

+ 7 - 2
news_mcp/enrichment/llm_enrich.py

@@ -6,6 +6,7 @@ from typing import Any, Dict
 from news_mcp.config import NEWS_ENTITY_BLACKLIST
 from news_mcp.entity_normalize import normalize_entities
 from news_mcp.llm import call_extraction, call_summary
+from news_mcp.trends_resolution import resolve_entity_via_trends
 
 
 def _matches_blacklist(value: str, blacklist=None) -> bool:
@@ -31,12 +32,16 @@ async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
     topic = parsed.get("topic", cluster.get("topic"))
     if topic and _matches_blacklist(topic):
         topic = "other"
+    entities = normalize_entities(_filter_entities(parsed.get("entities", [])))
+    keywords = normalize_entities(_filter_entities(parsed.get("keywords", [])))
+
     out.update({
         "topic": topic,
-        "entities": normalize_entities(_filter_entities(parsed.get("entities", []))),
+        "entities": entities,
+        "entityResolutions": [resolve_entity_via_trends(e) for e in entities],
         "sentiment": parsed.get("sentiment", "neutral"),
         "sentimentScore": parsed.get("sentimentScore"),
-        "keywords": normalize_entities(_filter_entities(parsed.get("keywords", []))),
+        "keywords": keywords,
     })
     return out
 

+ 66 - 12
news_mcp/mcp_server_fastmcp.py

@@ -9,6 +9,7 @@ from news_mcp.config import NEWS_REFRESH_INTERVAL_SECONDS, NEWS_BACKGROUND_REFRE
 from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
+from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.llm import active_llm_config
 from news_mcp.entity_normalize import normalize_query
 from collections import Counter
@@ -21,22 +22,58 @@ mcp = FastMCP(
 )
 
 
+def _cluster_entity_haystack(cluster: dict) -> list[str]:
+    """Collect the normalized entity clues attached to a cluster."""
+    values: list[str] = []
+    for ent in cluster.get("entities", []) or []:
+        values.append(str(ent).strip().lower())
+    for res in cluster.get("entityResolutions", []) or []:
+        if not isinstance(res, dict):
+            continue
+        for key in ("normalized", "canonical_label", "mid"):
+            val = res.get(key)
+            if val:
+                values.append(str(val).strip().lower())
+    return [v for v in values if v]
+
+
 @mcp.tool(description="What is happening right now? Return the latest deduplicated news clusters for a topic.")
 async def get_latest_events(topic: str = "crypto", limit: int = 5):
     limit = max(1, min(int(limit), 20))
-    # In v1, `topic` is a coarse category. If the caller passes an entity name
-    # (e.g. "trump"/"iran"), gracefully fall back to `other`.
+    # If the caller passes an entity-like value, resolve it and use the canonical
+    # entity as the query lens. Otherwise keep the original topic path.
     topic_norm = normalize_query(topic).lower()
+    resolved = resolve_entity_via_trends(topic_norm)
     allowed = {t.lower() for t in DEFAULT_TOPICS}
-    if topic_norm not in allowed:
-        topic_norm = "other"
+    is_topic = topic_norm in allowed
+    query_terms = {
+        topic_norm,
+        str(resolved.get("normalized") or "").strip().lower(),
+        str(resolved.get("canonical_label") or "").strip().lower(),
+        str(resolved.get("mid") or "").strip().lower(),
+    }
+    query_terms = {q for q in query_terms if q}
+
     store = SQLiteClusterStore(DB_PATH)
 
-    # Cache-first: only refresh if we currently have no fresh clusters for this topic.
-    clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
-    if not clusters:
-        await refresh_clusters(topic=topic_norm, limit=200)
+    if is_topic:
+        # Cache-first: only refresh if we currently have no fresh clusters for this topic.
         clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+        if not clusters:
+            await refresh_clusters(topic=topic_norm, limit=200)
+            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+    else:
+        # Entity-aware mode: search recent clusters across all topics and match by
+        # raw entity, canonical label, or MID.
+        clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 8)
+        filtered = []
+        for c in clusters:
+            haystack = _cluster_entity_haystack(c)
+            if any(any(term in item for item in haystack) for term in query_terms):
+                filtered.append(c)
+            if len(filtered) >= limit:
+                break
+        clusters = filtered
 
     # Ensure the response is compact and agent-friendly.
     clusters_sorted = sorted(clusters, key=lambda x: float(x.get("importance", 0.0)), reverse=True)
@@ -66,14 +103,23 @@ async def get_events_for_entity(entity: str, limit: int = 10):
     if not query:
         return []
 
+    resolved = resolve_entity_via_trends(query)
+    query_terms = {
+        query,
+        str(resolved.get("normalized") or "").strip().lower(),
+        str(resolved.get("canonical_label") or "").strip().lower(),
+        str(resolved.get("mid") or "").strip().lower(),
+    }
+    query_terms = {q for q in query_terms if q}
+
     # Cache-first: search recent clusters across all topics.
     store = SQLiteClusterStore(DB_PATH)
     clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 5)
 
     hits = []
     for c in clusters:
-        ents = c.get("entities") or []
-        if any(query in str(e).lower() for e in ents):
+        haystack = _cluster_entity_haystack(c)
+        if any(any(term in item for item in haystack) for term in query_terms):
             hits.append(c)
         if len(hits) >= limit:
             break
@@ -191,6 +237,14 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
     ent = normalize_query(entity).strip().lower()
+    resolved = resolve_entity_via_trends(ent)
+    query_terms = {
+        ent,
+        str(resolved.get("normalized") or "").strip().lower(),
+        str(resolved.get("canonical_label") or "").strip().lower(),
+        str(resolved.get("mid") or "").strip().lower(),
+    }
+    query_terms = {q for q in query_terms if q}
     if not ent:
         return {
             "entity": entity,
@@ -210,8 +264,8 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=500)
     matched = []
     for c in clusters:
-        ents = c.get("entities") or []
-        if any(ent in str(e).lower() for e in ents):
+        haystack = _cluster_entity_haystack(c)
+        if any(any(term in item for item in haystack) for term in query_terms):
             matched.append(c)
 
     if not matched:

+ 11 - 1
news_mcp/sources/news_feeds.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import hashlib
 import logging
+import re
 from typing import Any, Dict, List
 
 import feedparser
@@ -17,6 +18,15 @@ def _canonical_url(url: str) -> str:
     return url.strip()
 
 
+def _strip_html(text: str) -> str:
+    """Remove obvious HTML so downstream summaries stay readable."""
+    text = re.sub(r"<script.*?</script>", "", text, flags=re.I | re.S)
+    text = re.sub(r"<style.*?</style>", "", text, flags=re.I | re.S)
+    text = re.sub(r"<[^>]+>", " ", text)
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+
 def _feed_urls() -> List[str]:
     urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
     if not urls:
@@ -44,7 +54,7 @@ def fetch_news_articles(limit: int = 50) -> List[Dict[str, Any]]:
             title = str(getattr(entry, "title", "")).strip()
             url = _canonical_url(str(getattr(entry, "link", "")).strip())
             timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
-            summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
+            summary = _strip_html(str(getattr(entry, "summary", "")) or str(getattr(entry, "description", "")))
 
             if not title or not url:
                 continue

+ 64 - 0
news_mcp/trends_resolution.py

@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+from functools import lru_cache
+from typing import Any
+
+from news_mcp.entity_normalize import normalize_entity
+
+
+@lru_cache(maxsize=1024)
+def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
+    """Resolve a normalized entity through trends-mcp, falling back cleanly.
+
+    The input is normalized first using the same local normalization rules used
+    everywhere else in news-mcp, so query and storage paths stay aligned.
+    """
+    normalized = normalize_entity(entity)
+    if not normalized:
+        return {
+            "raw": entity,
+            "normalized": "",
+            "canonical_label": "",
+            "mid": None,
+            "type": None,
+            "source": "empty",
+        }
+
+    config = os.getenv("MCPORTER_CONFIG", os.path.expanduser("~/.openclaw/workspace/config/mcporter.json"))
+    command = [
+        "mcporter",
+        "--config",
+        config,
+        "call",
+        "trends.resolve_entity",
+        f"keyword={normalized}",
+    ]
+
+    try:
+        proc = subprocess.run(command, capture_output=True, text=True, timeout=20, check=False)
+        if proc.returncode == 0 and proc.stdout.strip():
+            payload = json.loads(proc.stdout)
+            return {
+                "raw": entity,
+                "normalized": normalized,
+                "canonical_label": payload.get("canonical_label") or normalized,
+                "mid": payload.get("mid"),
+                "type": payload.get("type"),
+                "candidates": payload.get("candidates", []),
+                "source": "trends-mcp",
+            }
+    except Exception:
+        pass
+
+    # Conservative fallback: keep the local normalized form and leave MID unset.
+    return {
+        "raw": entity,
+        "normalized": normalized,
+        "canonical_label": normalized,
+        "mid": None,
+        "type": None,
+        "source": "fallback",
+    }