пре 1 месец · 1e5f9c6936
--- a/OUTLOOK.md
+++ b/OUTLOOK.md
@@ -392,16 +392,31 @@ But only if you:
 
				 
			
 
				 # ✅ Completed since this outlook was written
			
 
				 
			
 
				+* v0.1.0 released and tagged
			
 
				 * provider-agnostic LLM extraction/summarization layer added
			
 
				 * prompts moved into separate files for easier updates
			
 
				 * entity blacklist implemented and made case-insensitive
			
 
				+* wildcard blacklist support added for entities/topics/keywords
			
 
				 * live extraction smoke test added
			
 
				+* JSON-backed alias map added for query normalization
			
 
				+* query normalization added so shorthand like `btc` and `trump` still works
			
 
				 * docs updated with the new env vars and workflow
			
 
				 
			
 
				 ---
			
 
				 
			
 
				 # 🔭 Next high-level steps
			
 
				 
			
 
				+## What is left of v0.1.0
			
 
				+
			
 
				+The first version is now effectively a usable baseline. The remaining work for v0.1.x is mostly polish:
			
 
				+
			
 
				+* stabilize extraction quality across a few more real-world samples
			
 
				+* expand the alias map only where usage demands it
			
 
				+* tune emerging-topic noise so repeated source names do not dominate
			
 
				+* keep sentiment labels aligned with scores as the model improves
			
 
				+
			
 
				+## Where v0.2.0 should lead
			
 
				+
			
 
				 1. **Normalization layer**
			
 
				 
			
 
				    * canonicalize acronyms and entity variants before storage / querying
			
@@ -422,3 +437,14 @@ But only if you:
 
				    * track how important entities evolve over time
			
 
				    * allow replay of when entities first appeared, how topics shifted, and how sentiment changed
			
 
				    * useful later for narrative reconstruction and trend timelines
			
 
				+
			
 
				+## Longer-term direction
			
 
				+
			
 
				+The endgame is not just “news search”, but a light narrative memory system:
			
 
				+
			
 
				+* entity histories over time
			
 
				+* topic shifts and turning points
			
 
				+* sentiment arcs
			
 
				+* replayable timelines for a person, company, or event
			
 
				+
			
 
				+That should stay in mind while keeping the current implementation simple.
			
--- a/news_mcp/enrichment/llm_enrich.py
+++ b/news_mcp/enrichment/llm_enrich.py
@@ -6,6 +6,7 @@ from typing import Any, Dict
 
				 from news_mcp.config import NEWS_ENTITY_BLACKLIST
			
 
				 from news_mcp.entity_normalize import normalize_entities
			
 
				 from news_mcp.llm import call_extraction, call_summary
			
 
				+from news_mcp.trends_resolution import resolve_entity_via_trends
			
 
				 
			
 
				 
			
 
				 def _matches_blacklist(value: str, blacklist=None) -> bool:
			
@@ -31,12 +32,16 @@ async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
 
				     topic = parsed.get("topic", cluster.get("topic"))
			
 
				     if topic and _matches_blacklist(topic):
			
 
				         topic = "other"
			
 
				+    entities = normalize_entities(_filter_entities(parsed.get("entities", [])))
			
 
				+    keywords = normalize_entities(_filter_entities(parsed.get("keywords", [])))
			
 
				+
			
 
				     out.update({
			
 
				         "topic": topic,
			
 
				-        "entities": normalize_entities(_filter_entities(parsed.get("entities", []))),
			
 
				+        "entities": entities,
			
 
				+        "entityResolutions": [resolve_entity_via_trends(e) for e in entities],
			
 
				         "sentiment": parsed.get("sentiment", "neutral"),
			
 
				         "sentimentScore": parsed.get("sentimentScore"),
			
 
				-        "keywords": normalize_entities(_filter_entities(parsed.get("keywords", []))),
			
 
				+        "keywords": keywords,
			
 
				     })
			
 
				     return out
			
 
				 
			
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -9,6 +9,7 @@ from news_mcp.config import NEWS_REFRESH_INTERVAL_SECONDS, NEWS_BACKGROUND_REFRE
 
				 from news_mcp.jobs.poller import refresh_clusters
			
 
				 from news_mcp.storage.sqlite_store import SQLiteClusterStore
			
 
				 from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
			
 
				+from news_mcp.trends_resolution import resolve_entity_via_trends
			
 
				 from news_mcp.llm import active_llm_config
			
 
				 from news_mcp.entity_normalize import normalize_query
			
 
				 from collections import Counter
			
@@ -21,22 +22,58 @@ mcp = FastMCP(
 
				 )
			
 
				 
			
 
				 
			
 
				+def _cluster_entity_haystack(cluster: dict) -> list[str]:
			
 
				+    """Collect the normalized entity clues attached to a cluster."""
			
 
				+    values: list[str] = []
			
 
				+    for ent in cluster.get("entities", []) or []:
			
 
				+        values.append(str(ent).strip().lower())
			
 
				+    for res in cluster.get("entityResolutions", []) or []:
			
 
				+        if not isinstance(res, dict):
			
 
				+            continue
			
 
				+        for key in ("normalized", "canonical_label", "mid"):
			
 
				+            val = res.get(key)
			
 
				+            if val:
			
 
				+                values.append(str(val).strip().lower())
			
 
				+    return [v for v in values if v]
			
 
				+
			
 
				+
			
 
				 @mcp.tool(description="What is happening right now? Return the latest deduplicated news clusters for a topic.")
			
 
				 async def get_latest_events(topic: str = "crypto", limit: int = 5):
			
 
				     limit = max(1, min(int(limit), 20))
			
 
				-    # In v1, `topic` is a coarse category. If the caller passes an entity name
			
 
				-    # (e.g. "trump"/"iran"), gracefully fall back to `other`.
			
 
				+    # If the caller passes an entity-like value, resolve it and use the canonical
			
 
				+    # entity as the query lens. Otherwise keep the original topic path.
			
 
				     topic_norm = normalize_query(topic).lower()
			
 
				+    resolved = resolve_entity_via_trends(topic_norm)
			
 
				     allowed = {t.lower() for t in DEFAULT_TOPICS}
			
 
				-    if topic_norm not in allowed:
			
 
				-        topic_norm = "other"
			
 
				+    is_topic = topic_norm in allowed
			
 
				+    query_terms = {
			
 
				+        topic_norm,
			
 
				+        str(resolved.get("normalized") or "").strip().lower(),
			
 
				+        str(resolved.get("canonical_label") or "").strip().lower(),
			
 
				+        str(resolved.get("mid") or "").strip().lower(),
			
 
				+    }
			
 
				+    query_terms = {q for q in query_terms if q}
			
 
				+
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				 
			
 
				-    # Cache-first: only refresh if we currently have no fresh clusters for this topic.
			
 
				-    clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
			
 
				-    if not clusters:
			
 
				-        await refresh_clusters(topic=topic_norm, limit=200)
			
 
				+    if is_topic:
			
 
				+        # Cache-first: only refresh if we currently have no fresh clusters for this topic.
			
 
				         clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
			
 
				+        if not clusters:
			
 
				+            await refresh_clusters(topic=topic_norm, limit=200)
			
 
				+            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
			
 
				+    else:
			
 
				+        # Entity-aware mode: search recent clusters across all topics and match by
			
 
				+        # raw entity, canonical label, or MID.
			
 
				+        clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 8)
			
 
				+        filtered = []
			
 
				+        for c in clusters:
			
 
				+            haystack = _cluster_entity_haystack(c)
			
 
				+            if any(any(term in item for item in haystack) for term in query_terms):
			
 
				+                filtered.append(c)
			
 
				+            if len(filtered) >= limit:
			
 
				+                break
			
 
				+        clusters = filtered
			
 
				 
			
 
				     # Ensure the response is compact and agent-friendly.
			
 
				     clusters_sorted = sorted(clusters, key=lambda x: float(x.get("importance", 0.0)), reverse=True)
			
@@ -66,14 +103,23 @@ async def get_events_for_entity(entity: str, limit: int = 10):
 
				     if not query:
			
 
				         return []
			
 
				 
			
 
				+    resolved = resolve_entity_via_trends(query)
			
 
				+    query_terms = {
			
 
				+        query,
			
 
				+        str(resolved.get("normalized") or "").strip().lower(),
			
 
				+        str(resolved.get("canonical_label") or "").strip().lower(),
			
 
				+        str(resolved.get("mid") or "").strip().lower(),
			
 
				+    }
			
 
				+    query_terms = {q for q in query_terms if q}
			
 
				+
			
 
				     # Cache-first: search recent clusters across all topics.
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				     clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 5)
			
 
				 
			
 
				     hits = []
			
 
				     for c in clusters:
			
 
				-        ents = c.get("entities") or []
			
 
				-        if any(query in str(e).lower() for e in ents):
			
 
				+        haystack = _cluster_entity_haystack(c)
			
 
				+        if any(any(term in item for item in haystack) for term in query_terms):
			
 
				             hits.append(c)
			
 
				         if len(hits) >= limit:
			
 
				             break
			
@@ -191,6 +237,14 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				 
			
 
				     ent = normalize_query(entity).strip().lower()
			
 
				+    resolved = resolve_entity_via_trends(ent)
			
 
				+    query_terms = {
			
 
				+        ent,
			
 
				+        str(resolved.get("normalized") or "").strip().lower(),
			
 
				+        str(resolved.get("canonical_label") or "").strip().lower(),
			
 
				+        str(resolved.get("mid") or "").strip().lower(),
			
 
				+    }
			
 
				+    query_terms = {q for q in query_terms if q}
			
 
				     if not ent:
			
 
				         return {
			
 
				             "entity": entity,
			
@@ -210,8 +264,8 @@ async def get_news_sentiment(entity: str, timeframe: str = "24h"):
 
				     clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=500)
			
 
				     matched = []
			
 
				     for c in clusters:
			
 
				-        ents = c.get("entities") or []
			
 
				-        if any(ent in str(e).lower() for e in ents):
			
 
				+        haystack = _cluster_entity_haystack(c)
			
 
				+        if any(any(term in item for item in haystack) for term in query_terms):
			
 
				             matched.append(c)
			
 
				 
			
 
				     if not matched:
			
--- a/news_mcp/sources/news_feeds.py
+++ b/news_mcp/sources/news_feeds.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
				 
			
 
				 import hashlib
			
 
				 import logging
			
 
				+import re
			
 
				 from typing import Any, Dict, List
			
 
				 
			
 
				 import feedparser
			
@@ -17,6 +18,15 @@ def _canonical_url(url: str) -> str:
 
				     return url.strip()
			
 
				 
			
 
				 
			
 
				+def _strip_html(text: str) -> str:
			
 
				+    """Remove obvious HTML so downstream summaries stay readable."""
			
 
				+    text = re.sub(r"<script.*?</script>", "", text, flags=re.I | re.S)
			
 
				+    text = re.sub(r"<style.*?</style>", "", text, flags=re.I | re.S)
			
 
				+    text = re.sub(r"<[^>]+>", " ", text)
			
 
				+    text = re.sub(r"\s+", " ", text)
			
 
				+    return text.strip()
			
 
				+
			
 
				+
			
 
				 def _feed_urls() -> List[str]:
			
 
				     urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
			
 
				     if not urls:
			
@@ -44,7 +54,7 @@ def fetch_news_articles(limit: int = 50) -> List[Dict[str, Any]]:
 
				             title = str(getattr(entry, "title", "")).strip()
			
 
				             url = _canonical_url(str(getattr(entry, "link", "")).strip())
			
 
				             timestamp = str(getattr(entry, "published", "")) or str(getattr(entry, "updated", ""))
			
 
				-            summary = str(getattr(entry, "summary", "")) or str(getattr(entry, "description", ""))
			
 
				+            summary = _strip_html(str(getattr(entry, "summary", "")) or str(getattr(entry, "description", "")))
			
 
				 
			
 
				             if not title or not url:
			
 
				                 continue
			
--- a/news_mcp/trends_resolution.py
+++ b/news_mcp/trends_resolution.py
@@ -0,0 +1,64 @@
 
				+from __future__ import annotations
			
 
				+
			
 
				+import json
			
 
				+import os
			
 
				+import subprocess
			
 
				+from functools import lru_cache
			
 
				+from typing import Any
			
 
				+
			
 
				+from news_mcp.entity_normalize import normalize_entity
			
 
				+
			
 
				+
			
 
				+@lru_cache(maxsize=1024)
			
 
				+def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
			
 
				+    """Resolve a normalized entity through trends-mcp, falling back cleanly.
			
 
				+
			
 
				+    The input is normalized first using the same local normalization rules used
			
 
				+    everywhere else in news-mcp, so query and storage paths stay aligned.
			
 
				+    """
			
 
				+    normalized = normalize_entity(entity)
			
 
				+    if not normalized:
			
 
				+        return {
			
 
				+            "raw": entity,
			
 
				+            "normalized": "",
			
 
				+            "canonical_label": "",
			
 
				+            "mid": None,
			
 
				+            "type": None,
			
 
				+            "source": "empty",
			
 
				+        }
			
 
				+
			
 
				+    config = os.getenv("MCPORTER_CONFIG", os.path.expanduser("~/.openclaw/workspace/config/mcporter.json"))
			
 
				+    command = [
			
 
				+        "mcporter",
			
 
				+        "--config",
			
 
				+        config,
			
 
				+        "call",
			
 
				+        "trends.resolve_entity",
			
 
				+        f"keyword={normalized}",
			
 
				+    ]
			
 
				+
			
 
				+    try:
			
 
				+        proc = subprocess.run(command, capture_output=True, text=True, timeout=20, check=False)
			
 
				+        if proc.returncode == 0 and proc.stdout.strip():
			
 
				+            payload = json.loads(proc.stdout)
			
 
				+            return {
			
 
				+                "raw": entity,
			
 
				+                "normalized": normalized,
			
 
				+                "canonical_label": payload.get("canonical_label") or normalized,
			
 
				+                "mid": payload.get("mid"),
			
 
				+                "type": payload.get("type"),
			
 
				+                "candidates": payload.get("candidates", []),
			
 
				+                "source": "trends-mcp",
			
 
				+            }
			
 
				+    except Exception:
			
 
				+        pass
			
 
				+
			
 
				+    # Conservative fallback: keep the local normalized form and leave MID unset.
			
 
				+    return {
			
 
				+        "raw": entity,
			
 
				+        "normalized": normalized,
			
 
				+        "canonical_label": normalized,
			
 
				+        "mid": None,
			
 
				+        "type": None,
			
 
				+        "source": "fallback",
			
 
				+    }