1 Minggu lalu · 8ee480b872
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
				 
			
 
				 import asyncio
			
 
				 import hashlib
			
 
				+import json
			
 
				 import logging
			
 
				 
			
 
				 import math
			
@@ -175,10 +176,10 @@ NEWS_TOOL_CARDS = [
 
				         "Produce a concise LLM-written explanation for one cluster and key facts.",
			
 
				         [
			
 
				             {"name": "event_id", "type": "string", "meaning": "cluster_id; do not surface in user-facing prose"},
			
 
				-            {"name": "include_articles", "type": "boolean", "default": False},
			
 
				+            {"name": "include_articles", "type": "boolean", "default": True},
			
 
				         ],
			
 
				-        ["headline", "mergedSummary", "keyFacts", "sources", "articles?"],
			
 
				-        ["Prefer this after you have already chosen a specific cluster to explain."],
			
 
				+        ["headline", "mergedSummary", "keyFacts", "sources", "entities", "keywords", "related_entities", "related_keywords", "topic", "sentiment", "importance", "articles"],
			
 
				+        ["Rich cluster drill-down. Returns LLM summary + cluster metadata + articles. Defaults to include articles."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "detect_emerging_topics",
			
@@ -189,7 +190,7 @@ NEWS_TOOL_CARDS = [
 
				             {"name": "topic", "type": "string", "default": "all topics", "examples": ["crypto", "macro", "regulation", "ai", "other"]},
			
 
				             {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
			
 
				         ],
			
 
				-        ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
			
 
				+        ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "related_keywords", "signal_type"],
			
 
				         ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Signal types: entity (named entity), keyword (thematic descriptor), phrase (headline bigram). Check velocity and source_count to distinguish real spikes from noise."],
			
 
				     ),
			
 
				     _tool_card(
			
@@ -263,8 +264,10 @@ NEWS_AGENT_TIPS = [
 
				     "When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
			
 
				     "Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
			
 
				     "For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
			
 
				-    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases.",
			
 
				+    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases. Each result also includes related_keywords for thematic context.",
			
 
				+    "get_event_summary returns a rich result: headline, mergedSummary, keyFacts, entities, keywords, related_entities, related_keywords, topic, sentiment, importance, and articles (included by default). Use it for full cluster drill-down.",
			
 
				     "Each cluster contains both entities (named entities with identity resolution) and keywords (thematic descriptors). Use keywords to understand what a story is about beyond the named entities.",
			
 
				+    "Use detect_emerging_topics with multiple timeframes (e.g. 4h vs 3d) and compare results to distinguish what's hot right now vs what's persistently trending. related_keywords help identify thematic neighborhoods.",
			
 
				 ]
			
 
				 
			
 
				 
			
@@ -305,6 +308,16 @@ NEWS_EXAMPLE_CHAINS = [
 
				             "get_events_for_entity(entity=...) on the top emerging neighbor",
			
 
				         ],
			
 
				     },
			
 
				+    {
			
 
				+        "task": "Full investigation pipeline",
			
 
				+        "chain": [
			
 
				+            "detect_emerging_topics(limit=20, timeframe=\"3d\")",
			
 
				+            "pick an emerging entity/keyword and note its related_entities and related_keywords",
			
 
				+            "get_event_summary(event_id=...) on the top cluster for full context including articles",
			
 
				+            "get_news_sentiment(entity=...) to gauge tone around the emerging topic",
			
 
				+            "detect_emerging_topics(around=<entity>, timeframe=\"4h\") to scout its neighborhood",
			
 
				+        ],
			
 
				+    },
			
 
				 ]
			
 
				 
			
 
				 
			
@@ -470,28 +483,48 @@ async def get_related_recent_entities(subject: str, timeframe: str = "72h", limi
 
				     return result
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts.")
			
 
				-async def get_event_summary(event_id: str, include_articles: bool = False):
			
 
				+@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts, "
			
 
				+           "entities, keywords, related entities and keywords, sentiment, importance, and articles.")
			
 
				+async def get_event_summary(event_id: str, include_articles: bool = True):
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				 
			
 
				     # Summary cache: reuse if present within TTL.
			
 
				+    cluster = store.get_cluster_by_id(event_id)
			
 
				+    if not cluster:
			
 
				+        return {
			
 
				+            "event_id": event_id,
			
 
				+            "error": "NOT_FOUND",
			
 
				+        }
			
 
				+
			
 
				     cached_summary = store.get_cluster_summary(
			
 
				         cluster_id=event_id,
			
 
				         ttl_hours=DEFAULT_LOOKBACK_HOURS,
			
 
				     )
			
 
				-    if cached_summary:
			
 
				-        out = {
			
 
				-            "event_id": event_id,
			
 
				-            "headline": cached_summary.get("headline"),
			
 
				-            "mergedSummary": cached_summary.get("mergedSummary"),
			
 
				-            "keyFacts": cached_summary.get("keyFacts", []),
			
 
				-            "sources": cached_summary.get("sources", []),
			
 
				-        }
			
 
				 
			
 
				+    def _enrich(base: dict, src_cluster: dict) -> dict:
			
 
				+        base["entities"] = src_cluster.get("entities", [])
			
 
				+        base["keywords"] = src_cluster.get("keywords", [])
			
 
				+        base["topic"] = src_cluster.get("topic", "other")
			
 
				+        base["sentiment"] = src_cluster.get("sentiment", "neutral")
			
 
				+        base["sentimentScore"] = src_cluster.get("sentimentScore")
			
 
				+        base["importance"] = src_cluster.get("importance", 0.0)
			
 
				+        # Related entities: from co-occurrence in this cluster's article set
			
 
				+        resolved = src_cluster.get("entityResolutions", []) or []
			
 
				+        related_ents = []
			
 
				+        seen_ents = {str(e).strip().lower() for e in (src_cluster.get("entities", []) or [])}
			
 
				+        for res in resolved:
			
 
				+            if isinstance(res, dict):
			
 
				+                label = str(res.get("canonical_label") or res.get("normalized") or "").strip()
			
 
				+                if label and label.lower() not in seen_ents:
			
 
				+                    related_ents.append(label)
			
 
				+                    seen_ents.add(label.lower())
			
 
				+        base["related_entities"] = related_ents[:10]
			
 
				+        # Related keywords: from the cluster's own keywords (thematic descriptors)
			
 
				+        # plus any co-occurring keywords from recent related clusters
			
 
				+        base["related_keywords"] = _fetch_related_keywords(store, src_cluster, event_id)
			
 
				         if include_articles:
			
 
				-            cluster = store.get_cluster_by_id(event_id)
			
 
				-            arts = (cluster or {}).get("articles", []) or []
			
 
				-            out["articles"] = [
			
 
				+            arts = src_cluster.get("articles", []) or []
			
 
				+            base["articles"] = [
			
 
				                 {
			
 
				                     "title": a.get("title"),
			
 
				                     "url": a.get("url"),
			
@@ -501,31 +534,20 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
 
				                 for a in arts
			
 
				                 if isinstance(a, dict)
			
 
				             ]
			
 
				-        return out
			
 
				+        return base
			
 
				 
			
 
				-    cluster = store.get_cluster_by_id(event_id)
			
 
				-    if not cluster:
			
 
				-        return {
			
 
				+    if cached_summary:
			
 
				+        out = {
			
 
				             "event_id": event_id,
			
 
				-            "error": "NOT_FOUND",
			
 
				+            "headline": cached_summary.get("headline"),
			
 
				+            "mergedSummary": cached_summary.get("mergedSummary"),
			
 
				+            "keyFacts": cached_summary.get("keyFacts", []),
			
 
				+            "sources": cached_summary.get("sources", []),
			
 
				         }
			
 
				-
			
 
				-    articles_out = None
			
 
				-    if include_articles:
			
 
				-        arts = cluster.get("articles", []) or []
			
 
				-        articles_out = [
			
 
				-            {
			
 
				-                "title": a.get("title"),
			
 
				-                "url": a.get("url"),
			
 
				-                "source": a.get("source"),
			
 
				-                "timestamp": a.get("timestamp"),
			
 
				-            }
			
 
				-            for a in arts
			
 
				-            if isinstance(a, dict)
			
 
				-        ]
			
 
				+        out = _enrich(out, cluster)
			
 
				+        return out
			
 
				 
			
 
				     summary = await summarize_cluster_llm(cluster)
			
 
				-
			
 
				     store.upsert_cluster_summary(event_id, summary)
			
 
				     out = {
			
 
				         "event_id": event_id,
			
@@ -534,12 +556,75 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
 
				         "keyFacts": summary.get("keyFacts", []),
			
 
				         "sources": summary.get("sources", []),
			
 
				     }
			
 
				-
			
 
				-    if include_articles:
			
 
				-        out["articles"] = articles_out or []
			
 
				+    out = _enrich(out, cluster)
			
 
				     return out
			
 
				 
			
 
				 
			
 
				+def _fetch_related_keywords(store: SQLiteClusterStore, cluster: dict, event_id: str) -> list[str]:
			
 
				+    """Find keywords that co-occur with this cluster's entities in recent clusters.
			
 
				+
			
 
				+    This gives agents thematic context: what else was being discussed alongside
			
 
				+    the entities in this cluster during the same window.
			
 
				+    """
			
 
				+    entities = cluster.get("entities", []) or []
			
 
				+    if not entities:
			
 
				+        return []
			
 
				+
			
 
				+    # Build a set of entity terms to search with
			
 
				+    entity_terms = set()
			
 
				+    for e in entities:
			
 
				+        entity_terms.add(str(e).strip().lower())
			
 
				+    for res in (cluster.get("entityResolutions", []) or []):
			
 
				+        if isinstance(res, dict):
			
 
				+            for key in ("normalized", "canonical_label"):
			
 
				+                val = res.get(key)
			
 
				+                if val:
			
 
				+                    entity_terms.add(str(val).strip().lower())
			
 
				+    entity_terms.discard("")
			
 
				+
			
 
				+    if not entity_terms:
			
 
				+        return []
			
 
				+
			
 
				+    # Find recent clusters that share any entity, collect their keywords
			
 
				+    # Use payload_ts lookback of 48h for co-occurrence window
			
 
				+    from datetime import timedelta
			
 
				+    cutoff = (datetime.now(timezone.utc) - timedelta(hours=48)).isoformat()
			
 
				+    placeholders = ",".join("?" for _ in entity_terms)
			
 
				+
			
 
				+    try:
			
 
				+        rows = store._conn().execute(
			
 
				+            f"SELECT DISTINCT c.payload FROM clusters c "
			
 
				+            f"JOIN cluster_entities ce ON c.cluster_id = ce.cluster_id "
			
 
				+            f"WHERE c.payload_ts >= ? AND c.cluster_id != ? "
			
 
				+            f"AND ce.entity IN ({placeholders}) "
			
 
				+            f"ORDER BY c.payload_ts DESC LIMIT 20",
			
 
				+            (cutoff, event_id, *entity_terms),
			
 
				+        ).fetchall()
			
 
				+    except Exception:
			
 
				+        return []
			
 
				+
			
 
				+    kw_counter: dict[str, int] = {}
			
 
				+    cluster_kws = {str(k).strip().lower() for k in (cluster.get("keywords", []) or []) if str(k).strip()}
			
 
				+    for (payload_text,) in rows:
			
 
				+        try:
			
 
				+            c = json.loads(payload_text)
			
 
				+        except Exception:
			
 
				+            continue
			
 
				+        for kw in (c.get("keywords", []) or []):
			
 
				+            kw_norm = str(kw).strip()
			
 
				+            if not kw_norm:
			
 
				+                continue
			
 
				+            kw_key = kw_norm.lower()
			
 
				+            # Skip keywords that already appear in this cluster
			
 
				+            if kw_key in cluster_kws:
			
 
				+                continue
			
 
				+            kw_counter[kw_norm] = kw_counter.get(kw_norm, 0) + 1
			
 
				+
			
 
				+    # Return top keywords by co-occurrence count
			
 
				+    sorted_kws = sorted(kw_counter.items(), key=lambda x: -x[1])
			
 
				+    return [kw for kw, _ in sorted_kws[:10]]
			
 
				+
			
 
				+
			
 
				 @mcp.tool(description="Explore what is starting to matter: surface emerging entities, thematic keywords, and phrases from recent clusters. "
			
 
				            "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity. "
			
 
				            "Results include signal_type (entity / keyword / phrase) for downstream filtering.")
			
@@ -625,6 +710,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				     kw_importance_recent = Counter()
			
 
				     kw_sources: dict[str, set] = {}
			
 
				     kw_buckets: dict[str, set] = {}
			
 
				+    kw_cooccur: dict[str, Counter] = {}
			
 
				 
			
 
				     bucket_size_hours = max(1.0, hours / 6.0)  # split window into ~6 buckets
			
 
				 
			
@@ -711,6 +797,22 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				                     continue
			
 
				                 entity_cooccur[a][b] += 1
			
 
				 
			
 
				+        # keyword co-occurrence: which keywords appear together in the same clusters
			
 
				+        for i in range(len(kws_in_cluster)):
			
 
				+            ka = kws_in_cluster[i]
			
 
				+            if ka not in kw_cooccur:
			
 
				+                kw_cooccur[ka] = Counter()
			
 
				+            for j in range(len(kws_in_cluster)):
			
 
				+                if i == j:
			
 
				+                    continue
			
 
				+                kb = kws_in_cluster[j]
			
 
				+                kw_cooccur[ka][kb] += 1
			
 
				+            # also track entity↔keyword co-occurrence
			
 
				+            for ent in ents_norm:
			
 
				+                if _is_generic_entity(ent):
			
 
				+                    continue
			
 
				+                kw_cooccur[ka][ent] += 1
			
 
				+
			
 
				         # bigram phrases (recent only)
			
 
				         if is_recent:
			
 
				             text = f"{c.get('headline', '')} {c.get('summary', '')}"
			
@@ -763,10 +865,17 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				                 if other != ent:
			
 
				                     related.append(other)
			
 
				 
			
 
				+        related_kws = []
			
 
				+        if ent in kw_cooccur:
			
 
				+            for kw, _cnt in kw_cooccur[ent].most_common(5):
			
 
				+                if kw != ent:
			
 
				+                    related_kws.append(kw)
			
 
				+
			
 
				         scored.append({
			
 
				             "topic": ent,
			
 
				             "trend_score": min(0.99, round(composed_score, 3)),
			
 
				             "related_entities": related[:3] if related else [ent],
			
 
				+            "related_keywords": related_kws[:5],
			
 
				             "velocity": round(velocity, 2),
			
 
				             "recent_count": recent_n,
			
 
				             "prior_count": prior_n,
			
@@ -806,10 +915,17 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				             0.15 * min(1.0, avg_imp)
			
 
				         )
			
 
				 
			
 
				+        kw_related_kws = []
			
 
				+        if kw in kw_cooccur:
			
 
				+            for other, _cnt in kw_cooccur[kw].most_common(5):
			
 
				+                if other != kw:
			
 
				+                    kw_related_kws.append(other)
			
 
				+
			
 
				         kw_scored.append({
			
 
				             "topic": kw,
			
 
				             "trend_score": min(0.99, round(composed_score, 3)),
			
 
				             "related_entities": [],
			
 
				+            "related_keywords": kw_related_kws[:5],
			
 
				             "velocity": round(velocity, 2),
			
 
				             "recent_count": recent_n,
			
 
				             "prior_count": prior_n,
			
@@ -841,6 +957,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				             "topic": phrase.title(),
			
 
				             "trend_score": min(0.99, round(0.30 + 0.15 * min(count, 5), 2)),
			
 
				             "related_entities": [],
			
 
				+            "related_keywords": [],
			
 
				             "velocity": None,
			
 
				             "recent_count": count,
			
 
				             "prior_count": 0,