Jelajahi Sumber

feat: enrich MCP tool responses with keywords, related entities/keywords

get_event_summary:
- include_articles now defaults to true
- Added entities, keywords, related_entities, related_keywords, topic,
  sentiment, sentimentScore, importance to response
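- related_keywords fetched via a SQL co-occurrence query over the
  cluster_entities junction table: finds up to 20 other clusters from the
  last 48h that share an entity with this one, counts their keywords
  (skipping keywords this cluster already has), and returns the 10 most
  frequent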

detect_emerging_topics:
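- Added related_keywords (up to 5) to every result (entity, keyword, phrase);
  phrase results always carry an empty list
- kw_cooccur counter, keyed by keyword, tracks keyword↔keyword and
  keyword↔entity co-occurrence per cluster during the accumulation pass;
  entity results look up the same keyword-keyed counter, so they are only
  populated when the entity string also occurs as a keyword, and
  related_keywords may include entity names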

Agent tips and example chains updated to reflect richer outputs.
Tool cards updated with new output fields.

Tests: 32/32 pass.
Lukas Goldschmidt 1 Minggu lalu
induk
melakukan
8ee480b872
1 mengubah file dengan 158 tambahan dan 41 penghapusan
  1. 158 41
      news_mcp/mcp_server_fastmcp.py

+ 158 - 41
news_mcp/mcp_server_fastmcp.py

@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import asyncio
 import hashlib
+import json
 import logging
 
 import math
@@ -175,10 +176,10 @@ NEWS_TOOL_CARDS = [
         "Produce a concise LLM-written explanation for one cluster and key facts.",
         [
             {"name": "event_id", "type": "string", "meaning": "cluster_id; do not surface in user-facing prose"},
-            {"name": "include_articles", "type": "boolean", "default": False},
+            {"name": "include_articles", "type": "boolean", "default": True},
         ],
-        ["headline", "mergedSummary", "keyFacts", "sources", "articles?"],
-        ["Prefer this after you have already chosen a specific cluster to explain."],
+        ["headline", "mergedSummary", "keyFacts", "sources", "entities", "keywords", "related_entities", "related_keywords", "topic", "sentiment", "importance", "articles"],
+        ["Rich cluster drill-down. Returns LLM summary + cluster metadata + articles. Defaults to include articles."],
     ),
     _tool_card(
         "detect_emerging_topics",
@@ -189,7 +190,7 @@ NEWS_TOOL_CARDS = [
             {"name": "topic", "type": "string", "default": "all topics", "examples": ["crypto", "macro", "regulation", "ai", "other"]},
             {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
         ],
-        ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
+        ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "related_keywords", "signal_type"],
         ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Signal types: entity (named entity), keyword (thematic descriptor), phrase (headline bigram). Check velocity and source_count to distinguish real spikes from noise."],
     ),
     _tool_card(
@@ -263,8 +264,10 @@ NEWS_AGENT_TIPS = [
     "When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
     "Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
     "For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
-    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases.",
+    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases. Each result also includes related_keywords for thematic context.",
+    "get_event_summary returns a rich result: headline, mergedSummary, keyFacts, entities, keywords, related_entities, related_keywords, topic, sentiment, importance, and articles (included by default). Use it for full cluster drill-down.",
     "Each cluster contains both entities (named entities with identity resolution) and keywords (thematic descriptors). Use keywords to understand what a story is about beyond the named entities.",
+    "Use detect_emerging_topics with multiple timeframes (e.g. 4h vs 3d) and compare results to distinguish what's hot right now vs what's persistently trending. related_keywords help identify thematic neighborhoods.",
 ]
 
 
@@ -305,6 +308,16 @@ NEWS_EXAMPLE_CHAINS = [
             "get_events_for_entity(entity=...) on the top emerging neighbor",
         ],
     },
+    {
+        "task": "Full investigation pipeline",
+        "chain": [
+            "detect_emerging_topics(limit=20, timeframe=\"3d\")",
+            "pick an emerging entity/keyword and note its related_entities and related_keywords",
+            "get_event_summary(event_id=...) on the top cluster for full context including articles",
+            "get_news_sentiment(entity=...) to gauge tone around the emerging topic",
+            "detect_emerging_topics(around=<entity>, timeframe=\"4h\") to scout its neighborhood",
+        ],
+    },
 ]
 
 
@@ -470,28 +483,48 @@ async def get_related_recent_entities(subject: str, timeframe: str = "72h", limi
     return result
 
 
-@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts.")
-async def get_event_summary(event_id: str, include_articles: bool = False):
+@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts, "
+           "entities, keywords, related entities and keywords, sentiment, importance, and articles.")
+async def get_event_summary(event_id: str, include_articles: bool = True):
     store = SQLiteClusterStore(DB_PATH)
 
     # Summary cache: reuse if present within TTL.
+    cluster = store.get_cluster_by_id(event_id)
+    if not cluster:
+        return {
+            "event_id": event_id,
+            "error": "NOT_FOUND",
+        }
+
     cached_summary = store.get_cluster_summary(
         cluster_id=event_id,
         ttl_hours=DEFAULT_LOOKBACK_HOURS,
     )
-    if cached_summary:
-        out = {
-            "event_id": event_id,
-            "headline": cached_summary.get("headline"),
-            "mergedSummary": cached_summary.get("mergedSummary"),
-            "keyFacts": cached_summary.get("keyFacts", []),
-            "sources": cached_summary.get("sources", []),
-        }
 
+    def _enrich(base: dict, src_cluster: dict) -> dict:
+        base["entities"] = src_cluster.get("entities", [])
+        base["keywords"] = src_cluster.get("keywords", [])
+        base["topic"] = src_cluster.get("topic", "other")
+        base["sentiment"] = src_cluster.get("sentiment", "neutral")
+        base["sentimentScore"] = src_cluster.get("sentimentScore")
+        base["importance"] = src_cluster.get("importance", 0.0)
+        # Related entities: resolved labels (canonical_label, else normalized) from entityResolutions that are not already in entities
+        resolved = src_cluster.get("entityResolutions", []) or []
+        related_ents = []
+        seen_ents = {str(e).strip().lower() for e in (src_cluster.get("entities", []) or [])}
+        for res in resolved:
+            if isinstance(res, dict):
+                label = str(res.get("canonical_label") or res.get("normalized") or "").strip()
+                if label and label.lower() not in seen_ents:
+                    related_ents.append(label)
+                    seen_ents.add(label.lower())
+        base["related_entities"] = related_ents[:10]
+        # Related keywords: keywords co-occurring in recent clusters that share an
+        # entity with this one, excluding keywords this cluster already has
+        base["related_keywords"] = _fetch_related_keywords(store, src_cluster, event_id)
         if include_articles:
-            cluster = store.get_cluster_by_id(event_id)
-            arts = (cluster or {}).get("articles", []) or []
-            out["articles"] = [
+            arts = src_cluster.get("articles", []) or []
+            base["articles"] = [
                 {
                     "title": a.get("title"),
                     "url": a.get("url"),
@@ -501,31 +534,20 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
                 for a in arts
                 if isinstance(a, dict)
             ]
-        return out
+        return base
 
-    cluster = store.get_cluster_by_id(event_id)
-    if not cluster:
-        return {
+    if cached_summary:
+        out = {
             "event_id": event_id,
-            "error": "NOT_FOUND",
+            "headline": cached_summary.get("headline"),
+            "mergedSummary": cached_summary.get("mergedSummary"),
+            "keyFacts": cached_summary.get("keyFacts", []),
+            "sources": cached_summary.get("sources", []),
         }
-
-    articles_out = None
-    if include_articles:
-        arts = cluster.get("articles", []) or []
-        articles_out = [
-            {
-                "title": a.get("title"),
-                "url": a.get("url"),
-                "source": a.get("source"),
-                "timestamp": a.get("timestamp"),
-            }
-            for a in arts
-            if isinstance(a, dict)
-        ]
+        out = _enrich(out, cluster)
+        return out
 
     summary = await summarize_cluster_llm(cluster)
-
     store.upsert_cluster_summary(event_id, summary)
     out = {
         "event_id": event_id,
@@ -534,12 +556,75 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
         "keyFacts": summary.get("keyFacts", []),
         "sources": summary.get("sources", []),
     }
-
-    if include_articles:
-        out["articles"] = articles_out or []
+    out = _enrich(out, cluster)
     return out
 
 
+def _fetch_related_keywords(store: SQLiteClusterStore, cluster: dict, event_id: str) -> list[str]:
+    """Find keywords that co-occur with this cluster's entities in recent clusters.
+
+    This gives agents thematic context: what else was being discussed alongside
+    the entities in this cluster during the same window.
+    """
+    entities = cluster.get("entities", []) or []
+    if not entities:
+        return []
+
+    # Build a set of entity terms to search with
+    entity_terms = set()
+    for e in entities:
+        entity_terms.add(str(e).strip().lower())
+    for res in (cluster.get("entityResolutions", []) or []):
+        if isinstance(res, dict):
+            for key in ("normalized", "canonical_label"):
+                val = res.get(key)
+                if val:
+                    entity_terms.add(str(val).strip().lower())
+    entity_terms.discard("")
+
+    if not entity_terms:
+        return []
+
+    # Find recent clusters that share any entity, collect their keywords
+    # Use payload_ts lookback of 48h for co-occurrence window
+    from datetime import timedelta
+    cutoff = (datetime.now(timezone.utc) - timedelta(hours=48)).isoformat()
+    placeholders = ",".join("?" for _ in entity_terms)
+
+    try:
+        rows = store._conn().execute(
+            f"SELECT DISTINCT c.payload FROM clusters c "
+            f"JOIN cluster_entities ce ON c.cluster_id = ce.cluster_id "
+            f"WHERE c.payload_ts >= ? AND c.cluster_id != ? "
+            f"AND ce.entity IN ({placeholders}) "
+            f"ORDER BY c.payload_ts DESC LIMIT 20",
+            (cutoff, event_id, *entity_terms),
+        ).fetchall()
+    except Exception:
+        return []
+
+    kw_counter: dict[str, int] = {}
+    cluster_kws = {str(k).strip().lower() for k in (cluster.get("keywords", []) or []) if str(k).strip()}
+    for (payload_text,) in rows:
+        try:
+            c = json.loads(payload_text)
+        except Exception:
+            continue
+        for kw in (c.get("keywords", []) or []):
+            kw_norm = str(kw).strip()
+            if not kw_norm:
+                continue
+            kw_key = kw_norm.lower()
+            # Skip keywords that already appear in this cluster
+            if kw_key in cluster_kws:
+                continue
+            kw_counter[kw_norm] = kw_counter.get(kw_norm, 0) + 1
+
+    # Return top keywords by co-occurrence count
+    sorted_kws = sorted(kw_counter.items(), key=lambda x: -x[1])
+    return [kw for kw, _ in sorted_kws[:10]]
+
+
 @mcp.tool(description="Explore what is starting to matter: surface emerging entities, thematic keywords, and phrases from recent clusters. "
            "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity. "
            "Results include signal_type (entity / keyword / phrase) for downstream filtering.")
@@ -625,6 +710,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
     kw_importance_recent = Counter()
     kw_sources: dict[str, set] = {}
     kw_buckets: dict[str, set] = {}
+    kw_cooccur: dict[str, Counter] = {}
 
     bucket_size_hours = max(1.0, hours / 6.0)  # split window into ~6 buckets
 
@@ -711,6 +797,22 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
                     continue
                 entity_cooccur[a][b] += 1
 
+        # keyword co-occurrence: which keywords appear together in the same clusters
+        for i in range(len(kws_in_cluster)):
+            ka = kws_in_cluster[i]
+            if ka not in kw_cooccur:
+                kw_cooccur[ka] = Counter()
+            for j in range(len(kws_in_cluster)):
+                if i == j:
+                    continue
+                kb = kws_in_cluster[j]
+                kw_cooccur[ka][kb] += 1
+            # also track entity↔keyword co-occurrence
+            for ent in ents_norm:
+                if _is_generic_entity(ent):
+                    continue
+                kw_cooccur[ka][ent] += 1
+
         # bigram phrases (recent only)
         if is_recent:
             text = f"{c.get('headline', '')} {c.get('summary', '')}"
@@ -763,10 +865,17 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
                 if other != ent:
                     related.append(other)
 
+        related_kws = []
+        if ent in kw_cooccur:
+            for kw, _cnt in kw_cooccur[ent].most_common(5):
+                if kw != ent:
+                    related_kws.append(kw)
+
         scored.append({
             "topic": ent,
             "trend_score": min(0.99, round(composed_score, 3)),
             "related_entities": related[:3] if related else [ent],
+            "related_keywords": related_kws[:5],
             "velocity": round(velocity, 2),
             "recent_count": recent_n,
             "prior_count": prior_n,
@@ -806,10 +915,17 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
             0.15 * min(1.0, avg_imp)
         )
 
+        kw_related_kws = []
+        if kw in kw_cooccur:
+            for other, _cnt in kw_cooccur[kw].most_common(5):
+                if other != kw:
+                    kw_related_kws.append(other)
+
         kw_scored.append({
             "topic": kw,
             "trend_score": min(0.99, round(composed_score, 3)),
             "related_entities": [],
+            "related_keywords": kw_related_kws[:5],
             "velocity": round(velocity, 2),
             "recent_count": recent_n,
             "prior_count": prior_n,
@@ -841,6 +957,7 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
             "topic": phrase.title(),
             "trend_score": min(0.99, round(0.30 + 0.15 * min(count, 5), 2)),
             "related_entities": [],
+            "related_keywords": [],
             "velocity": None,
             "recent_count": count,
             "prior_count": 0,