há 1 semana atrás · 77fb071a28
--- a/PROJECT.md
+++ b/PROJECT.md
@@ -136,6 +136,28 @@ news-mcp/
 
				 - No alerting/threshold notifications (Phase 2)
			
 
				 - No server-sent events for real-time dashboard updates
			
 
				 
			
 
				+## Keyword Utilization Upgrade (May 2026)
			
 
				+
			
 
				+### Problem
			
 
				+Keywords are extracted by the LLM (`extract_entities.prompt` — "provide short keywords that justify the classification"), stored in the cluster payload, and displayed in the dashboard detail view — but they are not used by any search, scoring, or retrieval path. Thematic signals like "ETF", "rate-cut", "contagion" are invisible to entity search, emerging-topics detection, and related-entity expansion.
			
 
				+
			
 
				+### Plan
			
 
				+
			
 
				+#### Phase 1 — Search & Retrieval (done)
			
 
				+- **1a**: Add keywords to `_cluster_entity_haystack()` in `mcp_server_fastmcp.py` so `get_events_for_entity()` and `get_news_sentiment()` match clusters by thematic keywords, not just named entities.
			
 
				+- **1b**: Add `keywords` field to cluster output dicts in `get_latest_events()` and `get_events_for_entity()` so downstream LLM agents see the full semantic picture.
			
 
				+
			
 
				+#### Phase 2 — Emerging Topics (pending)
			
 
				+- **2a**: Count keywords in `detect_emerging_topics()` with parallel `kw_counts_recent` / `kw_counts_prior` accumulators, scored with the same velocity/recency/source-diversity formula as entities.
			
 
				+- **2b**: Optionally promote high-velocity keywords to "suggested entities" on the dashboard.
			
 
				+
			
 
				+#### Phase 3 — Relatedness & Dashboard (pending)
			
 
				+- **3a**: Add keyword co-occurrence counting in `_collect_local_related()` in `related_entities.py`.
			
 
				+- **3b**: Add `get_keyword_frequencies()` to `DashboardStore` and a "Keywords" panel on the dashboard.
			
 
				+
			
 
				+#### Phase 4 — Prompt Refinement (optional)
			
 
				+- Split keyword extraction into "theme keywords" (subject matter) and "signal keywords" (what's new/notable) for differential weighting downstream.
			
 
				+
			
 
				 ## Timestamp Normalization (May 2026)
			
 
				 
			
 
				 ### Problem
			
--- a/dashboard/dashboard.js
+++ b/dashboard/dashboard.js
@@ -8,7 +8,7 @@ var _feedsData = [];
 
				 var _healthLoaded = false;
			
 
				 
			
 
				 function switchView(name) {
			
 
				-  var views = ['health','feeds','clusters','sentiment','entities','detail'];
			
 
				+  var views = ['health','feeds','clusters','sentiment','entities','keywords','detail'];
			
 
				   if (views.indexOf(name) === -1) return;
			
 
				   document.querySelectorAll('.view').forEach(function(v) { v.classList.toggle('active', v.id === 'view-' + name); });
			
 
				   document.querySelectorAll('.nav-links a').forEach(function(a) { a.classList.toggle('active', a.dataset.view === name); });
			
@@ -17,6 +17,7 @@ function switchView(name) {
 
				   if (name === 'clusters') reloadClusters();
			
 
				   if (name === 'sentiment') reloadSentiment();
			
 
				   if (name === 'entities') loadEntities();
			
 
				+  if (name === 'keywords') loadKeywords();
			
 
				 }
			
 
				 
			
 
				 function $(id) {
			
@@ -391,6 +392,74 @@ async function showEntityDetail(label) {
 
				   }
			
 
				 }
			
 
				 
			
 
				+// ── Keywords ──────────────────────────────────────────────
			
 
				+
			
 
				+// Keyword rows (objects with `label` and `count`) from the most recent
				+// /keywords response; written by loadKeywords() and read by
				+// renderKeywordList() and renderKeywordChart().
				+var _keywordsData = [];
				+
				+/**
				+ * Fetch the top keywords from the dashboard API and refresh the keyword
				+ * list and chart.
				+ *
				+ * Requests up to 30 keywords over a 144-hour window. Any failure (network
				+ * error, non-2xx status, invalid JSON, or an exception while rendering) is
				+ * logged and replaces the keyword list with an error placeholder.
				+ */
				+async function loadKeywords() {
				+  try {
				+    var res = await fetch(API + '/keywords?hours=144&limit=30');
				+    // fetch() only rejects on network failure; an HTTP error status still
				+    // resolves, so check it explicitly instead of rendering "No keywords".
				+    if (!res.ok) throw new Error('HTTP ' + res.status);
				+    var d = await res.json();
				+    _keywordsData = Array.isArray(d && d.keywords) ? d.keywords : [];
				+    renderKeywordList();
				+    renderKeywordChart();
				+  } catch(e) {
				+    console.error('Keywords error:', e);
				+    var el = $('keyword-list'); if (el) el.innerHTML = '<div class="loading">Error</div>';
				+  }
				+}
			
 
				+
			
 
				+/**
				+ * Render one clickable row per entry of _keywordsData into #keyword-list.
				+ *
				+ * Rows are built as DOM nodes rather than an HTML string with an inline
				+ * onclick handler: HTML-escaping a label does not make it safe inside a
				+ * JavaScript string literal in an attribute, because the browser decodes
				+ * the entities before the handler source is parsed. A keyword containing
				+ * an apostrophe would otherwise break (or inject into) the handler.
				+ */
				+function renderKeywordList() {
				+  var el = $('keyword-list'); if (!el) return;
				+  if (!_keywordsData.length) { el.innerHTML = '<div class="loading">No keywords</div>'; return; }
				+  var frag = document.createDocumentFragment();
				+  _keywordsData.forEach(function(k) {
				+    var label = k.label == null ? '' : String(k.label);
				+
				+    var row = document.createElement('div');
				+    row.className = 'entity-row';
				+    row.style.cursor = 'pointer';
				+    row.addEventListener('click', function() { showKeywordDetail(label); });
				+
				+    var labelEl = document.createElement('span');
				+    labelEl.className = 'ent-label';
				+    labelEl.textContent = label;
				+
				+    var countEl = document.createElement('span');
				+    countEl.className = 'ent-count';
				+    countEl.textContent = k.count + 'x';
				+
				+    row.appendChild(labelEl);
				+    row.appendChild(countEl);
				+    frag.appendChild(row);
				+  });
				+  el.textContent = '';
				+  el.appendChild(frag);
				+}
			
 
				+
			
 
				+/**
				+ * Draw a horizontal bar chart for the first 15 entries of _keywordsData.
				+ *
				+ * The previous chart instance is destroyed before anything else, including
				+ * when there is nothing to draw, so a stale chart never remains visible
				+ * next to an empty keyword list. Does nothing further when there is no
				+ * data or the #chart-keywords canvas is missing.
				+ */
				+function renderKeywordChart() {
				+  if (_charts.keywords) { _charts.keywords.destroy(); _charts.keywords = null; }
				+  var top15 = _keywordsData.slice(0,15);
				+  var canvas = $('chart-keywords');
				+  if (!top15.length || !canvas) return;
				+  _charts.keywords = new Chart(canvas.getContext('2d'), {
				+    type: 'bar',
				+    // Axis labels are truncated to 24 characters; String() guards against non-string labels.
				+    data: { labels: top15.map(function(k){return String(k.label == null ? '' : k.label).substring(0,24)}), datasets: [{ label:'Occurrences', data:top15.map(function(k){return k.count}), backgroundColor:'rgba(71,207,125,0.3)', borderColor:'#47cf7d', borderWidth:1 }] },
				+    options: { indexAxis:'y', responsive:true, plugins:{legend:{display:false}}, scales:{ x:{ticks:{color:'#8a8f9b'},grid:{color:'rgba(42,46,58,.5)'}}, y:{ticks:{color:'#8a8f9b'},grid:{display:false}} } }
				+  });
				+}
			
 
				+
			
 
				+/**
				+ * Show the clusters tagged with a keyword, newest first, in #keyword-detail.
				+ *
				+ * Fetches up to 200 clusters across all topics for the last 144 hours and
				+ * filters them client-side by a case-insensitive exact match against each
				+ * cluster's `keywords` array. Clicking a result opens the cluster modal.
				+ *
				+ * @param {string} label - Keyword to look up; falsy values are ignored.
				+ */
				+async function showKeywordDetail(label) {
				+  if (!label) return;
				+  var el = $('keyword-detail'); if (!el) return;
				+  el.innerHTML = '<div class="loading">Fetching clusters with keyword ' + esc(label) + '…</div>';
				+  try {
				+    var res = await fetch(API + '/clusters?topic=all&hours=144&limit=200');
				+    // fetch() resolves on HTTP error statuses; report them instead of "no clusters".
				+    if (!res.ok) throw new Error('HTTP ' + res.status);
				+    var d = await res.json();
				+    var wanted = String(label).toLowerCase();
				+    var matched = ((d && d.clusters) || []).filter(function(c) {
				+      return (c.keywords||[]).some(function(k) { return String(k||'').toLowerCase() === wanted; });
				+    });
				+    // Unparseable timestamps sort as epoch 0 so the comparator never returns NaN.
				+    var toMillis = function(ts) { var t = new Date(ts || 0).getTime(); return isNaN(t) ? 0 : t; };
				+    matched.sort(function(a,b) { return toMillis(b.timestamp) - toMillis(a.timestamp); });
				+    if (!matched.length) { el.innerHTML = '<p class="muted">No clusters have keyword "' + esc(label) + '" in the current window.</p>'; return; }
				+    var html = '<h4 style="font-size:.85rem;margin-bottom:.5rem">Clusters with keyword ' + esc(label) + ' (' + matched.length + ')</h4>';
				+    for (var i = 0; i < matched.length; i++) {
				+      var c = matched[i];
				+      // Keep a legitimate score of 0 visible; only null/undefined render as empty.
				+      var score = c.sentimentScore == null ? '' : c.sentimentScore;
				+      // NOTE(review): sentimentClass() output is inserted as visible text here;
				+      // confirm it returns display text rather than a CSS class name.
				+      // Rows carry only their numeric index; the click handler is attached below,
				+      // so cluster_id is never interpolated into inline JavaScript.
				+      html += '<div data-cluster-idx="' + i + '" style="margin-bottom:.6rem;padding:.6rem;background:var(--surface2);border-radius:6px;font-size:.82rem;cursor:pointer">'+
				+        '<b>'+esc(c.headline)+'</b><br><span class="muted">'+topicChip(c.topic)+' '+sentimentClass(c.sentiment)+' '+esc(String(score))+' &middot; '+esc(String(c.timestamp||''))+'</span></div>';
				+    }
				+    el.innerHTML = html;
				+    el.querySelectorAll('[data-cluster-idx]').forEach(function(node) {
				+      node.addEventListener('click', function() {
				+        var hit = matched[Number(node.getAttribute('data-cluster-idx'))];
				+        if (hit) openClusterModal(hit.cluster_id);
				+      });
				+    });
				+  } catch(e) {
				+    el.innerHTML = '<p class="muted">Error: ' + esc(e.message) + '</p>';
				+  }
				+}
			
 
				+
			
 
				 // ── Detail modal ─────────────────────────────────────────
			
 
				 function openClusterModal(clusterId) {
			
 
				   if (!clusterId) return;
			
--- a/dashboard/index.html
+++ b/dashboard/index.html
@@ -19,6 +19,7 @@
 
				     <a href="#" onclick="switchView('clusters'); return false;" data-view="clusters">Clusters</a>
			
 
				     <a href="#" onclick="switchView('sentiment'); return false;" data-view="sentiment">Sentiment</a>
			
 
				     <a href="#" onclick="switchView('entities'); return false;" data-view="entities">Entities</a>
			
 
				+    <a href="#" onclick="switchView('keywords'); return false;" data-view="keywords">Keywords</a>
			
 
				     <a href="#" onclick="switchView('detail'); return false;" data-view="detail">Detail</a>
			
 
				   </div>
			
 
				   <div class="nav-meta" id="nav-meta"></div>
			
@@ -138,6 +139,24 @@
 
				   </div>
			
 
				 </div>
			
 
				 
			
 
				+<!-- KEYWORDS VIEW -->
			
 
				+<div id="view-keywords" class="view">
			
 
				+  <div class="grid grid-3">
			
 
				+    <div class="card card-wide">
			
 
				+      <h3>🏷️ Top Keywords <small class="muted">(144h occurrences, excluding entities)</small></h3>
			
 
				+      <div id="keyword-list"><div class="loading">Loading…</div></div>
			
 
				+    </div>
			
 
				+    <div class="card">
			
 
				+      <h3>📊 Keyword Frequency</h3>
			
 
				+      <div class="chart-wrap"><canvas id="chart-keywords"></canvas></div>
			
 
				+    </div>
			
 
				+    <div class="card">
			
 
				+      <h3>ℹ️ Keyword Detail</h3>
			
 
				+      <div id="keyword-detail"><p class="muted">Click a keyword in the list to see matching clusters.</p></div>
			
 
				+    </div>
			
 
				+  </div>
			
 
				+</div>
			
 
				+
			
 
				 <!-- DETAIL VIEW -->
			
 
				 <div id="view-detail" class="view">
			
 
				   <div class="card">
			
--- a/news_mcp/dashboard/dashboard_store.py
+++ b/news_mcp/dashboard/dashboard_store.py
@@ -281,3 +281,66 @@ class DashboardStore:
 
				             })
			
 
				         return result
			
 
				 
			
 
				+    # ── Keyword Frequencies ─────────────────────────────────────────
			
 
				+
			
 
				+    def get_keyword_frequencies(
				+        self,
				+        hours: float = 24,
				+        limit: int = 30,
				+    ) -> list[dict[str, Any]]:
				+        """Top keywords by number of recent clusters that mention them.
				+
				+        A cluster is "recent" when its own event timestamp
				+        (``payload["timestamp"]``) falls within the last ``hours`` hours;
				+        clusters with a missing or unparseable timestamp are skipped.
				+
				+        Counting is case-insensitive, and a keyword listed more than once in
				+        the same cluster counts once. Keywords that also appear among the
				+        same cluster's ``entities`` are excluded.
				+
				+        Args:
				+            hours: Lookback window in hours, measured back from now (UTC).
				+            limit: Maximum number of keywords to return; values below 1
				+                yield an empty list.
				+
				+        Returns:
				+            A list of ``{"label": str, "count": int}`` dicts sorted by count
				+            descending, ties broken alphabetically. ``label`` is the first
				+            spelling encountered for that keyword.
				+        """
				+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
				+
				+        # The timestamp is read from the JSON payload, so every row is
				+        # fetched and the time window is applied in Python below.
				+        with self._store._conn() as conn:
				+            rows = conn.execute("SELECT payload FROM clusters", []).fetchall()
				+
				+        def _parse_ts(ts):
				+            """Parse an ISO-8601 or RFC 2822 timestamp to aware UTC, or None."""
				+            if not ts:
				+                return None
				+            s = str(ts).strip()
				+            try:
				+                dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
				+            except Exception:
				+                try:
				+                    dt = parsedate_to_datetime(s)
				+                except Exception:
				+                    return None
				+            if dt.tzinfo is None:
				+                # Naive timestamps are treated as UTC.
				+                dt = dt.replace(tzinfo=timezone.utc)
				+            return dt.astimezone(timezone.utc)
				+
				+        counts: dict[str, int] = {}   # lowercase keyword -> cluster count
				+        labels: dict[str, str] = {}   # lowercase keyword -> first-seen spelling
				+        for (payload_text,) in rows:
				+            try:
				+                c = json.loads(payload_text)
				+            except (TypeError, ValueError):
				+                # One unreadable payload must not take down the whole view.
				+                continue
				+            if not isinstance(c, dict):
				+                continue
				+            dt = _parse_ts(c.get("timestamp"))
				+            if dt is None or dt < cutoff:
				+                continue
				+            # Entities in this cluster, used to drop keywords that duplicate them.
				+            ents_in_cluster = {str(e).strip().lower() for e in (c.get("entities") or []) if str(e).strip()}
				+            seen_in_cluster: set[str] = set()
				+            # ``or []`` also covers an explicit null stored in the payload.
				+            for kw in c.get("keywords") or []:
				+                kw_str = str(kw).strip()
				+                key = kw_str.lower()
				+                if not key or key in ents_in_cluster or key in seen_in_cluster:
				+                    continue
				+                seen_in_cluster.add(key)
				+                labels.setdefault(key, kw_str)
				+                counts[key] = counts.get(key, 0) + 1
				+
				+        top = sorted(counts.items(), key=lambda item: (-item[1], item[0]))[: max(0, int(limit))]
				+        return [
				+            {"label": labels[key], "count": count}
				+            for key, count in top
				+        ]
			
 
				+
			
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -65,7 +65,7 @@ mcp = FastMCP(
 
				 
			
 
				 
			
 
				 def _cluster_entity_haystack(cluster: dict) -> list[str]:
			
 
				-    """Collect the normalized entity clues attached to a cluster."""
			
 
				+    """Collect the normalized entity + keyword clues attached to a cluster."""
			
 
				     values: list[str] = []
			
 
				     for ent in cluster.get("entities", []) or []:
			
 
				         values.append(str(ent).strip().lower())
			
@@ -76,6 +76,11 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
 
				             val = res.get(key)
			
 
				             if val:
			
 
				                 values.append(str(val).strip().lower())
			
 
				+    # Keywords are LLM-curated thematic descriptors — include them in the
			
 
				+    # searchable haystack so entity/theme queries match on subject-matter
			
 
				+    # signals, not just named entities.
			
 
				+    for kw in cluster.get("keywords", []) or []:
			
 
				+        values.append(str(kw).strip().lower())
			
 
				     return [v for v in values if v]
			
 
				 
			
 
				 
			
@@ -148,20 +153,20 @@ NEWS_TOOL_CARDS = [
 
				             {"name": "limit", "type": "integer", "default": 5, "range": "1-20"},
			
 
				             {"name": "include_articles", "type": "boolean", "default": False},
			
 
				         ],
			
 
				-        ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
			
 
				-        ["Use when you want the freshest clusters and are willing to let the server decide topic vs entity mode."],
			
 
				+        ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
			
 
				+        ["Use when you want the freshest clusters. Each cluster includes both named entities and LLM-curated thematic keywords describing what the story is about."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "get_events_for_entity",
			
 
				-        "Search recent clusters for a person, place, company, or theme by entity matching.",
			
 
				+        "Search recent clusters for a person, place, company, theme, or keyword by matching entities and thematic keywords.",
			
 
				         [
			
 
				-            {"name": "entity", "type": "string", "meaning": "entity label or phrase"},
			
 
				+            {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to search for"},
			
 
				             {"name": "timeframe", "type": "string", "default": "24h", "examples": ["24h", "72h", "3d"]},
			
 
				             {"name": "limit", "type": "integer", "default": 10, "range": "1-30"},
			
 
				             {"name": "include_articles", "type": "boolean", "default": False},
			
 
				         ],
			
 
				-        ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
			
 
				-        ["Normalization is automatic; use this for an entity-centered deep dive."],
			
 
				+        ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
			
 
				+        ["Matches against both named entities and thematic keywords. Use this for an entity-centered or theme-centered deep dive."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "get_event_summary",
			
@@ -175,7 +180,7 @@ NEWS_TOOL_CARDS = [
 
				     ),
			
 
				     _tool_card(
			
 
				         "detect_emerging_topics",
			
 
				-        "Surface entities and phrases starting to matter in the recent window.",
			
 
				+        "Surface emerging entities, thematic keywords, and phrases that are accelerating in the recent window.",
			
 
				         [
			
 
				             {"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
			
 
				             {"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
			
@@ -183,21 +188,21 @@ NEWS_TOOL_CARDS = [
 
				             {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
			
 
				         ],
			
 
				         ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
			
 
				-        ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity."],
			
 
				+        ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Signal types: entity (named entity), keyword (thematic descriptor), phrase (headline bigram). Check velocity and source_count to distinguish real spikes from noise."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "get_news_sentiment",
			
 
				-        "Estimate sentiment around an entity over a lookback window.",
			
 
				+        "Estimate sentiment around an entity or keyword over a lookback window.",
			
 
				         [
			
 
				-            {"name": "entity", "type": "string"},
			
 
				+            {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to analyze"},
			
 
				             {"name": "timeframe", "type": "string", "default": "24h"},
			
 
				         ],
			
 
				         ["entity", "sentiment", "score", "cluster_count"],
			
 
				-        ["Use after locating a cluster set or entity neighborhood."],
			
 
				+        ["Matches clusters by entities and keywords. Use after locating a cluster set or entity neighborhood."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "get_related_recent_entities",
			
 
				-        "Blend local co-occurrence with Google Trends related topics, while preserving mids where available.",
			
 
				+        "Find entities and thematic keywords commonly co-occurring with a subject in recent clusters, optionally blended with Google Trends suggestions.",
			
 
				         [
			
 
				             {"name": "subject", "type": "string", "meaning": "canonical entity or subject phrase"},
			
 
				             {"name": "timeframe", "type": "string", "default": "72h"},
			
@@ -205,7 +210,7 @@ NEWS_TOOL_CARDS = [
 
				             {"name": "include_trends", "type": "boolean", "default": True},
			
 
				         ],
			
 
				         ["subject", "related[].normalized", "related[].canonical_label", "related[].mid", "related[].sources", "related[].scores"],
			
 
				-        ["Use this to drill from a subject into related entities, then feed those into get_events_for_entity."],
			
 
				+        ["Use this to drill from a subject into related entities and themes, then feed results into get_events_for_entity."],
			
 
				     ),
			
 
				 ]
			
 
				 
			
@@ -256,7 +261,8 @@ NEWS_AGENT_TIPS = [
 
				     "When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
			
 
				     "Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
			
 
				     "For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
			
 
				-    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones.",
			
 
				+    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases.",
			
 
				+    "Each cluster contains both entities (named entities with identity resolution) and keywords (thematic descriptors). Use keywords to understand what a story is about beyond the named entities.",
			
 
				 ]
			
 
				 
			
 
				 
			
@@ -328,7 +334,7 @@ async def toggle_feed(feed_url: str, enabled: bool) -> dict:
 
				     return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
			
 
				+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
			
 
				 async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
			
 
				     limit = max(1, min(int(limit), 20))
			
 
				     # When topic is omitted, search across all topics (no topic filter).
			
@@ -378,6 +384,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
 
				             "headline": c.get("headline"),
			
 
				             "summary": c.get("summary"),
			
 
				             "entities": c.get("entities", []),
			
 
				+            "keywords": c.get("keywords", []),
			
 
				             "sentiment": c.get("sentiment", "neutral"),
			
 
				             "importance": c.get("importance", 0.0),
			
 
				             "sources": c.get("sources", []),
			
@@ -401,7 +408,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
 
				     return out
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
			
 
				+@mcp.tool(description="Investigate a person, company, place, theme, or keyword by matching entities and thematic keywords within a time window.")
			
 
				 async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
			
 
				     limit = max(1, min(int(limit), 30))
			
 
				     query = normalize_query(entity).strip().lower()
			
@@ -440,6 +447,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
				             "headline": c.get("headline"),
			
 
				             "summary": c.get("summary"),
			
 
				             "entities": c.get("entities", []),
			
 
				+            "keywords": c.get("keywords", []),
			
 
				             "sentiment": c.get("sentiment", "neutral"),
			
 
				             "importance": c.get("importance", 0.0),
			
 
				             "sources": c.get("sources", []),
			
@@ -461,7 +469,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
				     return out
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Return entities most commonly associated with the subject in recent clusters, optionally blended with Google Trends suggestions.")
			
 
				+@mcp.tool(description="Return entities and thematic keywords commonly co-occurring with the subject in recent clusters, optionally blended with Google Trends suggestions.")
			
 
				 async def get_related_recent_entities(subject: str, timeframe: str = "72h", limit: int = 10, include_trends: bool = True):
			
 
				     limit = max(1, min(int(limit), 25))
			
 
				     hours = _parse_timeframe_to_hours(timeframe)
			
@@ -547,8 +555,9 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
 
				     return out
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters. "
			
 
				-           "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity.")
			
 
				+@mcp.tool(description="Explore what is starting to matter: surface emerging entities, thematic keywords, and phrases from recent clusters. "
			
 
				+           "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity. "
			
 
				+           "Results include signal_type (entity / keyword / phrase) for downstream filtering.")
			
 
				 async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
			
 
				     """Surface entities and phrases that are accelerating in recent clusters.
			
 
				 
			
@@ -631,12 +640,28 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				     entity_cooccur: dict[str, Counter] = {}
			
 
				     phrase_counts_recent = Counter()
			
 
				 
			
 
				+    # Keyword accumulators — same scoring pipeline as entities, but tracking
			
 
				+    # LLM-curated thematic descriptors instead of named entities.
			
 
				+    kw_counts_recent = Counter()
			
 
				+    kw_counts_prior = Counter()
			
 
				+    kw_importance_recent = Counter()
			
 
				+    kw_sources: dict[str, set] = {}
			
 
				+    kw_buckets: dict[str, set] = {}
			
 
				+
			
 
				     bucket_size_hours = max(1.0, hours / 6.0)  # split window into ~6 buckets
			
 
				 
			
 
				     for c in clusters:
			
 
				         ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
			
 
				         ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
			
 
				 
			
 
				+        # Keywords: deduplicate per cluster so a cluster with the same keyword
			
 
				+        # listed twice doesn't inflate counts.
			
 
				+        kws_in_cluster = list(dict.fromkeys(
			
 
				+            str(k).strip().lower()
			
 
				+            for k in (c.get("keywords", []) or [])
			
 
				+            if str(k).strip() and not _is_generic_entity(k)
			
 
				+        ))
			
 
				+
			
 
				         age_h = _cluster_age_hours(c)
			
 
				         is_recent = age_h <= half_hours
			
 
				         bucket_idx = int(age_h / bucket_size_hours)
			
@@ -674,6 +699,25 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				                 except Exception:
			
 
				                     pass
			
 
				 
			
 
				+        # --- keyword counting (same recent/prior split as entities) ---
			
 
				+        kw_counts = kw_counts_recent if is_recent else kw_counts_prior
			
 
				+        kw_imp_acc = kw_importance_recent if is_recent else None
			
 
				+        for kw in kws_in_cluster:
			
 
				+            kw_counts[kw] += 1
			
 
				+            if kw not in kw_sources:
			
 
				+                kw_sources[kw] = set()
			
 
				+            src = c.get("source") or c.get("headline", "").split(" - ")[-1] if c.get("headline") else ""
			
 
				+            if src:
			
 
				+                kw_sources[kw].add(str(src))
			
 
				+            if kw not in kw_buckets:
			
 
				+                kw_buckets[kw] = set()
			
 
				+            kw_buckets[kw].add(bucket_idx)
			
 
				+            if kw_imp_acc is not None:
			
 
				+                try:
			
 
				+                    kw_imp_acc[kw] += float(c.get("importance", 0.0) or 0.0)  # type: ignore[assignment]
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+
			
 
				         # co-occurrence (only for clusters matching the around filter, if any)
			
 
				         for i in range(len(ents_norm)):
			
 
				             a = ents_norm[i]
			
@@ -753,13 +797,67 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				             "signal_type": "entity",
			
 
				         })
			
 
				 
			
 
				+    # --- score keywords (same velocity/recency/source/sustained/importance formula) ---
			
 
				+    all_keywords = set(kw_counts_recent.keys()) | set(kw_counts_prior.keys())
			
 
				+    kw_scored = []
			
 
				+
			
 
				+    for kw in all_keywords:
			
 
				+        # Skip keywords that are already scored as entities — entity signal is
			
 
				+        # higher quality (proper nouns, resolved identities).
			
 
				+        if kw in all_entities:
			
 
				+            continue
			
 
				+
			
 
				+        recent_n = kw_counts_recent.get(kw, 0)
			
 
				+        prior_n = kw_counts_prior.get(kw, 0)
			
 
				+        total_n = recent_n + prior_n
			
 
				+
			
 
				+        if total_n < 1:
			
 
				+            continue
			
 
				+
			
 
				+        velocity = (recent_n + 0.5) / (prior_n + 0.5)
			
 
				+        recency_ratio = recent_n / total_n
			
 
				+        n_sources = len(kw_sources.get(kw, set()))
			
 
				+        n_buckets = len(kw_buckets.get(kw, set()))
			
 
				+        avg_imp = (kw_importance_recent.get(kw, 0.0) / max(1, recent_n)) if recent_n > 0 else 0.0
			
 
				+
			
 
				+        composed_score = (
			
 
				+            0.35 * min(1.0, math.log1p(velocity) / math.log1p(4.0)) +
			
 
				+            0.25 * recency_ratio +
			
 
				+            0.15 * min(1.0, n_sources / 5.0) +
			
 
				+            0.10 * min(1.0, n_buckets / 4.0) +
			
 
				+            0.15 * min(1.0, avg_imp)
			
 
				+        )
			
 
				+
			
 
				+        kw_scored.append({
			
 
				+            "topic": kw,
			
 
				+            "trend_score": min(0.99, round(composed_score, 3)),
			
 
				+            "related_entities": [],
			
 
				+            "velocity": round(velocity, 2),
			
 
				+            "recent_count": recent_n,
			
 
				+            "prior_count": prior_n,
			
 
				+            "source_count": n_sources,
			
 
				+            "avg_importance": round(avg_imp, 3),
			
 
				+            "signal_type": "keyword",
			
 
				+        })
			
 
				+
			
 
				+    # sort keywords by score descending
			
 
				+    kw_scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
			
 
				+
			
 
				     # sort by composed score descending
			
 
				     scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
			
 
				 
			
 
				-    # --- add phrase signals (only from recent window) ---
			
 
				+    # --- merge: entities first, then keywords, then phrases ---
			
 
				     emerging = list(scored)  # start with entities
			
 
				+    seen_topics = {item["topic"] for item in emerging}
			
 
				+
			
 
				+    for kw_item in kw_scored:
			
 
				+        if kw_item["topic"] not in seen_topics:
			
 
				+            emerging.append(kw_item)
			
 
				+            seen_topics.add(kw_item["topic"])
			
 
				+
			
 
				+    # --- add phrase signals (only from recent window) ---
			
 
				     for phrase, count in phrase_counts_recent.most_common(limit * 2):
			
 
				-        if any(item["topic"] == phrase for item in emerging):
			
 
				+        if phrase in seen_topics:
			
 
				             continue
			
 
				         emerging.append({
			
 
				             "topic": phrase.title(),
			
@@ -772,13 +870,15 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
				             "avg_importance": 0.0,
			
 
				             "signal_type": "phrase",
			
 
				         })
			
 
				+        seen_topics.add(phrase)
			
 
				         if len(emerging) >= limit:
			
 
				             break
			
 
				 
			
 
				     return emerging[:limit]
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
			
 
				+@mcp.tool(description="Investigate whether sentiment around an entity or keyword is positive, negative, or neutral over a chosen lookback window. "
			
 
				+           "Matches clusters by both named entities and thematic keywords.")
			
 
				 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				 
			
@@ -855,11 +955,12 @@ async def get_capabilities():
 
				     return {
			
 
				         "server": {
			
 
				             "name": "news-mcp",
			
 
				-            "purpose": "Recent news clusters, entity drill-down, sentiment, emerging topics, and related-entity expansion.",
			
 
				+            "purpose": "Recent news clusters with entities and thematic keywords, entity/keyword drill-down, sentiment, emerging topics, and related-entity expansion.",
			
 
				             "output_conventions": {
			
 
				                 "cluster_ids": "Do not surface cluster_id in user-facing prose unless explicitly requested; treat it as internal navigation metadata.",
			
 
				                 "sources": "Always preserve and display sources when summarizing a cluster or entity result.",
			
 
				                 "timestamps": "Mention timestamps consistently when comparing multiple clusters or when recency matters.",
			
 
				+                "clusters": "Each cluster includes entities (named entities with optional MID/canonical_label) and keywords (thematic descriptors). Both are searchable; entities are higher-signal, keywords capture subject-matter themes.",
			
 
				             },
			
 
				         },
			
 
				         "tools": NEWS_TOOL_CARDS,
			
@@ -867,10 +968,11 @@ async def get_capabilities():
 
				         "example_chains": NEWS_EXAMPLE_CHAINS,
			
 
				         "agent_tips": NEWS_AGENT_TIPS,
			
 
				         "guidance": [
			
 
				-            "Use get_latest_events for a tail, get_events_for_entity for entity deep dives, and get_related_recent_entities for neighborhood expansion.",
			
 
				+            "Use get_latest_events for a tail, get_events_for_entity for entity/keyword deep dives, and get_related_recent_entities for neighborhood expansion.",
			
 
				             "Prefer normalized/canonical entities when possible, but the server will resolve common aliases and MIDs for you.",
			
 
				             "When presenting results to users, summarize the cluster; avoid exposing internal IDs unless they are needed for follow-up tool calls.",
			
 
				-            "For emerging topics, use detect_emerging_topics with timeframe and around parameters to scope your query. High velocity + high source_count = strong emerging signal.",
			
 
				+            "For emerging topics, use detect_emerging_topics with timeframe and around parameters. Signal types: entity (named entity, highest quality), keyword (thematic descriptor), phrase (headline bigram). High velocity + high source_count = strong signal.",
			
 
				+            "get_events_for_entity and get_news_sentiment match both entities and thematic keywords — use keywords when the subject is a theme rather than a named entity.",
			
 
				         ],
			
 
				     }
			
 
				 
			
@@ -1064,6 +1166,19 @@ def api_entities(
 
				     except Exception as e:
			
 
				         return _api_err(e, f"entities(hours={hours})")
			
 
				 
			
 
				+@app.get("/api/v1/keywords")
			
 
				+def api_keywords(
			
 
				+    hours: int = 24,
			
 
				+    limit: int = 30,
			
 
				+):
			
 
				+    """Top keyword frequencies (thematic descriptors, excluding terms already counted as entities).
			
 
				+
			
 
				+    Handler for GET /api/v1/keywords.
			
 
				+
			
 
				+    Args:
			
 
				+        hours: Forwarded unchanged to DashboardStore.get_keyword_frequencies
			
 
				+            and echoed back in the response (default 24). Presumably the
			
 
				+            lookback window in hours -- confirm against the store.
			
 
				+        limit: Forwarded unchanged to DashboardStore.get_keyword_frequencies
			
 
				+            (default 30). Presumably the maximum number of keywords
			
 
				+            returned -- confirm against the store.
			
 
				+
			
 
				+    Returns:
			
 
				+        On success, a dict with two keys: "keywords" (whatever
			
 
				+        get_keyword_frequencies returned) and "hours" (the requested value).
			
 
				+        On any Exception, the value produced by _api_err.
			
 
				+
			
 
				+    NOTE(review): the entity exclusion mentioned in the summary line is not
			
 
				+    performed in this handler; it is presumably done inside
			
 
				+    get_keyword_frequencies -- verify there.
			
 
				+    """
			
 
				+    try:
			
 
				+        # A DashboardStore is built around _shared_store (defined outside this hunk) on every call.
			
 
				+        store = DashboardStore(_shared_store)
			
 
				+        keywords = store.get_keyword_frequencies(hours=hours, limit=limit)
			
 
				+        return {"keywords": keywords, "hours": hours}
			
 
				+    # Broad catch: every failure is handed to _api_err, labelled with the requested window.
			
 
				+    except Exception as e:
			
 
				+        return _api_err(e, f"keywords(hours={hours})")
			
 
				+
			
 
				 @app.get("/api/v1/cluster/{cluster_id}")
			
 
				 def api_cluster_detail(cluster_id: str):
			
 
				     """Full cluster detail for drill-down."""
			
--- a/news_mcp/related_entities.py
+++ b/news_mcp/related_entities.py
@@ -50,7 +50,11 @@ def _collect_local_related(
 
				         if not (haystack_set & subject_terms):
			
 
				             continue
			
 
				 
			
 
				-        # Count other entities normalized.
			
 
				+        # Collect entities already present in this cluster (by normalized form)
			
 
				+        # so we can skip keywords that are already counted as entities.
			
 
				+        ents_in_cluster = {str(e).strip().lower() for e in (cluster.get("entities", []) or []) if str(e).strip()}
			
 
				+
			
 
				+        # Count other entities (existing behavior).
			
 
				         for ent in cluster.get("entities", []) or []:
			
 
				             ent_norm = normalize_entity(ent)
			
 
				             if not ent_norm:
			
@@ -59,6 +63,24 @@ def _collect_local_related(
 
				             if ent_key in subject_terms:
			
 
				                 continue
			
 
				             counter[ent_norm] += 1
			
 
				+
			
 
				+        # Count keywords that are NOT already entities in this cluster.
			
 
				+        # Keywords are LLM-curated thematic descriptors — they capture
			
 
				+        # subject-matter signals that may not be named entities.
			
 
				+        for kw in cluster.get("keywords", []) or []:
			
 
				+            kw_norm = str(kw).strip()
			
 
				+            if not kw_norm:
			
 
				+                continue
			
 
				+            kw_key = kw_norm.lower()
			
 
				+            # Skip if this keyword is already an entity in this cluster
			
 
				+            # (entity signal is higher quality — has MID, canonical_label).
			
 
				+            if kw_key in ents_in_cluster:
			
 
				+                continue
			
 
				+            # Skip if it matches the subject itself
			
 
				+            if kw_key in subject_terms:
			
 
				+                continue
			
 
				+            counter[kw_norm] += 1
			
 
				+
			
 
				     return counter.most_common(limit)