пре 1 недеља · 77fb071a28
--- a/PROJECT.md
+++ b/PROJECT.md
@@ -136,6 +136,28 @@ news-mcp/
 
															 - No alerting/threshold notifications (Phase 2)
														
 
															 - No server-sent events for real-time dashboard updates
														
 
															+## Keyword Utilization Upgrade (May 2026)
														
 
															+
														
 
															+### Problem
														
 
															+Keywords are extracted by the LLM (`extract_entities.prompt` — "provide short keywords that justify the classification"), stored in the cluster payload, and displayed in the dashboard detail view — but they are not used by any search, scoring, or retrieval path. Thematic signals like "ETF", "rate-cut", "contagion" are invisible to entity search, emerging-topics detection, and related-entity expansion.
														
 
															+
														
 
															+### Plan
														
 
															+
														
 
															+#### Phase 1 — Search & Retrieval (done)
														
 
															+- **1a**: Add keywords to `_cluster_entity_haystack()` in `mcp_server_fastmcp.py` so `get_events_for_entity()` and `get_news_sentiment()` match clusters by thematic keywords, not just named entities.
														
 
															+- **1b**: Add `keywords` field to cluster output dicts in `get_latest_events()` and `get_events_for_entity()` so downstream LLM agents see the full semantic picture.
														
 
															+
														
 
															+#### Phase 2 — Emerging Topics (pending)
														
 
															+- **2a**: Count keywords in `detect_emerging_topics()` with parallel `kw_counts_recent` / `kw_counts_prior` accumulators, scored with the same velocity/recency/source-diversity formula as entities.
														
 
															+- **2b**: Optionally promote high-velocity keywords to "suggested entities" on the dashboard.
														
 
															+
														
 
															+#### Phase 3 — Relatedness & Dashboard (pending)
														
 
															+- **3a**: Add keyword co-occurrence counting in `_collect_local_related()` in `related_entities.py`.
														
 
															+- **3b**: Add `get_keyword_frequencies()` to `DashboardStore` and a "Keywords" panel on the dashboard.
														
 
															+
														
 
															+#### Phase 4 — Prompt Refinement (optional)
														
 
															+- Split keyword extraction into "theme keywords" (subject matter) and "signal keywords" (what's new/notable) for differential weighting downstream.
														
 
															+
														
 
															 ## Timestamp Normalization (May 2026)
														
 
															 ### Problem
														
--- a/dashboard/dashboard.js
+++ b/dashboard/dashboard.js
@@ -8,7 +8,7 @@ var _feedsData = [];
 
															 var _healthLoaded = false;
														
 
															 function switchView(name) {
														
 
															-  var views = ['health','feeds','clusters','sentiment','entities','detail'];
														
 
															+  var views = ['health','feeds','clusters','sentiment','entities','keywords','detail'];
														
 
															   if (views.indexOf(name) === -1) return;
														
 
															   document.querySelectorAll('.view').forEach(function(v) { v.classList.toggle('active', v.id === 'view-' + name); });
														
 
															   document.querySelectorAll('.nav-links a').forEach(function(a) { a.classList.toggle('active', a.dataset.view === name); });
														
@@ -17,6 +17,7 @@ function switchView(name) {
 
															   if (name === 'clusters') reloadClusters();
														
 
															   if (name === 'sentiment') reloadSentiment();
														
 
															   if (name === 'entities') loadEntities();
														
 
															+  if (name === 'keywords') loadKeywords();
														
 
															 }
														
 
															 function $(id) {
														
@@ -391,6 +392,74 @@ async function showEntityDetail(label) {
 
															   }
														
 
															 }
														
 
															+// ── Keywords ──────────────────────────────────────────────
														
 
															+
														
 
															+var _keywordsData = [];
														
 
															+
														
 
															+async function loadKeywords() {
														
 
															+  try {
														
 
															+    var res = await fetch(API + '/keywords?hours=144&limit=30');
														
 
															+    var d = await res.json();
														
 
															+    _keywordsData = d.keywords || [];
														
 
															+    renderKeywordList();
														
 
															+    renderKeywordChart();
														
 
															+  } catch(e) {
														
 
															+    console.error('Keywords error:', e);
														
 
															+    var el = $('keyword-list'); if (el) el.innerHTML = '<div class="loading">Error</div>';
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+function renderKeywordList() {
														
 
															+  var el = $('keyword-list'); if (!el) return;
														
 
															+  if (!_keywordsData.length) { el.innerHTML = '<div class="loading">No keywords</div>'; return; }
														
 
															+  var html = '';
														
 
															+  for (var i = 0; i < _keywordsData.length; i++) {
														
 
															+    var k = _keywordsData[i];
														
 
															+    html += '<div class="entity-row" onclick="showKeywordDetail(\''+esc(k.label)+'\')" style="cursor:pointer"><span class="ent-label">' + esc(k.label) + '</span><span class="ent-count">' + k.count + 'x</span></div>';
														
 
															+  }
														
 
															+  el.innerHTML = html;
														
 
															+}
														
 
															+
														
 
															+function renderKeywordChart() {
														
 
															+  var top15 = _keywordsData.slice(0,15);
														
 
															+  if (!top15.length) return;
														
 
															+  if (_charts.keywords) _charts.keywords.destroy();
														
 
															+  _charts.keywords = new Chart($('chart-keywords').getContext('2d'), {
														
 
															+    type: 'bar',
														
 
															+    data: { labels: top15.map(function(k){return k.label.substring(0,24)}), datasets: [{ label:'Occurrences', data:top15.map(function(k){return k.count}), backgroundColor:'rgba(71,207,125,0.3)', borderColor:'#47cf7d', borderWidth:1 }] },
														
 
															+    options: { indexAxis:'y', responsive:true, plugins:{legend:{display:false}}, scales:{ x:{ticks:{color:'#8a8f9b'},grid:{color:'rgba(42,46,58,.5)'}}, y:{ticks:{color:'#8a8f9b'},grid:{display:false}} } }
														
 
															+  });
														
 
															+}
														
 
															+
														
 
															+// Show clusters containing this keyword, sorted by date DESC
														
 
															+async function showKeywordDetail(label) {
														
 
															+  if (!label) return;
														
 
															+  var el = $('keyword-detail'); if (!el) return;
														
 
															+  el.innerHTML = '<div class="loading">Fetching clusters with keyword ' + esc(label) + '…</div>';
														
 
															+  try {
														
 
															+    var res = await fetch(API + '/clusters?topic=all&hours=144&limit=200');
														
 
															+    var d = await res.json();
														
 
															+    var matched = (d.clusters || []).filter(function(c) {
														
 
															+      return (c.keywords||[]).some(function(k) { return (k||'').toLowerCase() === label.toLowerCase(); });
														
 
															+    });
														
 
															+    matched.sort(function(a,b) {
														
 
															+      var ta = new Date(a.timestamp || 0).getTime();
														
 
															+      var tb = new Date(b.timestamp || 0).getTime();
														
 
															+      return tb - ta;
														
 
															+    });
														
 
															+    if (!matched.length) { el.innerHTML = '<p class="muted">No clusters have keyword "' + esc(label) + '" in the current window.</p>'; return; }
														
 
															+    var html = '<h4 style="font-size:.85rem;margin-bottom:.5rem">Clusters with keyword ' + esc(label) + ' (' + matched.length + ')</h4>';
														
 
															+    for (var i = 0; i < matched.length; i++) {
														
 
															+      var c = matched[i];
														
 
															+      html += '<div style="margin-bottom:.6rem;padding:.6rem;background:var(--surface2);border-radius:6px;font-size:.82rem;cursor:pointer" onclick="openClusterModal(\''+esc(c.cluster_id)+'\')">'+
														
 
															+        '<b>'+esc(c.headline)+'</b><br><span class="muted">'+topicChip(c.topic)+' '+sentimentClass(c.sentiment)+' '+esc(String(c.sentimentScore||''))+' &middot; '+esc(String(c.timestamp||''))+'</span></div>';
														
 
															+    }
														
 
															+    el.innerHTML = html;
														
 
															+  } catch(e) {
														
 
															+    el.innerHTML = '<p class="muted">Error: ' + esc(e.message) + '</p>';
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															 // ── Detail modal ─────────────────────────────────────────
														
 
															 function openClusterModal(clusterId) {
														
 
															   if (!clusterId) return;
														
--- a/dashboard/index.html
+++ b/dashboard/index.html
@@ -19,6 +19,7 @@
 
															     <a href="#" onclick="switchView('clusters'); return false;" data-view="clusters">Clusters</a>
														
 
															     <a href="#" onclick="switchView('sentiment'); return false;" data-view="sentiment">Sentiment</a>
														
 
															     <a href="#" onclick="switchView('entities'); return false;" data-view="entities">Entities</a>
														
 
															+    <a href="#" onclick="switchView('keywords'); return false;" data-view="keywords">Keywords</a>
														
 
															     <a href="#" onclick="switchView('detail'); return false;" data-view="detail">Detail</a>
														
 
															   </div>
														
 
															   <div class="nav-meta" id="nav-meta"></div>
														
@@ -138,6 +139,24 @@
 
															   </div>
														
 
															 </div>
														
 
															+<!-- KEYWORDS VIEW -->
														
 
															+<div id="view-keywords" class="view">
														
 
															+  <div class="grid grid-3">
														
 
															+    <div class="card card-wide">
														
 
															+      <h3>🏷️ Top Keywords <small class="muted">(144h occurrences, excluding entities)</small></h3>
														
 
															+      <div id="keyword-list"><div class="loading">Loading…</div></div>
														
 
															+    </div>
														
 
															+    <div class="card">
														
 
															+      <h3>📊 Keyword Frequency</h3>
														
 
															+      <div class="chart-wrap"><canvas id="chart-keywords"></canvas></div>
														
 
															+    </div>
														
 
															+    <div class="card">
														
 
															+      <h3>ℹ️ Keyword Detail</h3>
														
 
															+      <div id="keyword-detail"><p class="muted">Click a keyword in the list to see matching clusters.</p></div>
														
 
															+    </div>
														
 
															+  </div>
														
 
															+</div>
														
 
															+
														
 
															 <!-- DETAIL VIEW -->
														
 
															 <div id="view-detail" class="view">
														
 
															   <div class="card">
														
--- a/news_mcp/dashboard/dashboard_store.py
+++ b/news_mcp/dashboard/dashboard_store.py
@@ -281,3 +281,66 @@ class DashboardStore:
 
															             })
														
 
															         return result
														
 
															+    # ── Keyword Frequencies ─────────────────────────────────────────
														
 
															+
														
 
															+    def get_keyword_frequencies(
														
 
															+        self,
														
 
															+        hours: float = 24,
														
 
															+        limit: int = 30,
														
 
															+    ) -> list[dict[str, Any]]:
														
 
															+        """Top keywords by occurrence count in recent clusters.
														
 
															+
														
 
															+        Mirrors get_entity_frequencies but for LLM-curated thematic keywords.
														
 
															+        Filters by the cluster's own event timestamp (payload.timestamp).
														
 
															+        Only includes keywords that are NOT already extracted as entities
														
 
															+        in the same cluster — the entity signal is higher quality and is
														
 
															+        already shown in the entity frequencies view.
														
 
															+        """
														
 
															+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
														
 
															+
														
 
															+        query = "SELECT payload FROM clusters"
														
 
															+        params: list = []
														
 
															+        with self._store._conn() as conn:
														
 
															+            cur = conn.execute(query, params)
														
 
															+            rows = cur.fetchall()
														
 
															+
														
 
															+        def _parse_ts(ts):
														
 
															+            if not ts:
														
 
															+                return None
														
 
															+            s = str(ts).strip()
														
 
															+            try:
														
 
															+                dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
														
 
															+            except Exception:
														
 
															+                try:
														
 
															+                    dt = parsedate_to_datetime(s)
														
 
															+                except Exception:
														
 
															+                    return None
														
 
															+            if dt.tzinfo is None:
														
 
															+                dt = dt.replace(tzinfo=timezone.utc)
														
 
															+            return dt.astimezone(timezone.utc)
														
 
															+
														
 
															+        counter: dict[str, int] = {}
														
 
															+        for (payload_text,) in rows:
														
 
															+            c = json.loads(payload_text)
														
 
															+            dt = _parse_ts(c.get("timestamp"))
														
 
															+            if dt is None:
														
 
															+                continue
														
 
															+            if dt < cutoff:
														
 
															+                continue
														
 
															+            # Get entities in this cluster to dedup against keywords
														
 
															+            ents_in_cluster = {str(e).strip().lower() for e in (c.get("entities", []) or []) if str(e).strip()}
														
 
															+            for kw in c.get("keywords", []):
														
 
															+                kw_str = str(kw).strip()
														
 
															+                if not kw_str:
														
 
															+                    continue
														
 
															+                # Skip keywords that are already entities in this cluster
														
 
															+                if kw_str.lower() in ents_in_cluster:
														
 
															+                    continue
														
 
															+                counter[kw_str] = counter.get(kw_str, 0) + 1
														
 
															+
														
 
															+        sorted_kws = sorted(counter.items(), key=lambda x: -x[1])[:limit]
														
 
															+        return [
														
 
															+            {"label": label, "count": count}
														
 
															+            for label, count in sorted_kws
														
 
															+        ]
														
 
															+
														
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -65,7 +65,7 @@ mcp = FastMCP(
 
															 def _cluster_entity_haystack(cluster: dict) -> list[str]:
														
 
															-    """Collect the normalized entity clues attached to a cluster."""
														
 
															+    """Collect the normalized entity + keyword clues attached to a cluster."""
														
 
															     values: list[str] = []
														
 
															     for ent in cluster.get("entities", []) or []:
														
 
															         values.append(str(ent).strip().lower())
														
@@ -76,6 +76,11 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
 
															             val = res.get(key)
														
 
															             if val:
														
 
															                 values.append(str(val).strip().lower())
														
 
															+    # Keywords are LLM-curated thematic descriptors — include them in the
														
 
															+    # searchable haystack so entity/theme queries match on subject-matter
														
 
															+    # signals, not just named entities.
														
 
															+    for kw in cluster.get("keywords", []) or []:
														
 
															+        values.append(str(kw).strip().lower())
														
 
															     return [v for v in values if v]
														
@@ -148,20 +153,20 @@ NEWS_TOOL_CARDS = [
 
															             {"name": "limit", "type": "integer", "default": 5, "range": "1-20"},
														
 
															             {"name": "include_articles", "type": "boolean", "default": False},
														
 
															         ],
														
 
															-        ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
														
 
															-        ["Use when you want the freshest clusters and are willing to let the server decide topic vs entity mode."],
														
 
															+        ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
														
 
															+        ["Use when you want the freshest clusters. Each cluster includes both named entities and LLM-curated thematic keywords describing what the story is about."],
														
 
															     ),
														
 
															     _tool_card(
														
 
															         "get_events_for_entity",
														
 
															-        "Search recent clusters for a person, place, company, or theme by entity matching.",
														
 
															+        "Search recent clusters for a person, place, company, theme, or keyword by matching entities and thematic keywords.",
														
 
															         [
														
 
															-            {"name": "entity", "type": "string", "meaning": "entity label or phrase"},
														
 
															+            {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to search for"},
														
 
															             {"name": "timeframe", "type": "string", "default": "24h", "examples": ["24h", "72h", "3d"]},
														
 
															             {"name": "limit", "type": "integer", "default": 10, "range": "1-30"},
														
 
															             {"name": "include_articles", "type": "boolean", "default": False},
														
 
															         ],
														
 
															-        ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
														
 
															-        ["Normalization is automatic; use this for an entity-centered deep dive."],
														
 
															+        ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
														
 
															+        ["Matches against both named entities and thematic keywords. Use this for an entity-centered or theme-centered deep dive."],
														
 
															     ),
														
 
															     _tool_card(
														
 
															         "get_event_summary",
														
@@ -175,7 +180,7 @@ NEWS_TOOL_CARDS = [
 
															     ),
														
 
															     _tool_card(
														
 
															         "detect_emerging_topics",
														
 
															-        "Surface entities and phrases starting to matter in the recent window.",
														
 
															+        "Surface emerging entities, thematic keywords, and phrases that are accelerating in the recent window.",
														
 
															         [
														
 
															             {"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
														
 
															             {"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
														
@@ -183,21 +188,21 @@ NEWS_TOOL_CARDS = [
 
															             {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
														
 
															         ],
														
 
															         ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
														
 
															-        ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity."],
														
 
															+        ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Signal types: entity (named entity), keyword (thematic descriptor), phrase (headline bigram). Check velocity and source_count to distinguish real spikes from noise."],
														
 
															     ),
														
 
															     _tool_card(
														
 
															         "get_news_sentiment",
														
 
															-        "Estimate sentiment around an entity over a lookback window.",
														
 
															+        "Estimate sentiment around an entity or keyword over a lookback window.",
														
 
															         [
														
 
															-            {"name": "entity", "type": "string"},
														
 
															+            {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to analyze"},
														
 
															             {"name": "timeframe", "type": "string", "default": "24h"},
														
 
															         ],
														
 
															         ["entity", "sentiment", "score", "cluster_count"],
														
 
															-        ["Use after locating a cluster set or entity neighborhood."],
														
 
															+        ["Matches clusters by entities and keywords. Use after locating a cluster set or entity neighborhood."],
														
 
															     ),
														
 
															     _tool_card(
														
 
															         "get_related_recent_entities",
														
 
															-        "Blend local co-occurrence with Google Trends related topics, while preserving mids where available.",
														
 
															+        "Find entities and thematic keywords commonly co-occurring with a subject in recent clusters, optionally blended with Google Trends suggestions.",
														
 
															         [
														
 
															             {"name": "subject", "type": "string", "meaning": "canonical entity or subject phrase"},
														
 
															             {"name": "timeframe", "type": "string", "default": "72h"},
														
@@ -205,7 +210,7 @@ NEWS_TOOL_CARDS = [
 
															             {"name": "include_trends", "type": "boolean", "default": True},
														
 
															         ],
														
 
															         ["subject", "related[].normalized", "related[].canonical_label", "related[].mid", "related[].sources", "related[].scores"],
														
 
															-        ["Use this to drill from a subject into related entities, then feed those into get_events_for_entity."],
														
 
															+        ["Use this to drill from a subject into related entities and themes, then feed results into get_events_for_entity."],
														
 
															     ),
														
 
															 ]
														
@@ -256,7 +261,8 @@ NEWS_AGENT_TIPS = [
 
															     "When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
														
 
															     "Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
														
 
															     "For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
														
 
															-    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones.",
														
 
															+    "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases.",
														
 
															+    "Each cluster contains both entities (named entities with identity resolution) and keywords (thematic descriptors). Use keywords to understand what a story is about beyond the named entities.",
														
 
															 ]
														
@@ -328,7 +334,7 @@ async def toggle_feed(feed_url: str, enabled: bool) -> dict:
 
															     return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
														
 
															-@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
														
 
															+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
														
 
															 async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
														
 
															     limit = max(1, min(int(limit), 20))
														
 
															     # When topic is omitted, search across all topics (no topic filter).
														
@@ -378,6 +384,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
 
															             "headline": c.get("headline"),
														
 
															             "summary": c.get("summary"),
														
 
															             "entities": c.get("entities", []),
														
 
															+            "keywords": c.get("keywords", []),
														
 
															             "sentiment": c.get("sentiment", "neutral"),
														
 
															             "importance": c.get("importance", 0.0),
														
 
															             "sources": c.get("sources", []),
														
@@ -401,7 +408,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
 
															     return out
														
 
															-@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
														
 
															+@mcp.tool(description="Investigate a person, company, place, theme, or keyword by matching entities and thematic keywords within a time window.")
														
 
															 async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
														
 
															     limit = max(1, min(int(limit), 30))
														
 
															     query = normalize_query(entity).strip().lower()
														
@@ -440,6 +447,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
															             "headline": c.get("headline"),
														
 
															             "summary": c.get("summary"),
														
 
															             "entities": c.get("entities", []),
														
 
															+            "keywords": c.get("keywords", []),
														
 
															             "sentiment": c.get("sentiment", "neutral"),
														
 
															             "importance": c.get("importance", 0.0),
														
 
															             "sources": c.get("sources", []),
														
@@ -461,7 +469,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
															     return out
														
 
															-@mcp.tool(description="Return entities most commonly associated with the subject in recent clusters, optionally blended with Google Trends suggestions.")
														
 
															+@mcp.tool(description="Return entities and thematic keywords commonly co-occurring with the subject in recent clusters, optionally blended with Google Trends suggestions.")
														
 
															 async def get_related_recent_entities(subject: str, timeframe: str = "72h", limit: int = 10, include_trends: bool = True):
														
 
															     limit = max(1, min(int(limit), 25))
														
 
															     hours = _parse_timeframe_to_hours(timeframe)
														
@@ -547,8 +555,9 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
 
															     return out
														
 
															-@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters. "
														
 
															-           "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity.")
														
 
															+@mcp.tool(description="Explore what is starting to matter: surface emerging entities, thematic keywords, and phrases from recent clusters. "
														
 
															+           "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity. "
														
 
															+           "Results include signal_type (entity / keyword / phrase) for downstream filtering.")
														
 
															 async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
														
 
															     """Surface entities and phrases that are accelerating in recent clusters.
														
@@ -631,12 +640,28 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
															     entity_cooccur: dict[str, Counter] = {}
														
 
															     phrase_counts_recent = Counter()
														
 
															+    # Keyword accumulators — same scoring pipeline as entities, but tracking
														
 
															+    # LLM-curated thematic descriptors instead of named entities.
														
 
															+    kw_counts_recent = Counter()
														
 
															+    kw_counts_prior = Counter()
														
 
															+    kw_importance_recent = Counter()
														
 
															+    kw_sources: dict[str, set] = {}
														
 
															+    kw_buckets: dict[str, set] = {}
														
 
															+
														
 
															     bucket_size_hours = max(1.0, hours / 6.0)  # split window into ~6 buckets
														
 
															     for c in clusters:
														
 
															         ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
														
 
															         ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
														
 
															+        # Keywords: deduplicate per cluster so a cluster with the same keyword
														
 
															+        # listed twice doesn't inflate counts.
														
 
															+        kws_in_cluster = list(dict.fromkeys(
														
 
															+            str(k).strip().lower()
														
 
															+            for k in (c.get("keywords", []) or [])
														
 
															+            if str(k).strip() and not _is_generic_entity(k)
														
 
															+        ))
														
 
															+
														
 
															         age_h = _cluster_age_hours(c)
														
 
															         is_recent = age_h <= half_hours
														
 
															         bucket_idx = int(age_h / bucket_size_hours)
														
@@ -674,6 +699,25 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
															                 except Exception:
														
 
															                     pass
														
 
															+        # --- keyword counting (same recent/prior split as entities) ---
														
 
															+        kw_counts = kw_counts_recent if is_recent else kw_counts_prior
														
 
															+        kw_imp_acc = kw_importance_recent if is_recent else None
														
 
															+        for kw in kws_in_cluster:
														
 
															+            kw_counts[kw] += 1
														
 
															+            if kw not in kw_sources:
														
 
															+                kw_sources[kw] = set()
														
 
															+            src = c.get("source") or c.get("headline", "").split(" - ")[-1] if c.get("headline") else ""
														
 
															+            if src:
														
 
															+                kw_sources[kw].add(str(src))
														
 
															+            if kw not in kw_buckets:
														
 
															+                kw_buckets[kw] = set()
														
 
															+            kw_buckets[kw].add(bucket_idx)
														
 
															+            if kw_imp_acc is not None:
														
 
															+                try:
														
 
															+                    kw_imp_acc[kw] += float(c.get("importance", 0.0) or 0.0)  # type: ignore[assignment]
														
 
															+                except Exception:
														
 
															+                    pass
														
 
															+
														
 
															         # co-occurrence (only for clusters matching the around filter, if any)
														
 
															         for i in range(len(ents_norm)):
														
 
															             a = ents_norm[i]
														
@@ -753,13 +797,67 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
															             "signal_type": "entity",
														
 
															         })
														
 
															+    # --- score keywords (same velocity/recency/source/sustained/importance formula) ---
														
 
															+    all_keywords = set(kw_counts_recent.keys()) | set(kw_counts_prior.keys())
														
 
															+    kw_scored = []
														
 
															+
														
 
															+    for kw in all_keywords:
														
 
															+        # Skip keywords that are already scored as entities — entity signal is
														
 
															+        # higher quality (proper nouns, resolved identities).
														
 
															+        if kw in all_entities:
														
 
															+            continue
														
 
															+
														
 
															+        recent_n = kw_counts_recent.get(kw, 0)
														
 
															+        prior_n = kw_counts_prior.get(kw, 0)
														
 
															+        total_n = recent_n + prior_n
														
 
															+
														
 
															+        if total_n < 1:
														
 
															+            continue
														
 
															+
														
 
															+        velocity = (recent_n + 0.5) / (prior_n + 0.5)
														
 
															+        recency_ratio = recent_n / total_n
														
 
															+        n_sources = len(kw_sources.get(kw, set()))
														
 
															+        n_buckets = len(kw_buckets.get(kw, set()))
														
 
															+        avg_imp = (kw_importance_recent.get(kw, 0.0) / max(1, recent_n)) if recent_n > 0 else 0.0
														
 
															+
														
 
															+        composed_score = (
														
 
															+            0.35 * min(1.0, math.log1p(velocity) / math.log1p(4.0)) +
														
 
															+            0.25 * recency_ratio +
														
 
															+            0.15 * min(1.0, n_sources / 5.0) +
														
 
															+            0.10 * min(1.0, n_buckets / 4.0) +
														
 
															+            0.15 * min(1.0, avg_imp)
														
 
															+        )
														
 
															+
														
 
															+        kw_scored.append({
														
 
															+            "topic": kw,
														
 
															+            "trend_score": min(0.99, round(composed_score, 3)),
														
 
															+            "related_entities": [],
														
 
															+            "velocity": round(velocity, 2),
														
 
															+            "recent_count": recent_n,
														
 
															+            "prior_count": prior_n,
														
 
															+            "source_count": n_sources,
														
 
															+            "avg_importance": round(avg_imp, 3),
														
 
															+            "signal_type": "keyword",
														
 
															+        })
														
 
															+
														
 
															+    # sort keywords by score descending
														
 
															+    kw_scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
														
 
															+
														
 
															     # sort by composed score descending
														
 
															     scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
														
 
															-    # --- add phrase signals (only from recent window) ---
														
 
															+    # --- merge: entities first, then keywords, then phrases ---
														
 
															     emerging = list(scored)  # start with entities
														
 
															+    seen_topics = {item["topic"] for item in emerging}
														
 
															+
														
 
															+    for kw_item in kw_scored:
														
 
															+        if kw_item["topic"] not in seen_topics:
														
 
															+            emerging.append(kw_item)
														
 
															+            seen_topics.add(kw_item["topic"])
														
 
															+
														
 
															+    # --- add phrase signals (only from recent window) ---
														
 
															     for phrase, count in phrase_counts_recent.most_common(limit * 2):
														
 
															-        if any(item["topic"] == phrase for item in emerging):
														
 
															+        if phrase in seen_topics:
														
 
															             continue
														
 
															         emerging.append({
														
 
															             "topic": phrase.title(),
														
@@ -772,13 +870,15 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
 
															             "avg_importance": 0.0,
														
 
															             "signal_type": "phrase",
														
 
															         })
														
 
															+        seen_topics.add(phrase)
														
 
															         if len(emerging) >= limit:
														
 
															             break
														
 
															     return emerging[:limit]
														
 
															-@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
														
 
															+@mcp.tool(description="Investigate whether sentiment around an entity or keyword is positive, negative, or neutral over a chosen lookback window. "
														
 
															+           "Matches clusters by both named entities and thematic keywords.")
														
 
															 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
														
 
															     store = SQLiteClusterStore(DB_PATH)
														
@@ -855,11 +955,12 @@ async def get_capabilities():
 
															     return {
														
 
															         "server": {
														
 
															             "name": "news-mcp",
														
 
															-            "purpose": "Recent news clusters, entity drill-down, sentiment, emerging topics, and related-entity expansion.",
														
 
															+            "purpose": "Recent news clusters with entities and thematic keywords, entity/keyword drill-down, sentiment, emerging topics, and related-entity expansion.",
														
 
															             "output_conventions": {
														
 
															                 "cluster_ids": "Do not surface cluster_id in user-facing prose unless explicitly requested; treat it as internal navigation metadata.",
														
 
															                 "sources": "Always preserve and display sources when summarizing a cluster or entity result.",
														
 
															                 "timestamps": "Mention timestamps consistently when comparing multiple clusters or when recency matters.",
														
 
															+                "clusters": "Each cluster includes entities (named entities with optional MID/canonical_label) and keywords (thematic descriptors). Both are searchable; entities are higher-signal, keywords capture subject-matter themes.",
														
 
															             },
														
 
															         },
														
 
															         "tools": NEWS_TOOL_CARDS,
														
@@ -867,10 +968,11 @@ async def get_capabilities():
 
															         "example_chains": NEWS_EXAMPLE_CHAINS,
														
 
															         "agent_tips": NEWS_AGENT_TIPS,
														
 
															         "guidance": [
														
 
															-            "Use get_latest_events for a tail, get_events_for_entity for entity deep dives, and get_related_recent_entities for neighborhood expansion.",
														
 
															+            "Use get_latest_events for a tail, get_events_for_entity for entity/keyword deep dives, and get_related_recent_entities for neighborhood expansion.",
														
 
															             "Prefer normalized/canonical entities when possible, but the server will resolve common aliases and MIDs for you.",
														
 
															             "When presenting results to users, summarize the cluster; avoid exposing internal IDs unless they are needed for follow-up tool calls.",
														
 
															-            "For emerging topics, use detect_emerging_topics with timeframe and around parameters to scope your query. High velocity + high source_count = strong emerging signal.",
														
 
															+            "For emerging topics, use detect_emerging_topics with timeframe and around parameters. Signal types: entity (named entity, highest quality), keyword (thematic descriptor), phrase (headline bigram). High velocity + high source_count = strong signal.",
														
 
															+            "get_events_for_entity and get_news_sentiment match both entities and thematic keywords — use keywords when the subject is a theme rather than a named entity.",
														
 
															         ],
														
 
															     }
														
@@ -1064,6 +1166,19 @@ def api_entities(
 
															     except Exception as e:
														
 
															         return _api_err(e, f"entities(hours={hours})")
														
 
															+@app.get("/api/v1/keywords")
														
 
															+def api_keywords(
														
 
															+    hours: int = 24,
														
 
															+    limit: int = 30,
														
 
															+):
														
 
															+    """Top keyword frequencies (thematic descriptors, excluding terms already counted as entities)."""
														
 
															+    try:
														
 
															+        store = DashboardStore(_shared_store)
														
 
															+        keywords = store.get_keyword_frequencies(hours=hours, limit=limit)
														
 
															+        return {"keywords": keywords, "hours": hours}
														
 
															+    except Exception as e:
														
 
															+        return _api_err(e, f"keywords(hours={hours})")
														
 
															+
														
 
															 @app.get("/api/v1/cluster/{cluster_id}")
														
 
															 def api_cluster_detail(cluster_id: str):
														
 
															     """Full cluster detail for drill-down."""
														
--- a/news_mcp/related_entities.py
+++ b/news_mcp/related_entities.py
@@ -50,7 +50,11 @@ def _collect_local_related(
 
															         if not (haystack_set & subject_terms):
														
 
															             continue
														
 
															-        # Count other entities normalized.
														
 
															+        # Collect entities already present in this cluster (by normalized form)
														
 
															+        # so we can skip keywords that are already counted as entities.
														
 
															+        ents_in_cluster = {str(e).strip().lower() for e in (cluster.get("entities", []) or []) if str(e).strip()}
														
 
															+
														
 
															+        # Count other entities (existing behavior).
														
 
															         for ent in cluster.get("entities", []) or []:
														
 
															             ent_norm = normalize_entity(ent)
														
 
															             if not ent_norm:
														
@@ -59,6 +63,24 @@ def _collect_local_related(
 
															             if ent_key in subject_terms:
														
 
															                 continue
														
 
															             counter[ent_norm] += 1
														
 
															+
														
 
															+        # Count keywords that are NOT already entities in this cluster.
														
 
															+        # Keywords are LLM-curated thematic descriptors — they capture
														
 
															+        # subject-matter signals that may not be named entities.
														
 
															+        for kw in cluster.get("keywords", []) or []:
														
 
															+            kw_norm = str(kw).strip()
														
 
															+            if not kw_norm:
														
 
															+                continue
														
 
															+            kw_key = kw_norm.lower()
														
 
															+            # Skip if this keyword is already an entity in this cluster
														
 
															+            # (entity signal is higher quality — has MID, canonical_label).
														
 
															+            if kw_key in ents_in_cluster:
														
 
															+                continue
														
 
															+            # Skip if it matches the subject itself
														
 
															+            if kw_key in subject_terms:
														
 
															+                continue
														
 
															+            counter[kw_norm] += 1
														
 
															+
														
 
															     return counter.most_common(limit)