|
@@ -65,7 +65,7 @@ mcp = FastMCP(
|
|
|
|
|
|
|
|
|
|
|
|
|
def _cluster_entity_haystack(cluster: dict) -> list[str]:
|
|
def _cluster_entity_haystack(cluster: dict) -> list[str]:
|
|
|
- """Collect the normalized entity clues attached to a cluster."""
|
|
|
|
|
|
|
+ """Collect the normalized entity + keyword clues attached to a cluster."""
|
|
|
values: list[str] = []
|
|
values: list[str] = []
|
|
|
for ent in cluster.get("entities", []) or []:
|
|
for ent in cluster.get("entities", []) or []:
|
|
|
values.append(str(ent).strip().lower())
|
|
values.append(str(ent).strip().lower())
|
|
@@ -76,6 +76,11 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
|
|
|
val = res.get(key)
|
|
val = res.get(key)
|
|
|
if val:
|
|
if val:
|
|
|
values.append(str(val).strip().lower())
|
|
values.append(str(val).strip().lower())
|
|
|
|
|
+ # Keywords are LLM-curated thematic descriptors — include them in the
|
|
|
|
|
+ # searchable haystack so entity/theme queries match on subject-matter
|
|
|
|
|
+ # signals, not just named entities.
|
|
|
|
|
+ for kw in cluster.get("keywords", []) or []:
|
|
|
|
|
+ values.append(str(kw).strip().lower())
|
|
|
return [v for v in values if v]
|
|
return [v for v in values if v]
|
|
|
|
|
|
|
|
|
|
|
|
@@ -148,20 +153,20 @@ NEWS_TOOL_CARDS = [
|
|
|
{"name": "limit", "type": "integer", "default": 5, "range": "1-20"},
|
|
{"name": "limit", "type": "integer", "default": 5, "range": "1-20"},
|
|
|
{"name": "include_articles", "type": "boolean", "default": False},
|
|
{"name": "include_articles", "type": "boolean", "default": False},
|
|
|
],
|
|
],
|
|
|
- ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
|
|
|
|
|
- ["Use when you want the freshest clusters and are willing to let the server decide topic vs entity mode."],
|
|
|
|
|
|
|
+ ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
|
|
|
|
|
+ ["Use when you want the freshest clusters. Each cluster includes both named entities and LLM-curated thematic keywords describing what the story is about."],
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"get_events_for_entity",
|
|
"get_events_for_entity",
|
|
|
- "Search recent clusters for a person, place, company, or theme by entity matching.",
|
|
|
|
|
|
|
+ "Search recent clusters for a person, place, company, theme, or keyword by matching entities and thematic keywords.",
|
|
|
[
|
|
[
|
|
|
- {"name": "entity", "type": "string", "meaning": "entity label or phrase"},
|
|
|
|
|
|
|
+ {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to search for"},
|
|
|
{"name": "timeframe", "type": "string", "default": "24h", "examples": ["24h", "72h", "3d"]},
|
|
{"name": "timeframe", "type": "string", "default": "24h", "examples": ["24h", "72h", "3d"]},
|
|
|
{"name": "limit", "type": "integer", "default": 10, "range": "1-30"},
|
|
{"name": "limit", "type": "integer", "default": 10, "range": "1-30"},
|
|
|
{"name": "include_articles", "type": "boolean", "default": False},
|
|
{"name": "include_articles", "type": "boolean", "default": False},
|
|
|
],
|
|
],
|
|
|
- ["headline", "summary", "entities", "sentiment", "importance", "sources", "timestamp", "articles?"],
|
|
|
|
|
- ["Normalization is automatic; use this for an entity-centered deep dive."],
|
|
|
|
|
|
|
+ ["headline", "summary", "entities", "keywords", "sentiment", "importance", "sources", "timestamp", "articles?"],
|
|
|
|
|
+ ["Matches against both named entities and thematic keywords. Use this for an entity-centered or theme-centered deep dive."],
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"get_event_summary",
|
|
"get_event_summary",
|
|
@@ -175,7 +180,7 @@ NEWS_TOOL_CARDS = [
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"detect_emerging_topics",
|
|
"detect_emerging_topics",
|
|
|
- "Surface entities and phrases starting to matter in the recent window.",
|
|
|
|
|
|
|
+ "Surface emerging entities, thematic keywords, and phrases that are accelerating in the recent window.",
|
|
|
[
|
|
[
|
|
|
{"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
|
|
{"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
|
|
|
{"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
|
|
{"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
|
|
@@ -183,21 +188,21 @@ NEWS_TOOL_CARDS = [
|
|
|
{"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
|
|
{"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
|
|
|
],
|
|
],
|
|
|
["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
|
|
["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
|
|
|
- ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity."],
|
|
|
|
|
|
|
+ ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Signal types: entity (named entity), keyword (thematic descriptor), phrase (headline bigram). Check velocity and source_count to distinguish real spikes from noise."],
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"get_news_sentiment",
|
|
"get_news_sentiment",
|
|
|
- "Estimate sentiment around an entity over a lookback window.",
|
|
|
|
|
|
|
+ "Estimate sentiment around an entity or keyword over a lookback window.",
|
|
|
[
|
|
[
|
|
|
- {"name": "entity", "type": "string"},
|
|
|
|
|
|
|
+ {"name": "entity", "type": "string", "meaning": "entity label, phrase, or keyword to analyze"},
|
|
|
{"name": "timeframe", "type": "string", "default": "24h"},
|
|
{"name": "timeframe", "type": "string", "default": "24h"},
|
|
|
],
|
|
],
|
|
|
["entity", "sentiment", "score", "cluster_count"],
|
|
["entity", "sentiment", "score", "cluster_count"],
|
|
|
- ["Use after locating a cluster set or entity neighborhood."],
|
|
|
|
|
|
|
+ ["Matches clusters by entities and keywords. Use after locating a cluster set or entity neighborhood."],
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"get_related_recent_entities",
|
|
"get_related_recent_entities",
|
|
|
- "Blend local co-occurrence with Google Trends related topics, while preserving mids where available.",
|
|
|
|
|
|
|
+ "Find entities and thematic keywords commonly co-occurring with a subject in recent clusters, optionally blended with Google Trends suggestions.",
|
|
|
[
|
|
[
|
|
|
{"name": "subject", "type": "string", "meaning": "canonical entity or subject phrase"},
|
|
{"name": "subject", "type": "string", "meaning": "canonical entity or subject phrase"},
|
|
|
{"name": "timeframe", "type": "string", "default": "72h"},
|
|
{"name": "timeframe", "type": "string", "default": "72h"},
|
|
@@ -205,7 +210,7 @@ NEWS_TOOL_CARDS = [
|
|
|
{"name": "include_trends", "type": "boolean", "default": True},
|
|
{"name": "include_trends", "type": "boolean", "default": True},
|
|
|
],
|
|
],
|
|
|
["subject", "related[].normalized", "related[].canonical_label", "related[].mid", "related[].sources", "related[].scores"],
|
|
["subject", "related[].normalized", "related[].canonical_label", "related[].mid", "related[].sources", "related[].scores"],
|
|
|
- ["Use this to drill from a subject into related entities, then feed those into get_events_for_entity."],
|
|
|
|
|
|
|
+ ["Use this to drill from a subject into related entities and themes, then feed results into get_events_for_entity."],
|
|
|
),
|
|
),
|
|
|
]
|
|
]
|
|
|
|
|
|
|
@@ -256,7 +261,8 @@ NEWS_AGENT_TIPS = [
|
|
|
"When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
|
|
"When describing clusters, keep sources and timestamps visible so the user can assess recency and provenance.",
|
|
|
"Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
|
|
"Prefer a short chain of tools over many parallel calls unless you are building a neighborhood map or comparison table.",
|
|
|
"For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
|
|
"For tricky names, rely on the server's resolver instead of inventing alias rules in the client.",
|
|
|
- "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones.",
|
|
|
|
|
|
|
+ "Use detect_emerging_topics with timeframe=\"4h\" for what's hot right now, timeframe=\"3d\" for weekly trends. Use topic= to scope to a category, around= to find what's emerging near a specific entity. Check velocity to distinguish accelerating signals from steady-state ones. Filter by signal_type to focus on entities, keywords, or phrases.",
|
|
|
|
|
+ "Each cluster contains both entities (named entities with identity resolution) and keywords (thematic descriptors). Use keywords to understand what a story is about beyond the named entities.",
|
|
|
]
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
@@ -328,7 +334,7 @@ async def toggle_feed(feed_url: str, enabled: bool) -> dict:
|
|
|
return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
|
|
return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
|
|
|
|
|
|
|
+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
|
|
|
async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
|
|
async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
|
|
|
limit = max(1, min(int(limit), 20))
|
|
limit = max(1, min(int(limit), 20))
|
|
|
# When topic is omitted, search across all topics (no topic filter).
|
|
# When topic is omitted, search across all topics (no topic filter).
|
|
@@ -378,6 +384,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
|
|
|
"headline": c.get("headline"),
|
|
"headline": c.get("headline"),
|
|
|
"summary": c.get("summary"),
|
|
"summary": c.get("summary"),
|
|
|
"entities": c.get("entities", []),
|
|
"entities": c.get("entities", []),
|
|
|
|
|
+ "keywords": c.get("keywords", []),
|
|
|
"sentiment": c.get("sentiment", "neutral"),
|
|
"sentiment": c.get("sentiment", "neutral"),
|
|
|
"importance": c.get("importance", 0.0),
|
|
"importance": c.get("importance", 0.0),
|
|
|
"sources": c.get("sources", []),
|
|
"sources": c.get("sources", []),
|
|
@@ -401,7 +408,7 @@ async def get_latest_events(topic: str | None = None, limit: int = 5, include_ar
|
|
|
return out
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
|
|
|
|
|
|
|
+@mcp.tool(description="Investigate a person, company, place, theme, or keyword by matching entities and thematic keywords within a time window.")
|
|
|
async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
|
|
async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
|
|
|
limit = max(1, min(int(limit), 30))
|
|
limit = max(1, min(int(limit), 30))
|
|
|
query = normalize_query(entity).strip().lower()
|
|
query = normalize_query(entity).strip().lower()
|
|
@@ -440,6 +447,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
|
|
|
"headline": c.get("headline"),
|
|
"headline": c.get("headline"),
|
|
|
"summary": c.get("summary"),
|
|
"summary": c.get("summary"),
|
|
|
"entities": c.get("entities", []),
|
|
"entities": c.get("entities", []),
|
|
|
|
|
+ "keywords": c.get("keywords", []),
|
|
|
"sentiment": c.get("sentiment", "neutral"),
|
|
"sentiment": c.get("sentiment", "neutral"),
|
|
|
"importance": c.get("importance", 0.0),
|
|
"importance": c.get("importance", 0.0),
|
|
|
"sources": c.get("sources", []),
|
|
"sources": c.get("sources", []),
|
|
@@ -461,7 +469,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
|
|
|
return out
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Return entities most commonly associated with the subject in recent clusters, optionally blended with Google Trends suggestions.")
|
|
|
|
|
|
|
+@mcp.tool(description="Return entities and thematic keywords commonly co-occurring with the subject in recent clusters, optionally blended with Google Trends suggestions.")
|
|
|
async def get_related_recent_entities(subject: str, timeframe: str = "72h", limit: int = 10, include_trends: bool = True):
|
|
async def get_related_recent_entities(subject: str, timeframe: str = "72h", limit: int = 10, include_trends: bool = True):
|
|
|
limit = max(1, min(int(limit), 25))
|
|
limit = max(1, min(int(limit), 25))
|
|
|
hours = _parse_timeframe_to_hours(timeframe)
|
|
hours = _parse_timeframe_to_hours(timeframe)
|
|
@@ -547,8 +555,9 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
|
|
|
return out
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters. "
|
|
|
|
|
- "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity.")
|
|
|
|
|
|
|
+@mcp.tool(description="Explore what is starting to matter: surface emerging entities, thematic keywords, and phrases from recent clusters. "
|
|
|
|
|
+ "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity. "
|
|
|
|
|
+ "Results include signal_type (entity / keyword / phrase) for downstream filtering.")
|
|
|
async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
|
|
async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
|
|
|
"""Surface entities and phrases that are accelerating in recent clusters.
|
|
"""Surface entities and phrases that are accelerating in recent clusters.
|
|
|
|
|
|
|
@@ -631,12 +640,28 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
|
|
|
entity_cooccur: dict[str, Counter] = {}
|
|
entity_cooccur: dict[str, Counter] = {}
|
|
|
phrase_counts_recent = Counter()
|
|
phrase_counts_recent = Counter()
|
|
|
|
|
|
|
|
|
|
+ # Keyword accumulators — same scoring pipeline as entities, but tracking
|
|
|
|
|
+ # LLM-curated thematic descriptors instead of named entities.
|
|
|
|
|
+ kw_counts_recent = Counter()
|
|
|
|
|
+ kw_counts_prior = Counter()
|
|
|
|
|
+ kw_importance_recent = Counter()
|
|
|
|
|
+ kw_sources: dict[str, set] = {}
|
|
|
|
|
+ kw_buckets: dict[str, set] = {}
|
|
|
|
|
+
|
|
|
bucket_size_hours = max(1.0, hours / 6.0) # split window into ~6 buckets
|
|
bucket_size_hours = max(1.0, hours / 6.0) # split window into ~6 buckets
|
|
|
|
|
|
|
|
for c in clusters:
|
|
for c in clusters:
|
|
|
ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
|
|
ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
|
|
|
ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
|
|
ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
|
|
|
|
|
|
|
|
|
|
+ # Keywords: deduplicate per cluster so a cluster with the same keyword
|
|
|
|
|
+ # listed twice doesn't inflate counts.
|
|
|
|
|
+ kws_in_cluster = list(dict.fromkeys(
|
|
|
|
|
+ str(k).strip().lower()
|
|
|
|
|
+ for k in (c.get("keywords", []) or [])
|
|
|
|
|
+ if str(k).strip() and not _is_generic_entity(k)
|
|
|
|
|
+ ))
|
|
|
|
|
+
|
|
|
age_h = _cluster_age_hours(c)
|
|
age_h = _cluster_age_hours(c)
|
|
|
is_recent = age_h <= half_hours
|
|
is_recent = age_h <= half_hours
|
|
|
bucket_idx = int(age_h / bucket_size_hours)
|
|
bucket_idx = int(age_h / bucket_size_hours)
|
|
@@ -674,6 +699,25 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
|
|
|
except Exception:
|
|
except Exception:
|
|
|
pass
|
|
pass
|
|
|
|
|
|
|
|
|
|
+ # --- keyword counting (same recent/prior split as entities) ---
|
|
|
|
|
+ kw_counts = kw_counts_recent if is_recent else kw_counts_prior
|
|
|
|
|
+ kw_imp_acc = kw_importance_recent if is_recent else None
|
|
|
|
|
+ for kw in kws_in_cluster:
|
|
|
|
|
+ kw_counts[kw] += 1
|
|
|
|
|
+ if kw not in kw_sources:
|
|
|
|
|
+ kw_sources[kw] = set()
|
|
|
|
|
+ src = c.get("source") or c.get("headline", "").split(" - ")[-1] if c.get("headline") else ""
|
|
|
|
|
+ if src:
|
|
|
|
|
+ kw_sources[kw].add(str(src))
|
|
|
|
|
+ if kw not in kw_buckets:
|
|
|
|
|
+ kw_buckets[kw] = set()
|
|
|
|
|
+ kw_buckets[kw].add(bucket_idx)
|
|
|
|
|
+ if kw_imp_acc is not None:
|
|
|
|
|
+ try:
|
|
|
|
|
+ kw_imp_acc[kw] += float(c.get("importance", 0.0) or 0.0) # type: ignore[assignment]
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
# co-occurrence (only for clusters matching the around filter, if any)
|
|
# co-occurrence (only for clusters matching the around filter, if any)
|
|
|
for i in range(len(ents_norm)):
|
|
for i in range(len(ents_norm)):
|
|
|
a = ents_norm[i]
|
|
a = ents_norm[i]
|
|
@@ -753,13 +797,67 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
|
|
|
"signal_type": "entity",
|
|
"signal_type": "entity",
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
|
|
+ # --- score keywords (same velocity/recency/source/sustained/importance formula) ---
|
|
|
|
|
+ all_keywords = set(kw_counts_recent.keys()) | set(kw_counts_prior.keys())
|
|
|
|
|
+ kw_scored = []
|
|
|
|
|
+
|
|
|
|
|
+ for kw in all_keywords:
|
|
|
|
|
+ # Skip keywords that are already scored as entities — entity signal is
|
|
|
|
|
+ # higher quality (proper nouns, resolved identities).
|
|
|
|
|
+ if kw in all_entities:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ recent_n = kw_counts_recent.get(kw, 0)
|
|
|
|
|
+ prior_n = kw_counts_prior.get(kw, 0)
|
|
|
|
|
+ total_n = recent_n + prior_n
|
|
|
|
|
+
|
|
|
|
|
+ if total_n < 1:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ velocity = (recent_n + 0.5) / (prior_n + 0.5)
|
|
|
|
|
+ recency_ratio = recent_n / total_n
|
|
|
|
|
+ n_sources = len(kw_sources.get(kw, set()))
|
|
|
|
|
+ n_buckets = len(kw_buckets.get(kw, set()))
|
|
|
|
|
+ avg_imp = (kw_importance_recent.get(kw, 0.0) / max(1, recent_n)) if recent_n > 0 else 0.0
|
|
|
|
|
+
|
|
|
|
|
+ composed_score = (
|
|
|
|
|
+ 0.35 * min(1.0, math.log1p(velocity) / math.log1p(4.0)) +
|
|
|
|
|
+ 0.25 * recency_ratio +
|
|
|
|
|
+ 0.15 * min(1.0, n_sources / 5.0) +
|
|
|
|
|
+ 0.10 * min(1.0, n_buckets / 4.0) +
|
|
|
|
|
+ 0.15 * min(1.0, avg_imp)
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ kw_scored.append({
|
|
|
|
|
+ "topic": kw,
|
|
|
|
|
+ "trend_score": min(0.99, round(composed_score, 3)),
|
|
|
|
|
+ "related_entities": [],
|
|
|
|
|
+ "velocity": round(velocity, 2),
|
|
|
|
|
+ "recent_count": recent_n,
|
|
|
|
|
+ "prior_count": prior_n,
|
|
|
|
|
+ "source_count": n_sources,
|
|
|
|
|
+ "avg_importance": round(avg_imp, 3),
|
|
|
|
|
+ "signal_type": "keyword",
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ # sort keywords by score descending
|
|
|
|
|
+ kw_scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
|
|
|
|
|
+
|
|
|
# sort by composed score descending
|
|
# sort by composed score descending
|
|
|
scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
|
|
scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
|
|
|
|
|
|
|
|
- # --- add phrase signals (only from recent window) ---
|
|
|
|
|
|
|
+ # --- merge: entities first, then keywords, then phrases ---
|
|
|
emerging = list(scored) # start with entities
|
|
emerging = list(scored) # start with entities
|
|
|
|
|
+ seen_topics = {item["topic"] for item in emerging}
|
|
|
|
|
+
|
|
|
|
|
+ for kw_item in kw_scored:
|
|
|
|
|
+ if kw_item["topic"] not in seen_topics:
|
|
|
|
|
+ emerging.append(kw_item)
|
|
|
|
|
+ seen_topics.add(kw_item["topic"])
|
|
|
|
|
+
|
|
|
|
|
+ # --- add phrase signals (only from recent window) ---
|
|
|
for phrase, count in phrase_counts_recent.most_common(limit * 2):
|
|
for phrase, count in phrase_counts_recent.most_common(limit * 2):
|
|
|
- if any(item["topic"] == phrase for item in emerging):
|
|
|
|
|
|
|
+ if phrase in seen_topics:
|
|
|
continue
|
|
continue
|
|
|
emerging.append({
|
|
emerging.append({
|
|
|
"topic": phrase.title(),
|
|
"topic": phrase.title(),
|
|
@@ -772,13 +870,15 @@ async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic:
|
|
|
"avg_importance": 0.0,
|
|
"avg_importance": 0.0,
|
|
|
"signal_type": "phrase",
|
|
"signal_type": "phrase",
|
|
|
})
|
|
})
|
|
|
|
|
+ seen_topics.add(phrase)
|
|
|
if len(emerging) >= limit:
|
|
if len(emerging) >= limit:
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
return emerging[:limit]
|
|
return emerging[:limit]
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
|
|
|
|
|
|
|
+@mcp.tool(description="Investigate whether sentiment around an entity or keyword is positive, negative, or neutral over a chosen lookback window. "
|
|
|
|
|
+ "Matches clusters by both named entities and thematic keywords.")
|
|
|
async def get_news_sentiment(entity: str, timeframe: str = "24h"):
|
|
async def get_news_sentiment(entity: str, timeframe: str = "24h"):
|
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
|
|
|
|
|
@@ -855,11 +955,12 @@ async def get_capabilities():
|
|
|
return {
|
|
return {
|
|
|
"server": {
|
|
"server": {
|
|
|
"name": "news-mcp",
|
|
"name": "news-mcp",
|
|
|
- "purpose": "Recent news clusters, entity drill-down, sentiment, emerging topics, and related-entity expansion.",
|
|
|
|
|
|
|
+ "purpose": "Recent news clusters with entities and thematic keywords, entity/keyword drill-down, sentiment, emerging topics, and related-entity expansion.",
|
|
|
"output_conventions": {
|
|
"output_conventions": {
|
|
|
"cluster_ids": "Do not surface cluster_id in user-facing prose unless explicitly requested; treat it as internal navigation metadata.",
|
|
"cluster_ids": "Do not surface cluster_id in user-facing prose unless explicitly requested; treat it as internal navigation metadata.",
|
|
|
"sources": "Always preserve and display sources when summarizing a cluster or entity result.",
|
|
"sources": "Always preserve and display sources when summarizing a cluster or entity result.",
|
|
|
"timestamps": "Mention timestamps consistently when comparing multiple clusters or when recency matters.",
|
|
"timestamps": "Mention timestamps consistently when comparing multiple clusters or when recency matters.",
|
|
|
|
|
+ "clusters": "Each cluster includes entities (named entities with optional MID/canonical_label) and keywords (thematic descriptors). Both are searchable; entities are higher-signal, keywords capture subject-matter themes.",
|
|
|
},
|
|
},
|
|
|
},
|
|
},
|
|
|
"tools": NEWS_TOOL_CARDS,
|
|
"tools": NEWS_TOOL_CARDS,
|
|
@@ -867,10 +968,11 @@ async def get_capabilities():
|
|
|
"example_chains": NEWS_EXAMPLE_CHAINS,
|
|
"example_chains": NEWS_EXAMPLE_CHAINS,
|
|
|
"agent_tips": NEWS_AGENT_TIPS,
|
|
"agent_tips": NEWS_AGENT_TIPS,
|
|
|
"guidance": [
|
|
"guidance": [
|
|
|
- "Use get_latest_events for a tail, get_events_for_entity for entity deep dives, and get_related_recent_entities for neighborhood expansion.",
|
|
|
|
|
|
|
+ "Use get_latest_events for a tail, get_events_for_entity for entity/keyword deep dives, and get_related_recent_entities for neighborhood expansion.",
|
|
|
"Prefer normalized/canonical entities when possible, but the server will resolve common aliases and MIDs for you.",
|
|
"Prefer normalized/canonical entities when possible, but the server will resolve common aliases and MIDs for you.",
|
|
|
"When presenting results to users, summarize the cluster; avoid exposing internal IDs unless they are needed for follow-up tool calls.",
|
|
"When presenting results to users, summarize the cluster; avoid exposing internal IDs unless they are needed for follow-up tool calls.",
|
|
|
- "For emerging topics, use detect_emerging_topics with timeframe and around parameters to scope your query. High velocity + high source_count = strong emerging signal.",
|
|
|
|
|
|
|
+ "For emerging topics, use detect_emerging_topics with timeframe and around parameters. Signal types: entity (named entity, highest quality), keyword (thematic descriptor), phrase (headline bigram). High velocity + high source_count = strong signal.",
|
|
|
|
|
+ "get_events_for_entity and get_news_sentiment match both entities and thematic keywords — use keywords when the subject is a theme rather than a named entity.",
|
|
|
],
|
|
],
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -1064,6 +1166,19 @@ def api_entities(
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
return _api_err(e, f"entities(hours={hours})")
|
|
return _api_err(e, f"entities(hours={hours})")
|
|
|
|
|
|
|
|
|
|
@app.get("/api/v1/keywords")
def api_keywords(
    hours: int = 24,
    limit: int = 30,
):
    """Top keyword frequencies (thematic descriptors, excluding terms already counted as entities)."""
    # Building the dashboard store and running the query both happen inside
    # the try, so a failure in either is reported through _api_err, labelled
    # with the requested lookback window.
    try:
        frequencies = DashboardStore(_shared_store).get_keyword_frequencies(
            hours=hours,
            limit=limit,
        )
    except Exception as exc:
        return _api_err(exc, f"keywords(hours={hours})")
    else:
        # Echo the requested window back alongside the frequency rows.
        return {"keywords": frequencies, "hours": hours}
|
|
|
|
|
+
|
|
|
@app.get("/api/v1/cluster/{cluster_id}")
|
|
@app.get("/api/v1/cluster/{cluster_id}")
|
|
|
def api_cluster_detail(cluster_id: str):
|
|
def api_cluster_detail(cluster_id: str):
|
|
|
"""Full cluster detail for drill-down."""
|
|
"""Full cluster detail for drill-down."""
|