|
@@ -2,6 +2,8 @@ from __future__ import annotations
|
|
|
|
|
|
|
|
import asyncio
|
|
import asyncio
|
|
|
import logging
|
|
import logging
|
|
|
|
|
+import math
|
|
|
|
|
+import re
|
|
|
import time
|
|
import time
|
|
|
from collections import Counter
|
|
from collections import Counter
|
|
|
from datetime import datetime, timezone
|
|
from datetime import datetime, timezone
|
|
@@ -157,9 +159,14 @@ NEWS_TOOL_CARDS = [
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"detect_emerging_topics",
|
|
"detect_emerging_topics",
|
|
|
"Surface entities and phrases starting to matter in the recent window.",
|
|
"Surface entities and phrases starting to matter in the recent window.",
|
|
|
- [{"name": "limit", "type": "integer", "default": 10, "range": "1-20"}],
|
|
|
|
|
- ["topic", "trend_score", "related_entities", "signal_type", "count", "avg_importance"],
|
|
|
|
|
- ["Good for 'what is heating up?' style questions."],
|
|
|
|
|
|
|
+ [
|
|
|
|
|
+ {"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
|
|
|
|
|
+ {"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
|
|
|
|
|
+ {"name": "topic", "type": "string", "default": "all topics", "examples": ["crypto", "macro", "regulation", "ai", "other"]},
|
|
|
|
|
+ {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
|
|
|
|
|
+ ],
|
|
|
|
|
+ ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
|
|
|
|
|
+ ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity."],
|
|
|
),
|
|
),
|
|
|
_tool_card(
|
|
_tool_card(
|
|
|
"get_news_sentiment",
|
|
"get_news_sentiment",
|
|
@@ -220,7 +227,7 @@ NEWS_COMPOSITION_RECIPES = [
|
|
|
"get_events_for_entity(entity=...)",
|
|
"get_events_for_entity(entity=...)",
|
|
|
"get_news_sentiment(entity=...)",
|
|
"get_news_sentiment(entity=...)",
|
|
|
],
|
|
],
|
|
|
- "notes": ["Good for trend scouting and risk mapping."],
|
|
|
|
|
|
|
+ "notes": ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Good for trend scouting and risk mapping."],
|
|
|
},
|
|
},
|
|
|
]
|
|
]
|
|
|
|
|
|
|
@@ -515,98 +522,230 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
|
|
|
return out
|
|
return out
|
|
|
|
|
|
|
|
|
|
|
|
|
-@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters.")
|
|
|
|
|
-async def detect_emerging_topics(limit: int = 10):
|
|
|
|
|
|
|
+@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters. "
|
|
|
|
|
+ "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity.")
|
|
|
|
|
+async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
|
|
|
|
|
+ """Surface entities and phrases that are accelerating in recent clusters.
|
|
|
|
|
+
|
|
|
|
|
+ Args:
|
|
|
|
|
+ limit: max results to return (1-20, default 10).
|
|
|
|
|
+ timeframe: lookback window like "4h", "24h", "3d" (default "24h").
|
|
|
|
|
+ topic: optional coarse topic filter ("crypto", "macro", "regulation", "ai", "other").
|
|
|
|
|
+ around: optional entity — only return entities that co-occur with this entity
|
|
|
|
|
+ in the recent window (e.g. "Bitcoin" to find what's emerging in Bitcoin's neighborhood).
|
|
|
|
|
+ """
|
|
|
limit = max(1, min(int(limit), 20))
|
|
limit = max(1, min(int(limit), 20))
|
|
|
|
|
+ hours = _parse_timeframe_to_hours(timeframe)
|
|
|
|
|
+ half_hours = hours / 2.0
|
|
|
|
|
+
|
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
store = SQLiteClusterStore(DB_PATH)
|
|
|
- clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=200)
|
|
|
|
|
|
|
+ # Fetch more clusters than needed so velocity stats are meaningful even for short windows.
|
|
|
|
|
+ clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=500)
|
|
|
|
|
|
|
|
- import re
|
|
|
|
|
|
|
+ # --- optional topic filter ---
|
|
|
|
|
+ if topic:
|
|
|
|
|
+ topic_norm = normalize_query(topic).strip().lower()
|
|
|
|
|
+ if topic_norm:
|
|
|
|
|
+ clusters = [c for c in clusters if (c.get("topic") or "other").strip().lower() == topic_norm]
|
|
|
|
|
+
|
|
|
|
|
+ # --- resolve the 'around' entity ---
|
|
|
|
|
+ around_terms: set[str] = set()
|
|
|
|
|
+ if around:
|
|
|
|
|
+ around_norm = normalize_query(around).strip().lower()
|
|
|
|
|
+ if around_norm:
|
|
|
|
|
+ resolved = resolve_entity_via_trends(around_norm)
|
|
|
|
|
+ around_terms = {
|
|
|
|
|
+ around_norm,
|
|
|
|
|
+ str(resolved.get("normalized") or "").strip().lower(),
|
|
|
|
|
+ str(resolved.get("canonical_label") or "").strip().lower(),
|
|
|
|
|
+ }
|
|
|
|
|
+ around_terms.discard("")
|
|
|
|
|
+
|
|
|
|
|
+ # split clusters into first-half vs second-half by timestamp
|
|
|
|
|
+ # clusters are already sorted most-recent-first from the store
|
|
|
|
|
+ now = datetime.now(timezone.utc)
|
|
|
|
|
+
|
|
|
|
|
+ def _cluster_age_hours(c: dict) -> float:
|
|
|
|
|
+ """Return the cluster's age in hours (approximate, from now)."""
|
|
|
|
|
+ ts = c.get("timestamp") or c.get("last_updated")
|
|
|
|
|
+ if not ts:
|
|
|
|
|
+ return 0.0 # treat un-dated as fresh
|
|
|
|
|
+ try:
|
|
|
|
|
+ s = str(ts).replace("Z", "+00:00")
|
|
|
|
|
+ dt = datetime.fromisoformat(s)
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return max(0.0, (now - dt.astimezone(timezone.utc)).total_seconds() / 3600.0)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ try:
|
|
|
|
|
+ dt = parsedate_to_datetime(str(ts))
|
|
|
|
|
+ if dt.tzinfo is None:
|
|
|
|
|
+ dt = dt.replace(tzinfo=timezone.utc)
|
|
|
|
|
+ return max(0.0, (now - dt.astimezone(timezone.utc)).total_seconds() / 3600.0)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ return 0.0
|
|
|
|
|
|
|
|
- entity_counts = Counter()
|
|
|
|
|
- entity_importance_sum = Counter()
|
|
|
|
|
- # co-occurrence: ent -> other_ent -> count
|
|
|
|
|
- entity_cooccur = {}
|
|
|
|
|
- phrase_counts = Counter()
|
|
|
|
|
- topic_counts = Counter()
|
|
|
|
|
|
|
+ # Generic entity filter
|
|
|
|
|
+ _generic_tokens = {"news", "latest", "breaking", "update", "updates", "report", "reports"}
|
|
|
|
|
|
|
|
- # Very light heuristics to reduce “meta entities” dominating emerging topics.
|
|
|
|
|
- # Keep it conservative: only skip obvious boilerplate.
|
|
|
|
|
def _is_generic_entity(ent: str) -> bool:
|
|
def _is_generic_entity(ent: str) -> bool:
|
|
|
e = str(ent).strip().lower()
|
|
e = str(ent).strip().lower()
|
|
|
- if not e:
|
|
|
|
|
|
|
+ if not e or len(e) < 4:
|
|
|
return True
|
|
return True
|
|
|
- if len(e) < 4:
|
|
|
|
|
- return True
|
|
|
|
|
- # common outlet-ish / meta-ish tokens
|
|
|
|
|
- if e in {"news", "latest", "breaking"}:
|
|
|
|
|
|
|
+ if e in _generic_tokens:
|
|
|
return True
|
|
return True
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
+ # --- accumulate signals ---
|
|
|
|
|
+ # recent = second half of timeframe (newer), prior = first half (older)
|
|
|
|
|
+ entity_counts_recent = Counter()
|
|
|
|
|
+ entity_counts_prior = Counter()
|
|
|
|
|
+ entity_importance_recent = Counter()
|
|
|
|
|
+ entity_sources: dict[str, set] = {} # ent -> set of source names
|
|
|
|
|
+ entity_buckets: dict[str, set] = {} # ent -> set of time-bucket indices (for sustained-spike detection)
|
|
|
|
|
+ entity_cooccur: dict[str, Counter] = {}
|
|
|
|
|
+ phrase_counts_recent = Counter()
|
|
|
|
|
+
|
|
|
|
|
+ bucket_size_hours = max(1.0, hours / 6.0) # split window into ~6 buckets
|
|
|
|
|
+
|
|
|
for c in clusters:
|
|
for c in clusters:
|
|
|
- topic_counts[c.get("topic", "other")] += 1
|
|
|
|
|
ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
|
|
ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
|
|
|
- ents_in_cluster_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
|
|
|
|
|
- for ent in ents_in_cluster_norm:
|
|
|
|
|
- if _is_generic_entity(ent):
|
|
|
|
|
|
|
+ ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
|
|
|
|
|
+
|
|
|
|
|
+ age_h = _cluster_age_hours(c)
|
|
|
|
|
+ is_recent = age_h <= half_hours
|
|
|
|
|
+ bucket_idx = int(age_h / bucket_size_hours)
|
|
|
|
|
+
|
|
|
|
|
+ # --- around filter: only count clusters that mention the target entity ---
|
|
|
|
|
+ if around_terms:
|
|
|
|
|
+ haystack = set(ents_norm)
|
|
|
|
|
+ for res in c.get("entityResolutions", []) or []:
|
|
|
|
|
+ if isinstance(res, dict):
|
|
|
|
|
+ for key in ("normalized", "canonical_label"):
|
|
|
|
|
+ val = res.get(key)
|
|
|
|
|
+ if val:
|
|
|
|
|
+ haystack.add(str(val).strip().lower())
|
|
|
|
|
+ if not (haystack & around_terms):
|
|
|
continue
|
|
continue
|
|
|
- entity_counts[ent] += 1
|
|
|
|
|
- try:
|
|
|
|
|
- entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
|
|
|
|
|
- except Exception:
|
|
|
|
|
- pass
|
|
|
|
|
|
|
|
|
|
- # update co-occurrence counts
|
|
|
|
|
- for i in range(len(ents_in_cluster_norm)):
|
|
|
|
|
- a = ents_in_cluster_norm[i]
|
|
|
|
|
- if not a:
|
|
|
|
|
|
|
+ counts = entity_counts_recent if is_recent else entity_counts_prior
|
|
|
|
|
+ imp_acc = entity_importance_recent if is_recent else None # only importance from recent window
|
|
|
|
|
+
|
|
|
|
|
+ for ent in ents_norm:
|
|
|
|
|
+ if _is_generic_entity(ent):
|
|
|
|
|
+ continue
|
|
|
|
|
+ counts[ent] += 1
|
|
|
|
|
+ if ent not in entity_sources:
|
|
|
|
|
+ entity_sources[ent] = set()
|
|
|
|
|
+ src = c.get("source") or c.get("headline", "").split(" - ")[-1] if c.get("headline") else ""
|
|
|
|
|
+ if src:
|
|
|
|
|
+ entity_sources[ent].add(str(src))
|
|
|
|
|
+ if ent not in entity_buckets:
|
|
|
|
|
+ entity_buckets[ent] = set()
|
|
|
|
|
+ entity_buckets[ent].add(bucket_idx)
|
|
|
|
|
+ if imp_acc is not None:
|
|
|
|
|
+ try:
|
|
|
|
|
+ imp_acc[ent] += float(c.get("importance", 0.0) or 0.0)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ # co-occurrence (only for clusters matching the around filter, if any)
|
|
|
|
|
+ for i in range(len(ents_norm)):
|
|
|
|
|
+ a = ents_norm[i]
|
|
|
|
|
+ if _is_generic_entity(a):
|
|
|
continue
|
|
continue
|
|
|
- entity_cooccur.setdefault(a, Counter())
|
|
|
|
|
- for j in range(len(ents_in_cluster_norm)):
|
|
|
|
|
|
|
+ if a not in entity_cooccur:
|
|
|
|
|
+ entity_cooccur[a] = Counter()
|
|
|
|
|
+ for j in range(len(ents_norm)):
|
|
|
if i == j:
|
|
if i == j:
|
|
|
continue
|
|
continue
|
|
|
- b = ents_in_cluster_norm[j]
|
|
|
|
|
- if not b:
|
|
|
|
|
|
|
+ b = ents_norm[j]
|
|
|
|
|
+ if _is_generic_entity(b):
|
|
|
continue
|
|
continue
|
|
|
entity_cooccur[a][b] += 1
|
|
entity_cooccur[a][b] += 1
|
|
|
|
|
|
|
|
- text = f"{c.get('headline','')} {c.get('summary','')}"
|
|
|
|
|
- words = [w for w in re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())]
|
|
|
|
|
- for i in range(len(words) - 1):
|
|
|
|
|
- phrase = f"{words[i]} {words[i+1]}"
|
|
|
|
|
- if len(phrase) > 6:
|
|
|
|
|
- phrase_counts[phrase] += 1
|
|
|
|
|
-
|
|
|
|
|
- emerging = []
|
|
|
|
|
- # Combine frequency with average importance so “big signal” rises over pure repetition.
|
|
|
|
|
- for ent, count in entity_counts.most_common(limit):
|
|
|
|
|
- avg_imp = entity_importance_sum[ent] / max(1, count)
|
|
|
|
|
- # avg_imp is typically 0..~1; keep score bounded.
|
|
|
|
|
- trend_score = 0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count))
|
|
|
|
|
|
|
+ # bigram phrases (recent only)
|
|
|
|
|
+ if is_recent:
|
|
|
|
|
+ text = f"{c.get('headline', '')} {c.get('summary', '')}"
|
|
|
|
|
+ words = re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())
|
|
|
|
|
+ for i in range(len(words) - 1):
|
|
|
|
|
+ phrase = f"{words[i]} {words[i+1]}"
|
|
|
|
|
+ if len(phrase) > 6:
|
|
|
|
|
+ phrase_counts_recent[phrase] += 1
|
|
|
|
|
+
|
|
|
|
|
+ # --- score entities ---
|
|
|
|
|
+ all_entities = set(entity_counts_recent.keys()) | set(entity_counts_prior.keys())
|
|
|
|
|
+ scored = []
|
|
|
|
|
+
|
|
|
|
|
+ for ent in all_entities:
|
|
|
|
|
+ recent_n = entity_counts_recent.get(ent, 0)
|
|
|
|
|
+ prior_n = entity_counts_prior.get(ent, 0)
|
|
|
|
|
+ total_n = recent_n + prior_n
|
|
|
|
|
+
|
|
|
|
|
+ if total_n < 1:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # velocity: ratio of recent vs prior (smoothed to avoid division noise)
|
|
|
|
|
+ # 0 prior → velocity = recent_n (pure emergence)
|
|
|
|
|
+ # equal → velocity = 1.0 (steady)
|
|
|
|
|
+ velocity = (recent_n + 0.5) / (prior_n + 0.5)
|
|
|
|
|
+
|
|
|
|
|
+ # recency weight: what fraction of total hits are in the recent window
|
|
|
|
|
+ recency_ratio = recent_n / total_n
|
|
|
|
|
+
|
|
|
|
|
+ # source diversity: how many distinct outlets
|
|
|
|
|
+ n_sources = len(entity_sources.get(ent, set()))
|
|
|
|
|
+
|
|
|
|
|
+ # sustained: how many distinct time buckets did it appear in (max ~6)
|
|
|
|
|
+ n_buckets = len(entity_buckets.get(ent, set()))
|
|
|
|
|
+
|
|
|
|
|
+ # average importance (recent window only)
|
|
|
|
|
+ avg_imp = (entity_importance_recent.get(ent, 0.0) / max(1, recent_n)) if recent_n > 0 else 0.0
|
|
|
|
|
+
|
|
|
|
|
+ composed_score = (
|
|
|
|
|
+ 0.35 * min(1.0, math.log1p(velocity) / math.log1p(4.0)) + # velocity (0..1, 4x = max)
|
|
|
|
|
+ 0.25 * recency_ratio + # recency concentration
|
|
|
|
|
+ 0.15 * min(1.0, n_sources / 5.0) + # source diversity
|
|
|
|
|
+ 0.10 * min(1.0, n_buckets / 4.0) + # sustained (>1 bucket)
|
|
|
|
|
+ 0.15 * min(1.0, avg_imp) # importance
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
related = []
|
|
related = []
|
|
|
- for other, _cnt in (entity_cooccur.get(ent) or Counter()).most_common(3):
|
|
|
|
|
- # avoid returning the entity itself (shouldn't happen, but be safe)
|
|
|
|
|
- if other != ent:
|
|
|
|
|
- related.append(other)
|
|
|
|
|
|
|
+ if ent in entity_cooccur:
|
|
|
|
|
+ for other, _cnt in entity_cooccur[ent].most_common(5):
|
|
|
|
|
+ if other != ent:
|
|
|
|
|
+ related.append(other)
|
|
|
|
|
|
|
|
- emerging.append({
|
|
|
|
|
|
|
+ scored.append({
|
|
|
"topic": ent,
|
|
"topic": ent,
|
|
|
- "trend_score": min(0.99, round(trend_score, 2)),
|
|
|
|
|
- "related_entities": related if related else [ent],
|
|
|
|
|
- "signal_type": "entity",
|
|
|
|
|
- "count": count,
|
|
|
|
|
|
|
+ "trend_score": min(0.99, round(composed_score, 3)),
|
|
|
|
|
+ "related_entities": related[:3] if related else [ent],
|
|
|
|
|
+ "velocity": round(velocity, 2),
|
|
|
|
|
+ "recent_count": recent_n,
|
|
|
|
|
+ "prior_count": prior_n,
|
|
|
|
|
+ "source_count": n_sources,
|
|
|
"avg_importance": round(avg_imp, 3),
|
|
"avg_importance": round(avg_imp, 3),
|
|
|
|
|
+ "signal_type": "entity",
|
|
|
})
|
|
})
|
|
|
|
|
|
|
|
- for phrase, count in phrase_counts.most_common(limit * 2):
|
|
|
|
|
|
|
+ # sort by composed score descending
|
|
|
|
|
+ scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
|
|
|
|
|
+
|
|
|
|
|
+ # --- add phrase signals (only from recent window) ---
|
|
|
|
|
+ emerging = list(scored) # start with entities
|
|
|
|
|
+ for phrase, count in phrase_counts_recent.most_common(limit * 2):
|
|
|
if any(item["topic"] == phrase for item in emerging):
|
|
if any(item["topic"] == phrase for item in emerging):
|
|
|
continue
|
|
continue
|
|
|
emerging.append({
|
|
emerging.append({
|
|
|
"topic": phrase.title(),
|
|
"topic": phrase.title(),
|
|
|
- "trend_score": min(0.99, round(0.20 + 0.10 * count, 2)),
|
|
|
|
|
|
|
+ "trend_score": min(0.99, round(0.30 + 0.15 * min(count, 5), 2)),
|
|
|
"related_entities": [],
|
|
"related_entities": [],
|
|
|
|
|
+ "velocity": None,
|
|
|
|
|
+ "recent_count": count,
|
|
|
|
|
+ "prior_count": 0,
|
|
|
|
|
+ "source_count": 0,
|
|
|
|
|
+ "avg_importance": 0.0,
|
|
|
"signal_type": "phrase",
|
|
"signal_type": "phrase",
|
|
|
- "count": count,
|
|
|
|
|
})
|
|
})
|
|
|
if len(emerging) >= limit:
|
|
if len(emerging) >= limit:
|
|
|
break
|
|
break
|