1 týždeň pred · 763a686ed0
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -3,6 +3,13 @@
 
				 ## v0.3.1 — stable cluster IDs, cross-cycle merge, orphan dedup, multi-article signals
			
 
				 
			
 
				 ### Highlights
			
 
				+- **Emerging topics rewrite** (`detect_emerging_topics`): complete rewrite with 6 new capabilities:
			
 
				+  - **Timeframe parameter** (`"4h"`, `"24h"`, `"3d"`, etc.) — controls lookback window instead of always using `DEFAULT_LOOKBACK_HOURS`
			
 
				+  - **Velocity scoring** — splits the window into recent vs prior half, computes `velocity = (recent + 0.5) / (prior + 0.5)`. Entities accelerating now vs before score much higher than steady-state ones
			
 
				+  - **Composed trend score** — replaces the flat `0.25 + 0.40*imp + 0.08*count` formula with a weighted combination of: velocity (35%), recency concentration (25%), source diversity (15%), sustained presence across time buckets (10%), importance (15%)
			
 
				+  - **Topic scoping** — optional `topic` parameter filters to a specific category before scoring
			
 
				+  - **Entity neighborhood scoping** — optional `around` parameter only returns entities co-occurring with the specified entity (e.g. `around="Bitcoin"` finds what's emerging in Bitcoin's neighborhood)
			
 
				+  - **Richer output** — each result now includes `velocity`, `recent_count`, `prior_count`, `source_count` alongside `trend_score` and `related_entities`
			
 
				 - **Multi-article signal comparison**: `_signals()` now compares a new article against ALL articles in a candidate cluster (not just the seed). The best title and jaccard scores across all cluster members are used for matching.
			
 
				 - **Stable cluster IDs**: `cluster_id = sha1(topic | min_article_key)` instead of `sha1(topic | seed_title)`. The same set of articles always maps to the same ID regardless of processing order. This eliminates duplicate clusters for the same event.
			
 
				 - **Cross-cycle merge**: the poller loads recent clusters from the DB (controlled by `NEWS_CLUSTER_MAX_AGE_HOURS`, default 4h) and seeds them as merge targets before clustering. New articles in poll N+1 can merge into clusters created in poll N.
			
@@ -15,6 +22,7 @@
 
				 - No database schema changes.
			
 
				 - Existing cluster IDs will change format on the next polling cycle (old rows are updated in-place via `ON CONFLICT(cluster_id)` once the new ID is computed). Transient enrichment cache misses may occur for one cycle.
			
 
				 - Old duplicate clusters (same event, different IDs) will age out via pruning. To clean them immediately, run the article dedup cleanup script.
			
 
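				+- `detect_emerging_topics` output shape changed: `count` is replaced by `recent_count` + `prior_count`, and the new fields `velocity` and `source_count` are added. Clients using the old `count` field need to switch to `recent_count` (hits in the newer half of the window only); for entity signals, `recent_count + prior_count` is the closest equivalent of the old whole-window `count`.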
			
 
				 
			
 
				 ## v0.3.0 — concurrent polling, enrichment retry, all-topics default
			
 
				 
			
--- a/news_mcp/mcp_server_fastmcp.py
+++ b/news_mcp/mcp_server_fastmcp.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
				 
			
 
				 import asyncio
			
 
				 import logging
			
 
				+import math
			
 
				+import re
			
 
				 import time
			
 
				 from collections import Counter
			
 
				 from datetime import datetime, timezone
			
@@ -157,9 +159,14 @@ NEWS_TOOL_CARDS = [
 
				     _tool_card(
			
 
				         "detect_emerging_topics",
			
 
				         "Surface entities and phrases starting to matter in the recent window.",
			
 
				-        [{"name": "limit", "type": "integer", "default": 10, "range": "1-20"}],
			
 
				-        ["topic", "trend_score", "related_entities", "signal_type", "count", "avg_importance"],
			
 
				-        ["Good for 'what is heating up?' style questions."],
			
 
				+        [
			
 
				+            {"name": "limit", "type": "integer", "default": 10, "range": "1-20"},
			
 
				+            {"name": "timeframe", "type": "string", "default": "24h", "examples": ["4h", "24h", "3d"]},
			
 
				+            {"name": "topic", "type": "string", "default": "all topics", "examples": ["crypto", "macro", "regulation", "ai", "other"]},
			
 
				+            {"name": "around", "type": "string", "default": "none", "meaning": "entity to scope results to its neighborhood (e.g. \"Bitcoin\")"},
			
 
				+        ],
			
 
				+        ["topic", "trend_score", "velocity", "recent_count", "prior_count", "source_count", "related_entities", "signal_type"],
			
 
				+        ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity."],
			
 
				     ),
			
 
				     _tool_card(
			
 
				         "get_news_sentiment",
			
@@ -220,7 +227,7 @@ NEWS_COMPOSITION_RECIPES = [
 
				             "get_events_for_entity(entity=...)",
			
 
				             "get_news_sentiment(entity=...)",
			
 
				         ],
			
 
				-        "notes": ["Good for trend scouting and risk mapping."],
			
 
				+        "notes": ["Use timeframe to control lookback, topic to scope to a category, around to find what's emerging near a specific entity. Good for trend scouting and risk mapping."],
			
 
				     },
			
 
				 ]
			
 
				 
			
@@ -515,98 +522,230 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
 
				     return out
			
 
				 
			
 
				 
			
 
				-@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters.")
			
 
				-async def detect_emerging_topics(limit: int = 10):
			
 
				+@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters. "
			
 
				+           "Use timeframe to control the lookback window, topic to scope to a category, and around to find what's emerging near a specific entity.")
			
 
				+async def detect_emerging_topics(limit: int = 10, timeframe: str = "24h", topic: str | None = None, around: str | None = None):
			
 
				+    """Surface entities and phrases that are accelerating in recent clusters.
			
 
				+
			
 
				+    Args:
			
 
				+        limit: max results to return (1-20, default 10).
			
 
				+        timeframe: lookback window like "4h", "24h", "3d" (default "24h").
			
 
				+        topic: optional coarse topic filter ("crypto", "macro", "regulation", "ai", "other").
			
 
				+        around: optional entity — only return entities that co-occur with this entity
			
 
				+                 in the recent window (e.g. "Bitcoin" to find what's emerging in Bitcoin's neighborhood).
			
 
				+    """
			
 
				     limit = max(1, min(int(limit), 20))
			
 
				+    hours = _parse_timeframe_to_hours(timeframe)
			
 
				+    half_hours = hours / 2.0
			
 
				+
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
 
				-    clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=200)
			
 
				+    # Fetch more clusters than needed so velocity stats are meaningful even for short windows.
			
 
				+    clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=500)
			
 
				 
			
 
				-    import re
			
 
				+    # --- optional topic filter ---
			
 
				+    if topic:
			
 
				+        topic_norm = normalize_query(topic).strip().lower()
			
 
				+        if topic_norm:
			
 
				+            clusters = [c for c in clusters if (c.get("topic") or "other").strip().lower() == topic_norm]
			
 
				+
			
 
				+    # --- resolve the 'around' entity ---
			
 
				+    around_terms: set[str] = set()
			
 
				+    if around:
			
 
				+        around_norm = normalize_query(around).strip().lower()
			
 
				+        if around_norm:
			
 
				+            resolved = resolve_entity_via_trends(around_norm)
			
 
				+            around_terms = {
			
 
				+                around_norm,
			
 
				+                str(resolved.get("normalized") or "").strip().lower(),
			
 
				+                str(resolved.get("canonical_label") or "").strip().lower(),
			
 
				+            }
			
 
				+            around_terms.discard("")
			
 
				+
			
 
				+    # split clusters into first-half vs second-half by timestamp
			
 
				+    # clusters are already sorted most-recent-first from the store
			
 
				+    now = datetime.now(timezone.utc)
			
 
				+
			
 
				+    def _cluster_age_hours(c: dict) -> float:
			
 
				+        """Return the cluster's age in hours (approximate, from now)."""
			
 
				+        ts = c.get("timestamp") or c.get("last_updated")
			
 
				+        if not ts:
			
 
				+            return 0.0  # treat un-dated as fresh
			
 
				+        try:
			
 
				+            s = str(ts).replace("Z", "+00:00")
			
 
				+            dt = datetime.fromisoformat(s)
			
 
				+            if dt.tzinfo is None:
			
 
				+                dt = dt.replace(tzinfo=timezone.utc)
			
 
				+            return max(0.0, (now - dt.astimezone(timezone.utc)).total_seconds() / 3600.0)
			
 
				+        except Exception:
			
 
				+            try:
			
 
				+                dt = parsedate_to_datetime(str(ts))
			
 
				+                if dt.tzinfo is None:
			
 
				+                    dt = dt.replace(tzinfo=timezone.utc)
			
 
				+                return max(0.0, (now - dt.astimezone(timezone.utc)).total_seconds() / 3600.0)
			
 
				+            except Exception:
			
 
				+                return 0.0
			
 
				 
			
 
				-    entity_counts = Counter()
			
 
				-    entity_importance_sum = Counter()
			
 
				-    # co-occurrence: ent -> other_ent -> count
			
 
				-    entity_cooccur = {}
			
 
				-    phrase_counts = Counter()
			
 
				-    topic_counts = Counter()
			
 
				+    # Generic entity filter
			
 
				+    _generic_tokens = {"news", "latest", "breaking", "update", "updates", "report", "reports"}
			
 
				 
			
 
				-    # Very light heuristics to reduce “meta entities” dominating emerging topics.
			
 
				-    # Keep it conservative: only skip obvious boilerplate.
			
 
				     def _is_generic_entity(ent: str) -> bool:
			
 
				         e = str(ent).strip().lower()
			
 
				-        if not e:
			
 
				+        if not e or len(e) < 4:
			
 
				             return True
			
 
				-        if len(e) < 4:
			
 
				-            return True
			
 
				-        # common outlet-ish / meta-ish tokens
			
 
				-        if e in {"news", "latest", "breaking"}:
			
 
				+        if e in _generic_tokens:
			
 
				             return True
			
 
				         return False
			
 
				 
			
 
				+    # --- accumulate signals ---
			
 
				+    # recent = second half of timeframe (newer), prior = first half (older)
			
 
				+    entity_counts_recent = Counter()
			
 
				+    entity_counts_prior = Counter()
			
 
				+    entity_importance_recent = Counter()
			
 
				+    entity_sources: dict[str, set] = {}  # ent -> set of source names
			
 
				+    entity_buckets: dict[str, set] = {}  # ent -> set of time-bucket indices (for sustained-spike detection)
			
 
				+    entity_cooccur: dict[str, Counter] = {}
			
 
				+    phrase_counts_recent = Counter()
			
 
				+
			
 
				+    bucket_size_hours = max(1.0, hours / 6.0)  # split window into ~6 buckets
			
 
				+
			
 
				     for c in clusters:
			
 
				-        topic_counts[c.get("topic", "other")] += 1
			
 
				         ents_in_cluster = [e for e in (c.get("entities", []) or []) if not _is_generic_entity(e)]
			
 
				-        ents_in_cluster_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
			
 
				-        for ent in ents_in_cluster_norm:
			
 
				-            if _is_generic_entity(ent):
			
 
				+        ents_norm = [str(e).strip().lower() for e in ents_in_cluster if str(e).strip()]
			
 
				+
			
 
				+        age_h = _cluster_age_hours(c)
			
 
				+        is_recent = age_h <= half_hours
			
 
				+        bucket_idx = int(age_h / bucket_size_hours)
			
 
				+
			
 
				+        # --- around filter: only count clusters that mention the target entity ---
			
 
				+        if around_terms:
			
 
				+            haystack = set(ents_norm)
			
 
				+            for res in c.get("entityResolutions", []) or []:
			
 
				+                if isinstance(res, dict):
			
 
				+                    for key in ("normalized", "canonical_label"):
			
 
				+                        val = res.get(key)
			
 
				+                        if val:
			
 
				+                            haystack.add(str(val).strip().lower())
			
 
				+            if not (haystack & around_terms):
			
 
				                 continue
			
 
				-            entity_counts[ent] += 1
			
 
				-            try:
			
 
				-                entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
			
 
				-            except Exception:
			
 
				-                pass
			
 
				 
			
 
				-        # update co-occurrence counts
			
 
				-        for i in range(len(ents_in_cluster_norm)):
			
 
				-            a = ents_in_cluster_norm[i]
			
 
				-            if not a:
			
 
				+        counts = entity_counts_recent if is_recent else entity_counts_prior
			
 
				+        imp_acc = entity_importance_recent if is_recent else None  # only importance from recent window
			
 
				+
			
 
				+        for ent in ents_norm:
			
 
				+            if _is_generic_entity(ent):
			
 
				+                continue
			
 
				+            counts[ent] += 1
			
 
				+            if ent not in entity_sources:
			
 
				+                entity_sources[ent] = set()
			
 
				+            src = c.get("source") or c.get("headline", "").split(" - ")[-1] if c.get("headline") else ""
			
 
				+            if src:
			
 
				+                entity_sources[ent].add(str(src))
			
 
				+            if ent not in entity_buckets:
			
 
				+                entity_buckets[ent] = set()
			
 
				+            entity_buckets[ent].add(bucket_idx)
			
 
				+            if imp_acc is not None:
			
 
				+                try:
			
 
				+                    imp_acc[ent] += float(c.get("importance", 0.0) or 0.0)
			
 
				+                except Exception:
			
 
				+                    pass
			
 
				+
			
 
				+        # co-occurrence (only for clusters matching the around filter, if any)
			
 
				+        for i in range(len(ents_norm)):
			
 
				+            a = ents_norm[i]
			
 
				+            if _is_generic_entity(a):
			
 
				                 continue
			
 
				-            entity_cooccur.setdefault(a, Counter())
			
 
				-            for j in range(len(ents_in_cluster_norm)):
			
 
				+            if a not in entity_cooccur:
			
 
				+                entity_cooccur[a] = Counter()
			
 
				+            for j in range(len(ents_norm)):
			
 
				                 if i == j:
			
 
				                     continue
			
 
				-                b = ents_in_cluster_norm[j]
			
 
				-                if not b:
			
 
				+                b = ents_norm[j]
			
 
				+                if _is_generic_entity(b):
			
 
				                     continue
			
 
				                 entity_cooccur[a][b] += 1
			
 
				 
			
 
				-        text = f"{c.get('headline','')} {c.get('summary','')}"
			
 
				-        words = [w for w in re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())]
			
 
				-        for i in range(len(words) - 1):
			
 
				-            phrase = f"{words[i]} {words[i+1]}"
			
 
				-            if len(phrase) > 6:
			
 
				-                phrase_counts[phrase] += 1
			
 
				-
			
 
				-    emerging = []
			
 
				-    # Combine frequency with average importance so “big signal” rises over pure repetition.
			
 
				-    for ent, count in entity_counts.most_common(limit):
			
 
				-        avg_imp = entity_importance_sum[ent] / max(1, count)
			
 
				-        # avg_imp is typically 0..~1; keep score bounded.
			
 
				-        trend_score = 0.25 + 0.40 * min(1.0, avg_imp) + 0.08 * min(6.0, float(count))
			
 
				+        # bigram phrases (recent only)
			
 
				+        if is_recent:
			
 
				+            text = f"{c.get('headline', '')} {c.get('summary', '')}"
			
 
				+            words = re.findall(r"[A-Za-z][A-Za-z0-9\-]{2,}", text.lower())
			
 
				+            for i in range(len(words) - 1):
			
 
				+                phrase = f"{words[i]} {words[i+1]}"
			
 
				+                if len(phrase) > 6:
			
 
				+                    phrase_counts_recent[phrase] += 1
			
 
				+
			
 
				+    # --- score entities ---
			
 
				+    all_entities = set(entity_counts_recent.keys()) | set(entity_counts_prior.keys())
			
 
				+    scored = []
			
 
				+
			
 
				+    for ent in all_entities:
			
 
				+        recent_n = entity_counts_recent.get(ent, 0)
			
 
				+        prior_n = entity_counts_prior.get(ent, 0)
			
 
				+        total_n = recent_n + prior_n
			
 
				+
			
 
				+        if total_n < 1:
			
 
				+            continue
			
 
				+
			
 
				+        # velocity: ratio of recent vs prior (smoothed to avoid division noise)
			
 
				+        # 0 prior → velocity = 2 * recent_n + 1 (pure emergence)
			
 
				+        # equal → velocity = 1.0 (steady)
			
 
				+        velocity = (recent_n + 0.5) / (prior_n + 0.5)
			
 
				+
			
 
				+        # recency weight: what fraction of total hits are in the recent window
			
 
				+        recency_ratio = recent_n / total_n
			
 
				+
			
 
				+        # source diversity: how many distinct outlets
			
 
				+        n_sources = len(entity_sources.get(ent, set()))
			
 
				+
			
 
				+        # sustained: how many distinct time buckets did it appear in (max ~6)
			
 
				+        n_buckets = len(entity_buckets.get(ent, set()))
			
 
				+
			
 
				+        # average importance (recent window only)
			
 
				+        avg_imp = (entity_importance_recent.get(ent, 0.0) / max(1, recent_n)) if recent_n > 0 else 0.0
			
 
				+
			
 
				+        composed_score = (
			
 
				+            0.35 * min(1.0, math.log1p(velocity) / math.log1p(4.0)) +   # velocity (0..1, 4x = max)
			
 
				+            0.25 * recency_ratio +                                        # recency concentration
			
 
				+            0.15 * min(1.0, n_sources / 5.0) +                            # source diversity
			
 
				+            0.10 * min(1.0, n_buckets / 4.0) +                            # sustained (>1 bucket)
			
 
				+            0.15 * min(1.0, avg_imp)                                      # importance
			
 
				+        )
			
 
				+
			
 
				         related = []
			
 
				-        for other, _cnt in (entity_cooccur.get(ent) or Counter()).most_common(3):
			
 
				-            # avoid returning the entity itself (shouldn't happen, but be safe)
			
 
				-            if other != ent:
			
 
				-                related.append(other)
			
 
				+        if ent in entity_cooccur:
			
 
				+            for other, _cnt in entity_cooccur[ent].most_common(5):
			
 
				+                if other != ent:
			
 
				+                    related.append(other)
			
 
				 
			
 
				-        emerging.append({
			
 
				+        scored.append({
			
 
				             "topic": ent,
			
 
				-            "trend_score": min(0.99, round(trend_score, 2)),
			
 
				-            "related_entities": related if related else [ent],
			
 
				-            "signal_type": "entity",
			
 
				-            "count": count,
			
 
				+            "trend_score": min(0.99, round(composed_score, 3)),
			
 
				+            "related_entities": related[:3] if related else [ent],
			
 
				+            "velocity": round(velocity, 2),
			
 
				+            "recent_count": recent_n,
			
 
				+            "prior_count": prior_n,
			
 
				+            "source_count": n_sources,
			
 
				             "avg_importance": round(avg_imp, 3),
			
 
				+            "signal_type": "entity",
			
 
				         })
			
 
				 
			
 
				-    for phrase, count in phrase_counts.most_common(limit * 2):
			
 
				+    # sort by composed score descending
			
 
				+    scored.sort(key=lambda x: (-x["trend_score"], -x["velocity"], x["topic"]))
			
 
				+
			
 
				+    # --- add phrase signals (only from recent window) ---
			
 
				+    emerging = list(scored)  # start with entities
			
 
				+    for phrase, count in phrase_counts_recent.most_common(limit * 2):
			
 
				         if any(item["topic"] == phrase for item in emerging):
			
 
				             continue
			
 
				         emerging.append({
			
 
				             "topic": phrase.title(),
			
 
				-            "trend_score": min(0.99, round(0.20 + 0.10 * count, 2)),
			
 
				+            "trend_score": min(0.99, round(0.30 + 0.15 * min(count, 5), 2)),
			
 
				             "related_entities": [],
			
 
				+            "velocity": None,
			
 
				+            "recent_count": count,
			
 
				+            "prior_count": 0,
			
 
				+            "source_count": 0,
			
 
				+            "avg_importance": 0.0,
			
 
				             "signal_type": "phrase",
			
 
				-            "count": count,
			
 
				         })
			
 
				         if len(emerging) >= limit:
			
 
				             break