
news-mcp: clarify lookback, pruning, and tool docs

Lukas Goldschmidt 1 month ago
parent
commit
5ec094693f

+ 4 - 1
.env.example

@@ -29,7 +29,10 @@ NEWS_FEED_URLS=
 # Storage / refresh
 NEWS_MCP_DATA_DIR=
 NEWS_MCP_DB_PATH=
-NEWS_CLUSTERS_TTL_HOURS=24
+NEWS_DEFAULT_LOOKBACK_HOURS=24
+NEWS_PRUNING_ENABLED=true
+NEWS_RETENTION_DAYS=180
+NEWS_PRUNE_INTERVAL_HOURS=24
 NEWS_REFRESH_INTERVAL_SECONDS=900
 NEWS_BACKGROUND_REFRESH_ENABLED=true
 NEWS_BACKGROUND_REFRESH_ON_START=true

+ 21 - 2
README.md

@@ -23,6 +23,7 @@ Health:
 - Enriches clusters with configurable LLM providers/models (topic/entities/sentiment/keywords)
 - Applies a case-insensitive entity blacklist after extraction
 - Caches clusters + LLM fields in SQLite
+- Resolves entities in-process via Google Trends suggestions (no `trends-mcp` hop required)
 
 ## Tools (MCP)
 
@@ -36,7 +37,7 @@ Health:
 - when `include_articles=true`, includes `articles[].url` + minimal fields per returned cluster
 
 3) `get_event_summary(event_id, include_articles=false)`
-- Groq-written compressed narrative for a given `cluster_id`
+- LLM-written compressed narrative for a given `cluster_id`
 - when `include_articles=true`, includes the underlying `articles` list (with `url`) from the stored cluster
 
 4) `detect_emerging_topics(limit)`
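
Since the tool functions are plain async callables under FastMCP's decorator (the SDK's `tool` decorator registers and returns the original function), they can be smoke-tested directly. A minimal sketch, assuming a populated database; only `cluster_id` is a field the response shapes in this diff confirm:

```python
# Hedged local smoke test: await the tool functions directly,
# assuming the store already holds fresh clusters for the topic.
import asyncio

from news_mcp.mcp_server_fastmcp import get_event_summary, get_latest_events


async def main() -> None:
    events = await get_latest_events(topic="crypto", limit=3)
    if events:
        # event_id is the cluster_id of a returned cluster
        detail = await get_event_summary(events[0]["cluster_id"])
        print(detail)


asyncio.run(main())
```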
@@ -70,7 +71,10 @@ Key variables:
 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)
 - `NEWS_BACKGROUND_REFRESH_ON_START` (default true)
 - `NEWS_BACKGROUND_REFRESH_ENABLED` (default true)
-- `NEWS_CLUSTERS_TTL_HOURS`
+- `NEWS_DEFAULT_LOOKBACK_HOURS` (freshness window for reads; older rows are ignored by queries)
+- `NEWS_PRUNING_ENABLED` (default true; if false, no rows are physically deleted)
+- `NEWS_RETENTION_DAYS` (physical delete threshold for stored clusters)
+- `NEWS_PRUNE_INTERVAL_HOURS` (how often in-server pruning may run)
 - `GROQ_ENRICH_OTHER_ONLY` (default false; set true for cost control)
 - `NEWS_EMBEDDINGS_ENABLED` (default false; enables Ollama embeddings for clustering when wired in)
 - `OLLAMA_BASE_URL` / `OLLAMA_URL` (default `http://127.0.0.1:11434`)
@@ -79,6 +83,21 @@ Key variables:
 
 When embeddings are enabled, news-mcp tries Ollama first and falls back to the existing heuristic clustering path if Ollama is unavailable.
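
A minimal sketch of that "Ollama first, heuristic fallback" shape, using Ollama's standard `/api/embeddings` endpoint; the model name is an assumption, and the real clustering path lives in `news_mcp.dedup.cluster`:

```python
# Sketch only: probe Ollama for an embedding; None signals the caller
# to drop to the existing heuristic clustering path.
import httpx

OLLAMA_BASE_URL = "http://127.0.0.1:11434"  # matches the documented default


def try_embed(text: str, model: str = "nomic-embed-text") -> list[float] | None:
    try:
        resp = httpx.post(
            f"{OLLAMA_BASE_URL}/api/embeddings",
            json={"model": model, "prompt": text},
            timeout=10.0,
        )
        resp.raise_for_status()
        return resp.json().get("embedding")
    except Exception:
        return None  # Ollama unavailable: fall back to heuristics
```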
 
+## TTL vs pruning
+
+These are intentionally different:
+
+- `NEWS_DEFAULT_LOOKBACK_HOURS` controls **read freshness** only. Older rows remain in SQLite but do not appear in normal "latest" queries.
+- `NEWS_PRUNING_ENABLED` controls whether the server is allowed to **physically delete** old rows.
+- `NEWS_RETENTION_DAYS` controls how old rows may get before they are deleted.
+- `NEWS_PRUNE_INTERVAL_HOURS` controls how often the server checks whether deletion is due.
+
+Pruning is self-contained inside the server; the prune-if-due check runs:
+- on startup
+- after each refresh cycle
+
+If `NEWS_PRUNING_ENABLED=false`, no pruning occurs and old rows are retained indefinitely.
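
A toy illustration of the two cutoffs under the defaults above (the real queries live in `SQLiteClusterStore`):

```python
# Rows between the two cutoffs stay in SQLite but are invisible to reads;
# only rows older than the retention cutoff are eligible for deletion.
from datetime import datetime, timedelta, timezone

now = datetime.now(timezone.utc)
read_cutoff = now - timedelta(hours=24)    # NEWS_DEFAULT_LOOKBACK_HOURS
delete_cutoff = now - timedelta(days=180)  # NEWS_RETENTION_DAYS


def visible_to_reads(updated_at: datetime) -> bool:
    return updated_at >= read_cutoff


def eligible_for_pruning(updated_at: datetime) -> bool:
    return updated_at < delete_cutoff
```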
+
 ## Live extraction smoke test
 
 Run a standardized, fabricated extraction test against the currently configured provider/model:

+ 4 - 1
news_mcp/config.py

@@ -17,7 +17,7 @@ NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")
 RSS_FEED_URL = NEWS_FEED_URL
 RSS_FEED_URLS = NEWS_FEED_URLS
 
-CLUSTERS_TTL_HOURS = float(os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24"))
+DEFAULT_LOOKBACK_HOURS = float(os.getenv("NEWS_DEFAULT_LOOKBACK_HOURS", os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24")))
 DEFAULT_TOPICS = ["crypto", "macro", "regulation", "ai", "other"]
 
 # LLM extraction / summarization
@@ -42,3 +42,6 @@ NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"
+NEWS_PRUNING_ENABLED = os.getenv("NEWS_PRUNING_ENABLED", "true").lower() == "true"
+NEWS_RETENTION_DAYS = float(os.getenv("NEWS_RETENTION_DAYS", "180"))
+NEWS_PRUNE_INTERVAL_HOURS = float(os.getenv("NEWS_PRUNE_INTERVAL_HOURS", "24"))

+ 7 - 2
news_mcp/enrichment/llm_enrich.py

@@ -26,7 +26,7 @@ def _filter_entities(entities, blacklist=None):
     return out
 
 
-async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
+async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_extraction(cluster)
     out = dict(cluster)
     topic = parsed.get("topic", cluster.get("topic"))
@@ -46,6 +46,11 @@ async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
     return out
 
 
-async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
+async def summarize_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_summary(cluster)
     return parsed
+
+
+# Backward-compatible aliases during the transition away from provider-specific naming.
+classify_cluster_groq = classify_cluster_llm
+summarize_cluster_groq = summarize_cluster_llm
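
With these aliases, callers still importing the old names get the identical coroutine functions:

```python
# Transitional aliasing: both names resolve to the same coroutine object.
from news_mcp.enrichment import llm_enrich

assert llm_enrich.classify_cluster_groq is llm_enrich.classify_cluster_llm
assert llm_enrich.summarize_cluster_groq is llm_enrich.summarize_cluster_llm
```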

+ 22 - 5
news_mcp/jobs/poller.py

@@ -3,15 +3,21 @@ from __future__ import annotations
 import logging
 from typing import Any, Dict
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
+from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.enrichment.enrich import enrich_cluster
-from news_mcp.enrichment.llm_enrich import classify_cluster_groq
+from news_mcp.enrichment.llm_enrich import classify_cluster_llm
 from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.sources.news_feeds import fetch_news_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
-from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
+from news_mcp.config import (
+    GROQ_ENRICH_OTHER_ONLY,
+    GROQ_MAX_CLUSTERS_PER_REFRESH,
+    NEWS_PRUNE_INTERVAL_HOURS,
+    NEWS_PRUNING_ENABLED,
+    NEWS_RETENTION_DAYS,
+)
 
 
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
@@ -36,6 +42,12 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     prev_hash = store.get_feed_hash(feed_key)
     if prev_hash == last_hash:
         logger.info("refresh unchanged feed_key=%s topic=%s", feed_key, topic)
+        prune_result = store.prune_if_due(
+            pruning_enabled=NEWS_PRUNING_ENABLED,
+            retention_days=NEWS_RETENTION_DAYS,
+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+        )
+        logger.info("refresh prune_result=%s", prune_result)
         return
     logger.info("refresh changed feed_key=%s topic=%s", feed_key, topic)
     store.set_feed_hash(feed_key, last_hash)
@@ -75,11 +87,16 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
                     if existing.get("keywords"):
                         c2["keywords"] = existing.get("keywords")
                 else:
-                    c2 = await classify_cluster_groq(c2)
+                    c2 = await classify_cluster_llm(c2)
 
             enriched.append(c2)
 
         store.upsert_clusters(enriched, topic=t)
         logger.info("refresh stored topic=%s clusters=%s", t, len(enriched))
 
-            
+    prune_result = store.prune_if_due(
+        pruning_enabled=NEWS_PRUNING_ENABLED,
+        retention_days=NEWS_RETENTION_DAYS,
+        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+    )
+    logger.info("refresh prune_result=%s", prune_result)

+ 87 - 32
news_mcp/mcp_server_fastmcp.py

@@ -1,19 +1,30 @@
 from __future__ import annotations
 
+import asyncio
+import logging
+from collections import Counter
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+
 from fastapi import FastAPI
 from mcp.server.fastmcp import FastMCP
 from mcp.server.transport_security import TransportSecuritySettings
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DEFAULT_TOPICS, DB_PATH
-from news_mcp.config import NEWS_REFRESH_INTERVAL_SECONDS, NEWS_BACKGROUND_REFRESH_ENABLED, NEWS_BACKGROUND_REFRESH_ON_START
+from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DEFAULT_TOPICS, DB_PATH
+from news_mcp.config import (
+    NEWS_PRUNE_INTERVAL_HOURS,
+    NEWS_PRUNING_ENABLED,
+    NEWS_REFRESH_INTERVAL_SECONDS,
+    NEWS_BACKGROUND_REFRESH_ENABLED,
+    NEWS_BACKGROUND_REFRESH_ON_START,
+    NEWS_RETENTION_DAYS,
+)
 from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
-from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
+from news_mcp.enrichment.llm_enrich import summarize_cluster_llm
 from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.llm import active_llm_config
 from news_mcp.entity_normalize import normalize_query
-from collections import Counter
-import logging
 
 
 mcp = FastMCP(
@@ -37,7 +48,40 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
     return [v for v in values if v]
 
 
-@mcp.tool(description="What is happening right now? Return the latest deduplicated news clusters for a topic.")
+def _parse_cluster_timestamp(value) -> datetime:
+    if not value:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    text = str(value).strip()
+    if not text:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    try:
+        dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        pass
+    try:
+        dt = parsedate_to_datetime(text)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        return datetime.min.replace(tzinfo=timezone.utc)
+
+
+def _sort_clusters_by_recency(clusters: list[dict]) -> list[dict]:
+    return sorted(
+        clusters,
+        key=lambda c: (
+            _parse_cluster_timestamp(c.get("timestamp")),
+            float(c.get("importance", 0.0) or 0.0),
+        ),
+        reverse=True,
+    )
+
+
+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
 async def get_latest_events(topic: str = "crypto", limit: int = 5, include_articles: bool = False):
     limit = max(1, min(int(limit), 20))
     # If the caller passes an entity-like value, resolve it and use the canonical
@@ -58,14 +102,14 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
 
     if is_topic:
         # Cache-first: only refresh if we currently have no fresh clusters for this topic.
-        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
         if not clusters:
             await refresh_clusters(topic=topic_norm, limit=200)
-            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
     else:
         # Entity-aware mode: search recent clusters across all topics and match by
         # raw entity, canonical label, or MID.
-        clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 8)
+        clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit * 8)
         filtered = []
         for c in clusters:
             haystack = _cluster_entity_haystack(c)
@@ -75,11 +119,8 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
                 break
         clusters = filtered
 
-    # Ensure the response is compact and agent-friendly.
-    clusters_sorted = sorted(clusters, key=lambda x: float(x.get("importance", 0.0)), reverse=True)
-
     out = []
-    for c in clusters_sorted:
+    for c in _sort_clusters_by_recency(clusters):
         item = {
             "cluster_id": c.get("cluster_id"),
             "headline": c.get("headline"),
@@ -108,7 +149,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
     return out
 
 
-@mcp.tool(description="What's happening with X? Filter clusters by extracted entity substring (case-insensitive) within a timeframe.")
+@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
 async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
     limit = max(1, min(int(limit), 30))
     query = normalize_query(entity).strip().lower()
@@ -128,7 +169,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
     def _match_clusters(clusters: list[dict]) -> list[dict]:
         hits: list[dict] = []
-        for c in clusters:
+        for c in _sort_clusters_by_recency(clusters):
             haystack = _cluster_entity_haystack(c)
             if any(any(term in item for item in haystack) for term in query_terms):
                 hits.append(c)
@@ -136,14 +177,10 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
                 break
         return hits
 
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 5)
-    hits = _match_clusters(clusters)
-
     hours = _parse_timeframe_to_hours(timeframe)
     clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=max(200, limit * 10))
     hits = _match_clusters(clusters)
 
-    # Compress to tool response shape.
     out = []
     for c in hits:
         item = {
@@ -172,14 +209,14 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
     return out
 
 
-@mcp.tool(description="Explain an event clearly by cluster_id (Groq summary).")
+@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts.")
 async def get_event_summary(event_id: str, include_articles: bool = False):
     store = SQLiteClusterStore(DB_PATH)
 
     # Summary cache: reuse if present within TTL.
     cached_summary = store.get_cluster_summary(
         cluster_id=event_id,
-        ttl_hours=CLUSTERS_TTL_HOURS,
+        ttl_hours=DEFAULT_LOOKBACK_HOURS,
     )
     if cached_summary:
         out = {
@@ -226,7 +263,7 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
             if isinstance(a, dict)
         ]
 
-    summary = await summarize_cluster_groq(cluster)
+    summary = await summarize_cluster_llm(cluster)
 
     store.upsert_cluster_summary(event_id, summary)
     out = {
@@ -242,13 +279,12 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
     return out
 
 
-@mcp.tool(description="Detect emerging topics/entities from recent cached news clusters.")
+@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters.")
 async def detect_emerging_topics(limit: int = 10):
     limit = max(1, min(int(limit), 20))
     store = SQLiteClusterStore(DB_PATH)
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=200)
+    clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=200)
 
-    from collections import Counter
     import re
 
     entity_counts = Counter()
@@ -280,9 +316,9 @@ async def detect_emerging_topics(limit: int = 10):
                 continue
             entity_counts[ent] += 1
             try:
-                    entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
+                entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
             except Exception:
-                    pass
+                pass
 
         # update co-occurrence counts
         for i in range(len(ents_in_cluster_norm)):
@@ -342,7 +378,7 @@ async def detect_emerging_topics(limit: int = 10):
     return emerging[:limit]
 
 
-@mcp.tool(description="What's the overall sentiment around an entity within a timeframe?")
+@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
@@ -428,7 +464,7 @@ def _parse_timeframe_to_hours(timeframe: str) -> int:
 
 
 @mcp.tool(
-    description="Given a subject entity, find related entities via co-occurrence inside recent clusters (entity-only, no topic fallback)."
+    description="Investigate which entities tend to appear alongside a subject entity in recent clusters, based on co-occurrence."
 )
 async def get_related_entities(subject: str, timeframe: str = "24h", limit: int = 10):
     store = SQLiteClusterStore(DB_PATH)
@@ -529,6 +565,14 @@ async def _start_background_refresh():
     _background_task_started = True
     logger.info("news-mcp llm config: %s", active_llm_config())
 
+    store = SQLiteClusterStore(DB_PATH)
+    prune_result = store.prune_if_due(
+        pruning_enabled=NEWS_PRUNING_ENABLED,
+        retention_days=NEWS_RETENTION_DAYS,
+        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+    )
+    logger.info("startup prune_result=%s", prune_result)
+
     async def _loop():
         if not NEWS_BACKGROUND_REFRESH_ON_START:
             await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
@@ -541,8 +585,6 @@ async def _start_background_refresh():
                 pass
             await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
 
-    import asyncio
-
     asyncio.create_task(_loop())
 
 
@@ -557,6 +599,14 @@ def root():
             "enabled": NEWS_BACKGROUND_REFRESH_ENABLED,
             "interval_seconds": NEWS_REFRESH_INTERVAL_SECONDS,
         },
+        "retention": {
+            "ttl_hours": DEFAULT_LOOKBACK_HOURS,
+            "retention_days": NEWS_RETENTION_DAYS,
+        },
+        "pruning": {
+            "enabled": NEWS_PRUNING_ENABLED,
+            "interval_hours": NEWS_PRUNE_INTERVAL_HOURS,
+        },
     }
 
 
@@ -565,7 +615,12 @@ def health():
     store = SQLiteClusterStore(DB_PATH)
     return {
         "status": "ok",
-        "ttl_hours": CLUSTERS_TTL_HOURS,
+        "ttl_hours": DEFAULT_LOOKBACK_HOURS,
         "db": str(DB_PATH),
         "refresh": store.get_feed_state("breakingthenews"),
+        "pruning": store.get_prune_state(
+            pruning_enabled=NEWS_PRUNING_ENABLED,
+            retention_days=NEWS_RETENTION_DAYS,
+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+        ),
     }
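
For reference, a hypothetical smoke check of the extended `/health` payload; the host and port are assumptions, and values depend on env and store state:

```python
# Sketch: fetch /health from a locally running instance and inspect the
# pruning block added in this commit.
import httpx

health = httpx.get("http://127.0.0.1:8000/health").json()
print(health["ttl_hours"])                 # DEFAULT_LOOKBACK_HOURS
print(health["pruning"]["enabled"])        # NEWS_PRUNING_ENABLED
print(health["pruning"]["last_prune_at"])  # None until the first prune runs
```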

+ 98 - 0
news_mcp/storage/sqlite_store.py

@@ -20,6 +20,9 @@ class ClusterRow:
     updated_at: datetime
 
 
+META_LAST_PRUNE_AT = "last_prune_at"
+
+
 def _article_key(article: dict[str, Any]) -> str:
     url = str(article.get("url") or "").strip()
     if not url:
@@ -123,6 +126,9 @@ class SQLiteClusterStore:
             conn.execute(
                 "CREATE INDEX IF NOT EXISTS idx_clusters_topic ON clusters(topic)"
             )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_clusters_updated_at ON clusters(updated_at)"
+            )
 
             conn.execute(
                 """
@@ -134,6 +140,15 @@ class SQLiteClusterStore:
                 """
             )
 
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS meta (
+                  key TEXT PRIMARY KEY,
+                  value TEXT NOT NULL
+                )
+                """
+            )
+
     def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
         now = datetime.now(timezone.utc)
         with self._conn() as conn:
@@ -241,3 +256,86 @@ class SQLiteClusterStore:
             if not row:
                 return None
             return {"last_hash": row[0], "updated_at": row[1]}
+
+    def get_meta(self, key: str) -> str | None:
+        with self._conn() as conn:
+            cur = conn.execute("SELECT value FROM meta WHERE key=?", (key,))
+            row = cur.fetchone()
+            return row[0] if row else None
+
+    def set_meta(self, key: str, value: str) -> None:
+        with self._conn() as conn:
+            conn.execute(
+                "INSERT INTO meta(key, value) VALUES(?, ?) "
+                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
+                (key, value),
+            )
+
+    def prune_clusters(self, retention_days: float) -> int:
+        retention_days = float(retention_days)
+        if retention_days <= 0:
+            return 0
+        cutoff = datetime.now(timezone.utc) - timedelta(days=retention_days)
+        cutoff_iso = cutoff.isoformat()
+        pruned_at = datetime.now(timezone.utc).isoformat()
+        with self._conn() as conn:
+            cur = conn.execute("DELETE FROM clusters WHERE updated_at < ?", (cutoff_iso,))
+            deleted = int(cur.rowcount or 0)
+            conn.execute(
+                "INSERT INTO meta(key, value) VALUES(?, ?) "
+                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
+                (META_LAST_PRUNE_AT, pruned_at),
+            )
+            return deleted
+
+    def prune_if_due(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
+        retention_days = float(retention_days)
+        interval_hours = float(interval_hours)
+        if (not pruning_enabled) or retention_days <= 0:
+            return {
+                "enabled": bool(pruning_enabled),
+                "deleted": 0,
+                "due": False,
+                "retention_days": retention_days,
+                "interval_hours": interval_hours,
+                "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
+            }
+
+        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
+        now = datetime.now(timezone.utc)
+        due = True
+        if last_prune_at:
+            try:
+                last_dt = datetime.fromisoformat(last_prune_at)
+                due = now - last_dt >= timedelta(hours=max(1.0, interval_hours))
+            except Exception:
+                due = True
+
+        if not due:
+            return {
+                "enabled": True,
+                "deleted": 0,
+                "due": False,
+                "retention_days": retention_days,
+                "interval_hours": interval_hours,
+                "last_prune_at": last_prune_at,
+            }
+
+        deleted = self.prune_clusters(retention_days)
+        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
+        return {
+            "enabled": True,
+            "deleted": deleted,
+            "due": True,
+            "retention_days": retention_days,
+            "interval_hours": interval_hours,
+            "last_prune_at": last_prune_at,
+        }
+
+    def get_prune_state(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
+        return {
+            "enabled": bool(pruning_enabled),
+            "retention_days": float(retention_days),
+            "interval_hours": float(interval_hours),
+            "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
+        }

+ 79 - 26
news_mcp/trends_resolution.py

@@ -1,20 +1,77 @@
 from __future__ import annotations
 
 import json
-import os
-import subprocess
+from datetime import datetime, timezone
 from functools import lru_cache
 from typing import Any
+from urllib.parse import quote
+
+import httpx
 
 from news_mcp.entity_normalize import normalize_entity
 
 
+class GoogleTrendsError(RuntimeError):
+    pass
+
+
+class GoogleTrendsProvider:
+    """Minimal in-process Google Trends adapter used by news-mcp.
+
+    We only need entity suggestions for the resolver path, so keep this module
+    intentionally narrow rather than importing the full trends-mcp server.
+    """
+
+    _SUGGESTIONS_URL = "https://trends.google.com/trends/api/autocomplete/"
+
+    def __init__(self, *, hl: str = "en-US", tz: int = 120, timeout: float = 10.0):
+        self.hl = hl
+        self.tz = tz
+        self.timeout = timeout
+        self._headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/135.0.0.0 Safari/537.36"
+            ),
+            "Accept": "application/json,text/javascript,*/*;q=0.1",
+        }
+
+    def suggestions(self, keyword: str) -> list[dict[str, Any]]:
+        url = self._SUGGESTIONS_URL + quote(keyword)
+        params = {"hl": self.hl, "tz": str(self.tz)}
+        try:
+            response = httpx.get(url, params=params, headers=self._headers, timeout=self.timeout, follow_redirects=True)
+            response.raise_for_status()
+            text = response.text.strip()
+            if text.startswith(")]}',"):
+                text = text[5:]
+            payload = json.loads(text)
+            default = payload.get("default") if isinstance(payload, dict) else None
+            topics = default.get("topics") if isinstance(default, dict) else None
+            return topics if isinstance(topics, list) else []
+        except Exception as exc:  # pragma: no cover - network/provider dependent
+            raise GoogleTrendsError(f"suggestions failed for {keyword!r}: {exc}") from exc
+
+
+@lru_cache(maxsize=1)
+def _provider() -> GoogleTrendsProvider | None:
+    try:
+        return GoogleTrendsProvider()
+    except Exception:
+        return None
+
+
+def _resolved_at() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
 @lru_cache(maxsize=1024)
 def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
-    """Resolve a normalized entity through trends-mcp, falling back cleanly.
+    """Resolve an entity locally via Google Trends suggestions.
 
-    The input is normalized first using the same local normalization rules used
-    everywhere else in news-mcp, so query and storage paths stay aligned.
+    The returned shape intentionally mirrors the former trends-mcp bridge so the
+    rest of news-mcp can stay unchanged during the migration.
     """
     normalized = normalize_entity(entity)
     if not normalized:
@@ -24,34 +81,28 @@ def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
             "canonical_label": "",
             "mid": None,
             "type": None,
+            "candidates": [],
             "source": "empty",
+            "resolved_at": _resolved_at(),
         }
 
-    config = os.getenv("MCPORTER_CONFIG", os.path.expanduser("~/.openclaw/workspace/config/mcporter.json"))
-    command = [
-        "mcporter",
-        "--config",
-        config,
-        "call",
-        "trends.resolve_entity",
-        f"keyword={normalized}",
-    ]
-
-    try:
-        proc = subprocess.run(command, capture_output=True, text=True, timeout=20, check=False)
-        if proc.returncode == 0 and proc.stdout.strip():
-            payload = json.loads(proc.stdout)
+    provider = _provider()
+    if provider is not None:
+        try:
+            suggestions = provider.suggestions(normalized)
+            best = suggestions[0] if suggestions else None
             return {
                 "raw": entity,
                 "normalized": normalized,
-                "canonical_label": payload.get("canonical_label") or normalized,
-                "mid": payload.get("mid"),
-                "type": payload.get("type"),
-                "candidates": payload.get("candidates", []),
-                "source": "trends-mcp",
+                "canonical_label": best.get("title") if best else normalized,
+                "mid": best.get("mid") if best else None,
+                "type": best.get("type") if best else None,
+                "candidates": suggestions,
+                "source": "google-trends",
+                "resolved_at": _resolved_at(),
             }
-    except Exception:
-        pass
+        except Exception:
+            pass
 
     # Conservative fallback: keep the local normalized form and leave MID unset.
     return {
@@ -60,5 +111,7 @@ def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
         "canonical_label": normalized,
         "mid": None,
         "type": None,
+        "candidates": [],
         "source": "fallback",
+        "resolved_at": _resolved_at(),
     }

+ 96 - 0
test_news_mcp.py

@@ -9,6 +9,8 @@ from news_mcp.enrichment.importance import compute_importance
 from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
 from news_mcp.entity_normalize import normalize_query, normalize_entities
 from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
+from news_mcp.trends_resolution import resolve_entity_via_trends
+from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
 
 
 def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
@@ -77,6 +79,71 @@ def test_sqlite_summary_cache_roundtrip():
         assert cached["keyFacts"] == ["Fact 1"]
 
 
+def test_prune_clusters_deletes_rows_older_than_retention():
+    with tempfile.TemporaryDirectory() as td:
+        db = Path(td) / "news.sqlite"
+        store = SQLiteClusterStore(db)
+        store.upsert_clusters([
+            {
+                "cluster_id": "fresh",
+                "headline": "Fresh",
+                "summary": "Fresh summary",
+                "entities": ["Bitcoin"],
+                "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
+                "articles": [],
+            },
+            {
+                "cluster_id": "stale",
+                "headline": "Stale",
+                "summary": "Stale summary",
+                "entities": ["Iran"],
+                "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
+                "articles": [],
+            },
+        ], topic="other")
+
+        with store._conn() as conn:
+            conn.execute(
+                "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
+                ("2025-01-01T00:00:00+00:00", "stale"),
+            )
+
+        deleted = store.prune_clusters(retention_days=30)
+
+        assert deleted == 1
+        assert store.get_cluster_by_id("stale") is None
+        assert store.get_cluster_by_id("fresh") is not None
+        assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
+
+
+def test_prune_if_due_skips_deletes_when_pruning_disabled():
+    with tempfile.TemporaryDirectory() as td:
+        db = Path(td) / "news.sqlite"
+        store = SQLiteClusterStore(db)
+        store.upsert_clusters([
+            {
+                "cluster_id": "stale",
+                "headline": "Stale",
+                "summary": "Stale summary",
+                "entities": ["Iran"],
+                "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
+                "articles": [],
+            }
+        ], topic="other")
+
+        with store._conn() as conn:
+            conn.execute(
+                "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
+                ("2025-01-01T00:00:00+00:00", "stale"),
+            )
+
+        result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
+
+        assert result["enabled"] is False
+        assert result["deleted"] == 0
+        assert store.get_cluster_by_id("stale") is not None
+
+
 def test_blacklist_filters_entities_case_insensitively():
     entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
     filtered = _filter_entities(entities, blacklist=["bloomberg"])
@@ -104,6 +171,35 @@ def test_load_prompt_reads_prompt_files():
     assert "Return STRICT JSON" in text
 
 
+def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
+    import news_mcp.trends_resolution as trends_resolution
+
+    trends_resolution.resolve_entity_via_trends.cache_clear()
+    trends_resolution._provider.cache_clear()
+    monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
+
+    resolved = resolve_entity_via_trends("btc")
+
+    assert resolved["normalized"] == "Bitcoin"
+    assert resolved["canonical_label"] == "Bitcoin"
+    assert resolved["mid"] is None
+    assert resolved["candidates"] == []
+    assert resolved["source"] == "fallback"
+
+    trends_resolution.resolve_entity_via_trends.cache_clear()
+
+
+def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
+    clusters = [
+        {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9},
+        {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1},
+    ]
+
+    sorted_clusters = _sort_clusters_by_recency(clusters)
+
+    assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
+
+
 def test_build_extraction_prompt_is_stable_without_blacklist():
     cluster = {
         "headline": "Bloomberg reports Bitcoin rallies after US rate comments",