@@ -1,19 +1,30 @@
 from __future__ import annotations
 
+import asyncio
+import logging
+from collections import Counter
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+
 from fastapi import FastAPI
 from mcp.server.fastmcp import FastMCP
 from mcp.server.transport_security import TransportSecuritySettings
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DEFAULT_TOPICS, DB_PATH
-from news_mcp.config import NEWS_REFRESH_INTERVAL_SECONDS, NEWS_BACKGROUND_REFRESH_ENABLED, NEWS_BACKGROUND_REFRESH_ON_START
+from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DEFAULT_TOPICS, DB_PATH
+from news_mcp.config import (
+    NEWS_PRUNE_INTERVAL_HOURS,
+    NEWS_PRUNING_ENABLED,
+    NEWS_REFRESH_INTERVAL_SECONDS,
+    NEWS_BACKGROUND_REFRESH_ENABLED,
+    NEWS_BACKGROUND_REFRESH_ON_START,
+    NEWS_RETENTION_DAYS,
+)
 from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
-from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
+from news_mcp.enrichment.llm_enrich import summarize_cluster_llm
 from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.llm import active_llm_config
 from news_mcp.entity_normalize import normalize_query
-from collections import Counter
-import logging
 
 
 mcp = FastMCP(
@@ -37,7 +48,40 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
     return [v for v in values if v]
 
 
-@mcp.tool(description="What is happening right now? Return the latest deduplicated news clusters for a topic.")
+def _parse_cluster_timestamp(value) -> datetime:
+    if not value:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    text = str(value).strip()
+    if not text:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    try:
+        dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        pass
+    try:
+        dt = parsedate_to_datetime(text)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        return datetime.min.replace(tzinfo=timezone.utc)
+
+
+def _sort_clusters_by_recency(clusters: list[dict]) -> list[dict]:
+    return sorted(
+        clusters,
+        key=lambda c: (
+            _parse_cluster_timestamp(c.get("timestamp")),
+            float(c.get("importance", 0.0) or 0.0),
+        ),
+        reverse=True,
+    )
+
+
+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
 async def get_latest_events(topic: str = "crypto", limit: int = 5, include_articles: bool = False):
     limit = max(1, min(int(limit), 20))
     # If the caller passes an entity-like value, resolve it and use the canonical
@@ -58,14 +102,14 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
 
     if is_topic:
         # Cache-first: only refresh if we currently have no fresh clusters for this topic.
-        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
         if not clusters:
             await refresh_clusters(topic=topic_norm, limit=200)
-            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
     else:
         # Entity-aware mode: search recent clusters across all topics and match by
         # raw entity, canonical label, or MID.
-        clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 8)
+        clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit * 8)
         filtered = []
         for c in clusters:
             haystack = _cluster_entity_haystack(c)
@@ -75,11 +119,8 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
                     break
         clusters = filtered
 
-    # Ensure the response is compact and agent-friendly.
-    clusters_sorted = sorted(clusters, key=lambda x: float(x.get("importance", 0.0)), reverse=True)
-
     out = []
-    for c in clusters_sorted:
+    for c in _sort_clusters_by_recency(clusters):
         item = {
             "cluster_id": c.get("cluster_id"),
             "headline": c.get("headline"),
@@ -108,7 +149,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
     return out
 
 
-@mcp.tool(description="What's happening with X? Filter clusters by extracted entity substring (case-insensitive) within a timeframe.")
+@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
 async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
     limit = max(1, min(int(limit), 30))
     query = normalize_query(entity).strip().lower()
@@ -128,7 +169,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
     def _match_clusters(clusters: list[dict]) -> list[dict]:
         hits: list[dict] = []
-        for c in clusters:
+        for c in _sort_clusters_by_recency(clusters):
             haystack = _cluster_entity_haystack(c)
             if any(any(term in item for item in haystack) for term in query_terms):
                 hits.append(c)
@@ -136,14 +177,10 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
                     break
         return hits
 
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 5)
-    hits = _match_clusters(clusters)
-
     hours = _parse_timeframe_to_hours(timeframe)
     clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=max(200, limit * 10))
     hits = _match_clusters(clusters)
 
-    # Compress to tool response shape.
     out = []
     for c in hits:
         item = {
@@ -172,14 +209,14 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
     return out
 
 
-@mcp.tool(description="Explain an event clearly by cluster_id (Groq summary).")
+@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts.")
 async def get_event_summary(event_id: str, include_articles: bool = False):
     store = SQLiteClusterStore(DB_PATH)
 
     # Summary cache: reuse if present within TTL.
     cached_summary = store.get_cluster_summary(
         cluster_id=event_id,
-        ttl_hours=CLUSTERS_TTL_HOURS,
+        ttl_hours=DEFAULT_LOOKBACK_HOURS,
     )
     if cached_summary:
         out = {
@@ -226,7 +263,7 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
         if isinstance(a, dict)
     ]
 
-    summary = await summarize_cluster_groq(cluster)
+    summary = await summarize_cluster_llm(cluster)
 
     store.upsert_cluster_summary(event_id, summary)
     out = {
@@ -242,13 +279,12 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
     return out
 
 
-@mcp.tool(description="Detect emerging topics/entities from recent cached news clusters.")
+@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters.")
 async def detect_emerging_topics(limit: int = 10):
     limit = max(1, min(int(limit), 20))
     store = SQLiteClusterStore(DB_PATH)
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=200)
+    clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=200)
 
-    from collections import Counter
     import re
 
     entity_counts = Counter()
@@ -280,9 +316,9 @@ async def detect_emerging_topics(limit: int = 10):
                 continue
             entity_counts[ent] += 1
             try:
-                    entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
+                entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
             except Exception:
-                    pass
+                pass
 
             # update co-occurrence counts
             for i in range(len(ents_in_cluster_norm)):
@@ -342,7 +378,7 @@ async def detect_emerging_topics(limit: int = 10):
     return emerging[:limit]
 
 
-@mcp.tool(description="What's the overall sentiment around an entity within a timeframe?")
+@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
@@ -428,7 +464,7 @@ def _parse_timeframe_to_hours(timeframe: str) -> int:
 
 
 @mcp.tool(
-    description="Given a subject entity, find related entities via co-occurrence inside recent clusters (entity-only, no topic fallback)."
+    description="Investigate which entities tend to appear alongside a subject entity in recent clusters, based on co-occurrence."
 )
 async def get_related_entities(subject: str, timeframe: str = "24h", limit: int = 10):
     store = SQLiteClusterStore(DB_PATH)
@@ -529,6 +565,14 @@ async def _start_background_refresh():
     _background_task_started = True
     logger.info("news-mcp llm config: %s", active_llm_config())
 
+    store = SQLiteClusterStore(DB_PATH)
+    prune_result = store.prune_if_due(
+        pruning_enabled=NEWS_PRUNING_ENABLED,
+        retention_days=NEWS_RETENTION_DAYS,
+        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+    )
+    logger.info("startup prune_result=%s", prune_result)
+
     async def _loop():
         if not NEWS_BACKGROUND_REFRESH_ON_START:
             await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
@@ -541,8 +585,6 @@ async def _start_background_refresh():
                 pass
         await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
 
-    import asyncio
-
    asyncio.create_task(_loop())
 
 
@@ -557,6 +599,14 @@ def root():
             "enabled": NEWS_BACKGROUND_REFRESH_ENABLED,
             "interval_seconds": NEWS_REFRESH_INTERVAL_SECONDS,
         },
+        "retention": {
+            "ttl_hours": DEFAULT_LOOKBACK_HOURS,
+            "retention_days": NEWS_RETENTION_DAYS,
+        },
+        "pruning": {
+            "enabled": NEWS_PRUNING_ENABLED,
+            "interval_hours": NEWS_PRUNE_INTERVAL_HOURS,
+        },
     }
 
 
@@ -565,7 +615,12 @@ def health():
     store = SQLiteClusterStore(DB_PATH)
     return {
         "status": "ok",
-        "ttl_hours": CLUSTERS_TTL_HOURS,
+        "ttl_hours": DEFAULT_LOOKBACK_HOURS,
         "db": str(DB_PATH),
         "refresh": store.get_feed_state("breakingthenews"),
+        "pruning": store.get_prune_state(
+            pruning_enabled=NEWS_PRUNING_ENABLED,
+            retention_days=NEWS_RETENTION_DAYS,
+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+        ),
     }