
news-mcp: clarify lookback, pruning, and tool docs

Lukas Goldschmidt 1 month ago
parent
commit
5ec094693f

+ 4 - 1
.env.example

@@ -29,7 +29,10 @@ NEWS_FEED_URLS=
 # Storage / refresh
 NEWS_MCP_DATA_DIR=
 NEWS_MCP_DB_PATH=
-NEWS_CLUSTERS_TTL_HOURS=24
+NEWS_DEFAULT_LOOKBACK_HOURS=24
+NEWS_PRUNING_ENABLED=true
+NEWS_RETENTION_DAYS=180
+NEWS_PRUNE_INTERVAL_HOURS=24
 NEWS_REFRESH_INTERVAL_SECONDS=900
 NEWS_BACKGROUND_REFRESH_ENABLED=true
 NEWS_BACKGROUND_REFRESH_ON_START=true

+ 21 - 2
README.md

@@ -23,6 +23,7 @@ Health:
 - Enriches clusters with configurable LLM providers/models (topic/entities/sentiment/keywords)
 - Applies a case-insensitive entity blacklist after extraction
 - Caches clusters + LLM fields in SQLite
+- Resolves entities in-process via Google Trends suggestions (no `trends-mcp` hop required)
 
 ## Tools (MCP)
 
@@ -36,7 +37,7 @@ Health:
 - when `include_articles=true`, includes `articles[].url` + minimal fields per returned cluster
 
 3) `get_event_summary(event_id, include_articles=false)`
-- Groq-written compressed narrative for a given `cluster_id`
+- LLM-written compressed narrative for a given `cluster_id`
 - when `include_articles=true`, includes the underlying `articles` list (with `url`) from the stored cluster
 
 4) `detect_emerging_topics(limit)`
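
Since the tool functions are plain async callables under FastMCP's decorator (the SDK's `tool` decorator registers and returns the original function), they can be smoke-tested directly. A minimal sketch, assuming a populated database; only `cluster_id` is a field the response shapes in this diff confirm:

```python
# Hedged local smoke test: await the tool functions directly,
# assuming the store already holds fresh clusters for the topic.
import asyncio

from news_mcp.mcp_server_fastmcp import get_event_summary, get_latest_events


async def main() -> None:
    events = await get_latest_events(topic="crypto", limit=3)
    if events:
        # event_id is the cluster_id of a returned cluster
        detail = await get_event_summary(events[0]["cluster_id"])
        print(detail)


asyncio.run(main())
```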
@@ -70,7 +71,10 @@ Key variables:
 - `NEWS_REFRESH_INTERVAL_SECONDS` (default 900)
 - `NEWS_BACKGROUND_REFRESH_ON_START` (default true)
 - `NEWS_BACKGROUND_REFRESH_ENABLED` (default true)
-- `NEWS_CLUSTERS_TTL_HOURS`
+- `NEWS_DEFAULT_LOOKBACK_HOURS` (freshness window for reads; older rows are ignored by queries)
+- `NEWS_PRUNING_ENABLED` (default true; if false, no rows are physically deleted)
+- `NEWS_RETENTION_DAYS` (physical delete threshold for stored clusters)
+- `NEWS_PRUNE_INTERVAL_HOURS` (how often in-server pruning may run)
 - `GROQ_ENRICH_OTHER_ONLY` (default false; set true for cost control)
 - `NEWS_EMBEDDINGS_ENABLED` (default false; enables Ollama embeddings for clustering when wired in)
 - `OLLAMA_BASE_URL` / `OLLAMA_URL` (default `http://127.0.0.1:11434`)
@@ -79,6 +83,21 @@ Key variables:
 
 When embeddings are enabled, news-mcp tries Ollama first and falls back to the existing heuristic clustering path if Ollama is unavailable.
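
A minimal sketch of that "Ollama first, heuristic fallback" shape, using Ollama's standard `/api/embeddings` endpoint; the model name is an assumption, and the real clustering path lives in `news_mcp.dedup.cluster`:

```python
# Sketch only: probe Ollama for an embedding; None signals the caller
# to drop to the existing heuristic clustering path.
import httpx

OLLAMA_BASE_URL = "http://127.0.0.1:11434"  # matches the documented default


def try_embed(text: str, model: str = "nomic-embed-text") -> list[float] | None:
    try:
        resp = httpx.post(
            f"{OLLAMA_BASE_URL}/api/embeddings",
            json={"model": model, "prompt": text},
            timeout=10.0,
        )
        resp.raise_for_status()
        return resp.json().get("embedding")
    except Exception:
        return None  # Ollama unavailable: fall back to heuristics
```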
 
+## TTL vs pruning
+
+These are intentionally different:
+
+- `NEWS_DEFAULT_LOOKBACK_HOURS` controls **read freshness** only. Older rows remain in SQLite but do not appear in normal "latest" queries.
+- `NEWS_PRUNING_ENABLED` controls whether the server is allowed to **physically delete** old rows.
+- `NEWS_RETENTION_DAYS` controls how old rows may get before they are deleted.
+- `NEWS_PRUNE_INTERVAL_HOURS` controls how often the server checks whether deletion is due.
+
+Pruning is self-contained inside the server; the prune-if-due check runs:
+- on startup
+- after each refresh cycle
+
+If `NEWS_PRUNING_ENABLED=false`, no pruning occurs and old rows are retained indefinitely.
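
A toy illustration of the two cutoffs under the defaults above (the real queries live in `SQLiteClusterStore`):

```python
# Rows between the two cutoffs stay in SQLite but are invisible to reads;
# only rows older than the retention cutoff are eligible for deletion.
from datetime import datetime, timedelta, timezone

now = datetime.now(timezone.utc)
read_cutoff = now - timedelta(hours=24)    # NEWS_DEFAULT_LOOKBACK_HOURS
delete_cutoff = now - timedelta(days=180)  # NEWS_RETENTION_DAYS


def visible_to_reads(updated_at: datetime) -> bool:
    return updated_at >= read_cutoff


def eligible_for_pruning(updated_at: datetime) -> bool:
    return updated_at < delete_cutoff
```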
+
 ## Live extraction smoke test
 
 Run a standardized, fabricated extraction test against the currently configured provider/model:

+ 4 - 1
news_mcp/config.py

@@ -17,7 +17,7 @@ NEWS_FEED_URLS = os.getenv("NEWS_FEED_URLS", os.getenv("NEWS_RSS_FEED_URLS", "")
 RSS_FEED_URL = NEWS_FEED_URL
 RSS_FEED_URLS = NEWS_FEED_URLS
 
-CLUSTERS_TTL_HOURS = float(os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24"))
+DEFAULT_LOOKBACK_HOURS = float(os.getenv("NEWS_DEFAULT_LOOKBACK_HOURS", os.getenv("NEWS_CLUSTERS_TTL_HOURS", "24")))
 DEFAULT_TOPICS = ["crypto", "macro", "regulation", "ai", "other"]
 
 # LLM extraction / summarization
@@ -42,3 +42,6 @@ NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"
+NEWS_PRUNING_ENABLED = os.getenv("NEWS_PRUNING_ENABLED", "true").lower() == "true"
+NEWS_RETENTION_DAYS = float(os.getenv("NEWS_RETENTION_DAYS", "180"))
+NEWS_PRUNE_INTERVAL_HOURS = float(os.getenv("NEWS_PRUNE_INTERVAL_HOURS", "24"))

+ 7 - 2
news_mcp/enrichment/llm_enrich.py

@@ -26,7 +26,7 @@ def _filter_entities(entities, blacklist=None):
     return out
 
 
-async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
+async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_extraction(cluster)
     out = dict(cluster)
     topic = parsed.get("topic", cluster.get("topic"))
@@ -46,6 +46,11 @@ async def classify_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
     return out
 
 
-async def summarize_cluster_groq(cluster: Dict[str, Any]) -> Dict[str, Any]:
+async def summarize_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
     parsed = await call_summary(cluster)
     return parsed
+
+
+# Backward-compatible aliases during the transition away from provider-specific naming.
+classify_cluster_groq = classify_cluster_llm
+summarize_cluster_groq = summarize_cluster_llm
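
With these aliases, callers still importing the old names get the identical coroutine functions:

```python
# Transitional aliasing: both names resolve to the same coroutine object.
from news_mcp.enrichment import llm_enrich

assert llm_enrich.classify_cluster_groq is llm_enrich.classify_cluster_llm
assert llm_enrich.summarize_cluster_groq is llm_enrich.summarize_cluster_llm
```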

+ 22 - 5
news_mcp/jobs/poller.py

@@ -3,15 +3,21 @@ from __future__ import annotations
 import logging
 from typing import Any, Dict
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
+from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.enrichment.enrich import enrich_cluster
-from news_mcp.enrichment.llm_enrich import classify_cluster_groq
+from news_mcp.enrichment.llm_enrich import classify_cluster_llm
 from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.sources.news_feeds import fetch_news_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
-from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
+from news_mcp.config import (
+    GROQ_ENRICH_OTHER_ONLY,
+    GROQ_MAX_CLUSTERS_PER_REFRESH,
+    NEWS_PRUNE_INTERVAL_HOURS,
+    NEWS_PRUNING_ENABLED,
+    NEWS_RETENTION_DAYS,
+)
 
 
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
@@ -36,6 +42,12 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     prev_hash = store.get_feed_hash(feed_key)
     if prev_hash == last_hash:
         logger.info("refresh unchanged feed_key=%s topic=%s", feed_key, topic)
+        prune_result = store.prune_if_due(
+            pruning_enabled=NEWS_PRUNING_ENABLED,
+            retention_days=NEWS_RETENTION_DAYS,
+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+        )
+        logger.info("refresh prune_result=%s", prune_result)
         return
     logger.info("refresh changed feed_key=%s topic=%s", feed_key, topic)
     store.set_feed_hash(feed_key, last_hash)
@@ -75,11 +87,16 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
                     if existing.get("keywords"):
                         c2["keywords"] = existing.get("keywords")
                 else:
-                    c2 = await classify_cluster_groq(c2)
+                    c2 = await classify_cluster_llm(c2)
 
             enriched.append(c2)
 
         store.upsert_clusters(enriched, topic=t)
         logger.info("refresh stored topic=%s clusters=%s", t, len(enriched))
 
-            
+    prune_result = store.prune_if_due(
+        pruning_enabled=NEWS_PRUNING_ENABLED,
+        retention_days=NEWS_RETENTION_DAYS,
+        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+    )
+    logger.info("refresh prune_result=%s", prune_result)

+ 87 - 32
news_mcp/mcp_server_fastmcp.py

@@ -1,19 +1,30 @@
 from __future__ import annotations
 
+import asyncio
+import logging
+from collections import Counter
+from datetime import datetime, timezone
+from email.utils import parsedate_to_datetime
+
 from fastapi import FastAPI
 from mcp.server.fastmcp import FastMCP
 from mcp.server.transport_security import TransportSecuritySettings
 
-from news_mcp.config import CLUSTERS_TTL_HOURS, DEFAULT_TOPICS, DB_PATH
-from news_mcp.config import NEWS_REFRESH_INTERVAL_SECONDS, NEWS_BACKGROUND_REFRESH_ENABLED, NEWS_BACKGROUND_REFRESH_ON_START
+from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DEFAULT_TOPICS, DB_PATH
+from news_mcp.config import (
+    NEWS_PRUNE_INTERVAL_HOURS,
+    NEWS_PRUNING_ENABLED,
+    NEWS_REFRESH_INTERVAL_SECONDS,
+    NEWS_BACKGROUND_REFRESH_ENABLED,
+    NEWS_BACKGROUND_REFRESH_ON_START,
+    NEWS_RETENTION_DAYS,
+)
 from news_mcp.jobs.poller import refresh_clusters
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
-from news_mcp.enrichment.llm_enrich import summarize_cluster_groq
+from news_mcp.enrichment.llm_enrich import summarize_cluster_llm
 from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.llm import active_llm_config
 from news_mcp.entity_normalize import normalize_query
-from collections import Counter
-import logging
 
 
 mcp = FastMCP(
@@ -37,7 +48,40 @@ def _cluster_entity_haystack(cluster: dict) -> list[str]:
     return [v for v in values if v]
 
 
-@mcp.tool(description="What is happening right now? Return the latest deduplicated news clusters for a topic.")
+def _parse_cluster_timestamp(value) -> datetime:
+    if not value:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    text = str(value).strip()
+    if not text:
+        return datetime.min.replace(tzinfo=timezone.utc)
+    try:
+        dt = datetime.fromisoformat(text.replace("Z", "+00:00"))
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        pass
+    try:
+        dt = parsedate_to_datetime(text)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        return datetime.min.replace(tzinfo=timezone.utc)
+
+
+def _sort_clusters_by_recency(clusters: list[dict]) -> list[dict]:
+    return sorted(
+        clusters,
+        key=lambda c: (
+            _parse_cluster_timestamp(c.get("timestamp")),
+            float(c.get("importance", 0.0) or 0.0),
+        ),
+        reverse=True,
+    )
+
+
+@mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters, sorted by recency.")
 async def get_latest_events(topic: str = "crypto", limit: int = 5, include_articles: bool = False):
     limit = max(1, min(int(limit), 20))
     # If the caller passes an entity-like value, resolve it and use the canonical
@@ -58,14 +102,14 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
 
     if is_topic:
         # Cache-first: only refresh if we currently have no fresh clusters for this topic.
-        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+        clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
         if not clusters:
             await refresh_clusters(topic=topic_norm, limit=200)
-            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=CLUSTERS_TTL_HOURS, limit=limit)
+            clusters = store.get_latest_clusters(topic=topic_norm, ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit)
     else:
         # Entity-aware mode: search recent clusters across all topics and match by
         # raw entity, canonical label, or MID.
-        clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 8)
+        clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=limit * 8)
         filtered = []
         for c in clusters:
             haystack = _cluster_entity_haystack(c)
@@ -75,11 +119,8 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
                 break
         clusters = filtered
 
-    # Ensure the response is compact and agent-friendly.
-    clusters_sorted = sorted(clusters, key=lambda x: float(x.get("importance", 0.0)), reverse=True)
-
     out = []
-    for c in clusters_sorted:
+    for c in _sort_clusters_by_recency(clusters):
         item = {
             "cluster_id": c.get("cluster_id"),
             "headline": c.get("headline"),
@@ -108,7 +149,7 @@ async def get_latest_events(topic: str = "crypto", limit: int = 5, include_artic
     return out
 
 
-@mcp.tool(description="What's happening with X? Filter clusters by extracted entity substring (case-insensitive) within a timeframe.")
+@mcp.tool(description="Investigate a person, company, place, or theme by matching extracted entities within a time window.")
 async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "24h", include_articles: bool = False):
     limit = max(1, min(int(limit), 30))
     query = normalize_query(entity).strip().lower()
@@ -128,7 +169,7 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
 
     def _match_clusters(clusters: list[dict]) -> list[dict]:
         hits: list[dict] = []
-        for c in clusters:
+        for c in _sort_clusters_by_recency(clusters):
             haystack = _cluster_entity_haystack(c)
             if any(any(term in item for item in haystack) for term in query_terms):
                 hits.append(c)
@@ -136,14 +177,10 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
                 break
         return hits
 
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=limit * 5)
-    hits = _match_clusters(clusters)
-
     hours = _parse_timeframe_to_hours(timeframe)
     clusters = store.get_latest_clusters_all_topics(ttl_hours=hours, limit=max(200, limit * 10))
     hits = _match_clusters(clusters)
 
-    # Compress to tool response shape.
     out = []
     for c in hits:
         item = {
@@ -172,14 +209,14 @@ async def get_events_for_entity(entity: str, limit: int = 10, timeframe: str = "
     return out
 
 
-@mcp.tool(description="Explain an event clearly by cluster_id (Groq summary).")
+@mcp.tool(description="Investigate one cluster in depth and return a concise LLM-written explanation plus key facts.")
 async def get_event_summary(event_id: str, include_articles: bool = False):
     store = SQLiteClusterStore(DB_PATH)
 
     # Summary cache: reuse if present within TTL.
     cached_summary = store.get_cluster_summary(
         cluster_id=event_id,
-        ttl_hours=CLUSTERS_TTL_HOURS,
+        ttl_hours=DEFAULT_LOOKBACK_HOURS,
     )
     if cached_summary:
         out = {
@@ -226,7 +263,7 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
             if isinstance(a, dict)
         ]
 
-    summary = await summarize_cluster_groq(cluster)
+    summary = await summarize_cluster_llm(cluster)
 
     store.upsert_cluster_summary(event_id, summary)
     out = {
@@ -242,13 +279,12 @@ async def get_event_summary(event_id: str, include_articles: bool = False):
     return out
 
 
-@mcp.tool(description="Detect emerging topics/entities from recent cached news clusters.")
+@mcp.tool(description="Explore what is starting to matter: surface emerging entities and phrases from recent clusters.")
 async def detect_emerging_topics(limit: int = 10):
     limit = max(1, min(int(limit), 20))
     store = SQLiteClusterStore(DB_PATH)
-    clusters = store.get_latest_clusters_all_topics(ttl_hours=CLUSTERS_TTL_HOURS, limit=200)
+    clusters = store.get_latest_clusters_all_topics(ttl_hours=DEFAULT_LOOKBACK_HOURS, limit=200)
 
-    from collections import Counter
     import re
 
     entity_counts = Counter()
@@ -280,9 +316,9 @@ async def detect_emerging_topics(limit: int = 10):
                 continue
             entity_counts[ent] += 1
             try:
-                    entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
+                entity_importance_sum[ent] += float(c.get("importance", 0.0) or 0.0)
             except Exception:
-                    pass
+                pass
 
         # update co-occurrence counts
         for i in range(len(ents_in_cluster_norm)):
@@ -342,7 +378,7 @@ async def detect_emerging_topics(limit: int = 10):
     return emerging[:limit]
 
 
-@mcp.tool(description="What's the overall sentiment around an entity within a timeframe?")
+@mcp.tool(description="Investigate whether sentiment around an entity is positive, negative, or neutral over a chosen lookback window.")
 async def get_news_sentiment(entity: str, timeframe: str = "24h"):
     store = SQLiteClusterStore(DB_PATH)
 
@@ -428,7 +464,7 @@ def _parse_timeframe_to_hours(timeframe: str) -> int:
 
 
 @mcp.tool(
-    description="Given a subject entity, find related entities via co-occurrence inside recent clusters (entity-only, no topic fallback)."
+    description="Investigate which entities tend to appear alongside a subject entity in recent clusters, based on co-occurrence."
 )
 async def get_related_entities(subject: str, timeframe: str = "24h", limit: int = 10):
     store = SQLiteClusterStore(DB_PATH)
@@ -529,6 +565,14 @@ async def _start_background_refresh():
     _background_task_started = True
     logger.info("news-mcp llm config: %s", active_llm_config())
 
+    store = SQLiteClusterStore(DB_PATH)
+    prune_result = store.prune_if_due(
+        pruning_enabled=NEWS_PRUNING_ENABLED,
+        retention_days=NEWS_RETENTION_DAYS,
+        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+    )
+    logger.info("startup prune_result=%s", prune_result)
+
     async def _loop():
         if not NEWS_BACKGROUND_REFRESH_ON_START:
             await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
@@ -541,8 +585,6 @@ async def _start_background_refresh():
                 pass
             await asyncio.sleep(float(NEWS_REFRESH_INTERVAL_SECONDS))
 
-    import asyncio
-
     asyncio.create_task(_loop())
 
 
@@ -557,6 +599,14 @@ def root():
             "enabled": NEWS_BACKGROUND_REFRESH_ENABLED,
             "interval_seconds": NEWS_REFRESH_INTERVAL_SECONDS,
         },
+        "retention": {
+            "ttl_hours": DEFAULT_LOOKBACK_HOURS,
+            "retention_days": NEWS_RETENTION_DAYS,
+        },
+        "pruning": {
+            "enabled": NEWS_PRUNING_ENABLED,
+            "interval_hours": NEWS_PRUNE_INTERVAL_HOURS,
+        },
     }
 
 
@@ -565,7 +615,12 @@ def health():
     store = SQLiteClusterStore(DB_PATH)
     return {
         "status": "ok",
-        "ttl_hours": CLUSTERS_TTL_HOURS,
+        "ttl_hours": DEFAULT_LOOKBACK_HOURS,
         "db": str(DB_PATH),
         "refresh": store.get_feed_state("breakingthenews"),
+        "pruning": store.get_prune_state(
+            pruning_enabled=NEWS_PRUNING_ENABLED,
+            retention_days=NEWS_RETENTION_DAYS,
+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
+        ),
     }
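
For reference, a hypothetical smoke check of the extended `/health` payload; the host and port are assumptions, and values depend on env and store state:

```python
# Sketch: fetch /health from a locally running instance and inspect the
# pruning block added in this commit.
import httpx

health = httpx.get("http://127.0.0.1:8000/health").json()
print(health["ttl_hours"])                 # DEFAULT_LOOKBACK_HOURS
print(health["pruning"]["enabled"])        # NEWS_PRUNING_ENABLED
print(health["pruning"]["last_prune_at"])  # None until the first prune runs
```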

+ 98 - 0
news_mcp/storage/sqlite_store.py

@@ -20,6 +20,9 @@ class ClusterRow:
     updated_at: datetime
 
 
+META_LAST_PRUNE_AT = "last_prune_at"
+
+
 def _article_key(article: dict[str, Any]) -> str:
     url = str(article.get("url") or "").strip()
     if not url:
@@ -123,6 +126,9 @@ class SQLiteClusterStore:
             conn.execute(
                 "CREATE INDEX IF NOT EXISTS idx_clusters_topic ON clusters(topic)"
             )
+            conn.execute(
+                "CREATE INDEX IF NOT EXISTS idx_clusters_updated_at ON clusters(updated_at)"
+            )
 
             conn.execute(
                 """
@@ -134,6 +140,15 @@ class SQLiteClusterStore:
                 """
             )
 
+            conn.execute(
+                """
+                CREATE TABLE IF NOT EXISTS meta (
+                  key TEXT PRIMARY KEY,
+                  value TEXT NOT NULL
+                )
+                """
+            )
+
     def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
         now = datetime.now(timezone.utc)
         with self._conn() as conn:
@@ -241,3 +256,86 @@ class SQLiteClusterStore:
             if not row:
                 return None
             return {"last_hash": row[0], "updated_at": row[1]}
+
+    def get_meta(self, key: str) -> str | None:
+        with self._conn() as conn:
+            cur = conn.execute("SELECT value FROM meta WHERE key=?", (key,))
+            row = cur.fetchone()
+            return row[0] if row else None
+
+    def set_meta(self, key: str, value: str) -> None:
+        with self._conn() as conn:
+            conn.execute(
+                "INSERT INTO meta(key, value) VALUES(?, ?) "
+                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
+                (key, value),
+            )
+
+    def prune_clusters(self, retention_days: float) -> int:
+        retention_days = float(retention_days)
+        if retention_days <= 0:
+            return 0
+        cutoff = datetime.now(timezone.utc) - timedelta(days=retention_days)
+        cutoff_iso = cutoff.isoformat()
+        pruned_at = datetime.now(timezone.utc).isoformat()
+        with self._conn() as conn:
+            cur = conn.execute("DELETE FROM clusters WHERE updated_at < ?", (cutoff_iso,))
+            deleted = int(cur.rowcount or 0)
+            conn.execute(
+                "INSERT INTO meta(key, value) VALUES(?, ?) "
+                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
+                (META_LAST_PRUNE_AT, pruned_at),
+            )
+            return deleted
+
+    def prune_if_due(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
+        retention_days = float(retention_days)
+        interval_hours = float(interval_hours)
+        if (not pruning_enabled) or retention_days <= 0:
+            return {
+                "enabled": bool(pruning_enabled),
+                "deleted": 0,
+                "due": False,
+                "retention_days": retention_days,
+                "interval_hours": interval_hours,
+                "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
+            }
+
+        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
+        now = datetime.now(timezone.utc)
+        due = True
+        if last_prune_at:
+            try:
+                last_dt = datetime.fromisoformat(last_prune_at)
+                due = now - last_dt >= timedelta(hours=max(1.0, interval_hours))
+            except Exception:
+                due = True
+
+        if not due:
+            return {
+                "enabled": True,
+                "deleted": 0,
+                "due": False,
+                "retention_days": retention_days,
+                "interval_hours": interval_hours,
+                "last_prune_at": last_prune_at,
+            }
+
+        deleted = self.prune_clusters(retention_days)
+        last_prune_at = self.get_meta(META_LAST_PRUNE_AT)
+        return {
+            "enabled": True,
+            "deleted": deleted,
+            "due": True,
+            "retention_days": retention_days,
+            "interval_hours": interval_hours,
+            "last_prune_at": last_prune_at,
+        }
+
+    def get_prune_state(self, pruning_enabled: bool, retention_days: float, interval_hours: float = 24.0) -> dict[str, Any]:
+        return {
+            "enabled": bool(pruning_enabled),
+            "retention_days": float(retention_days),
+            "interval_hours": float(interval_hours),
+            "last_prune_at": self.get_meta(META_LAST_PRUNE_AT),
+        }

+ 79 - 26
news_mcp/trends_resolution.py

@@ -1,20 +1,77 @@
 from __future__ import annotations
 
 import json
-import os
-import subprocess
+from datetime import datetime, timezone
 from functools import lru_cache
 from typing import Any
+from urllib.parse import quote
+
+import httpx
 
 from news_mcp.entity_normalize import normalize_entity
 
 
+class GoogleTrendsError(RuntimeError):
+    pass
+
+
+class GoogleTrendsProvider:
+    """Minimal in-process Google Trends adapter used by news-mcp.
+
+    We only need entity suggestions for the resolver path, so keep this module
+    intentionally narrow rather than importing the full trends-mcp server.
+    """
+
+    _SUGGESTIONS_URL = "https://trends.google.com/trends/api/autocomplete/"
+
+    def __init__(self, *, hl: str = "en-US", tz: int = 120, timeout: float = 10.0):
+        self.hl = hl
+        self.tz = tz
+        self.timeout = timeout
+        self._headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/135.0.0.0 Safari/537.36"
+            ),
+            "Accept": "application/json,text/javascript,*/*;q=0.1",
+        }
+
+    def suggestions(self, keyword: str) -> list[dict[str, Any]]:
+        url = self._SUGGESTIONS_URL + quote(keyword)
+        params = {"hl": self.hl, "tz": str(self.tz)}
+        try:
+            response = httpx.get(url, params=params, headers=self._headers, timeout=self.timeout, follow_redirects=True)
+            response.raise_for_status()
+            text = response.text.strip()
+            if text.startswith(")]}',"):
+                text = text[5:]
+            payload = json.loads(text)
+            default = payload.get("default") if isinstance(payload, dict) else None
+            topics = default.get("topics") if isinstance(default, dict) else None
+            return topics if isinstance(topics, list) else []
+        except Exception as exc:  # pragma: no cover - network/provider dependent
+            raise GoogleTrendsError(f"suggestions failed for {keyword!r}: {exc}") from exc
+
+
+@lru_cache(maxsize=1)
+def _provider() -> GoogleTrendsProvider | None:
+    try:
+        return GoogleTrendsProvider()
+    except Exception:
+        return None
+
+
+def _resolved_at() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
 @lru_cache(maxsize=1024)
 def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
-    """Resolve a normalized entity through trends-mcp, falling back cleanly.
+    """Resolve an entity locally via Google Trends suggestions.
 
-    The input is normalized first using the same local normalization rules used
-    everywhere else in news-mcp, so query and storage paths stay aligned.
+    The returned shape intentionally mirrors the former trends-mcp bridge so the
+    rest of news-mcp can stay unchanged during the migration.
     """
     normalized = normalize_entity(entity)
     if not normalized:
@@ -24,34 +81,28 @@ def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
             "canonical_label": "",
             "mid": None,
             "type": None,
+            "candidates": [],
             "source": "empty",
+            "resolved_at": _resolved_at(),
         }
 
-    config = os.getenv("MCPORTER_CONFIG", os.path.expanduser("~/.openclaw/workspace/config/mcporter.json"))
-    command = [
-        "mcporter",
-        "--config",
-        config,
-        "call",
-        "trends.resolve_entity",
-        f"keyword={normalized}",
-    ]
-
-    try:
-        proc = subprocess.run(command, capture_output=True, text=True, timeout=20, check=False)
-        if proc.returncode == 0 and proc.stdout.strip():
-            payload = json.loads(proc.stdout)
+    provider = _provider()
+    if provider is not None:
+        try:
+            suggestions = provider.suggestions(normalized)
+            best = suggestions[0] if suggestions else None
             return {
                 "raw": entity,
                 "normalized": normalized,
-                "canonical_label": payload.get("canonical_label") or normalized,
-                "mid": payload.get("mid"),
-                "type": payload.get("type"),
-                "candidates": payload.get("candidates", []),
-                "source": "trends-mcp",
+                "canonical_label": best.get("title") if best else normalized,
+                "mid": best.get("mid") if best else None,
+                "type": best.get("type") if best else None,
+                "candidates": suggestions,
+                "source": "google-trends",
+                "resolved_at": _resolved_at(),
             }
-    except Exception:
-        pass
+        except Exception:
+            pass
 
     # Conservative fallback: keep the local normalized form and leave MID unset.
     return {
@@ -60,5 +111,7 @@ def resolve_entity_via_trends(entity: str) -> dict[str, Any]:
         "canonical_label": normalized,
         "mid": None,
         "type": None,
+        "candidates": [],
         "source": "fallback",
+        "resolved_at": _resolved_at(),
     }

+ 96 - 0
test_news_mcp.py

@@ -9,6 +9,8 @@ from news_mcp.enrichment.importance import compute_importance
 from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
 from news_mcp.entity_normalize import normalize_query, normalize_entities
 from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
+from news_mcp.trends_resolution import resolve_entity_via_trends
+from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
 
 
 def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
@@ -77,6 +79,71 @@ def test_sqlite_summary_cache_roundtrip():
         assert cached["keyFacts"] == ["Fact 1"]
 
 
+def test_prune_clusters_deletes_rows_older_than_retention():
+    with tempfile.TemporaryDirectory() as td:
+        db = Path(td) / "news.sqlite"
+        store = SQLiteClusterStore(db)
+        store.upsert_clusters([
+            {
+                "cluster_id": "fresh",
+                "headline": "Fresh",
+                "summary": "Fresh summary",
+                "entities": ["Bitcoin"],
+                "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
+                "articles": [],
+            },
+            {
+                "cluster_id": "stale",
+                "headline": "Stale",
+                "summary": "Stale summary",
+                "entities": ["Iran"],
+                "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
+                "articles": [],
+            },
+        ], topic="other")
+
+        with store._conn() as conn:
+            conn.execute(
+                "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
+                ("2025-01-01T00:00:00+00:00", "stale"),
+            )
+
+        deleted = store.prune_clusters(retention_days=30)
+
+        assert deleted == 1
+        assert store.get_cluster_by_id("stale") is None
+        assert store.get_cluster_by_id("fresh") is not None
+        assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
+
+
+def test_prune_if_due_skips_deletes_when_pruning_disabled():
+    with tempfile.TemporaryDirectory() as td:
+        db = Path(td) / "news.sqlite"
+        store = SQLiteClusterStore(db)
+        store.upsert_clusters([
+            {
+                "cluster_id": "stale",
+                "headline": "Stale",
+                "summary": "Stale summary",
+                "entities": ["Iran"],
+                "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
+                "articles": [],
+            }
+        ], topic="other")
+
+        with store._conn() as conn:
+            conn.execute(
+                "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
+                ("2025-01-01T00:00:00+00:00", "stale"),
+            )
+
+        result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
+
+        assert result["enabled"] is False
+        assert result["deleted"] == 0
+        assert store.get_cluster_by_id("stale") is not None
+
+
 def test_blacklist_filters_entities_case_insensitively():
     entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
     filtered = _filter_entities(entities, blacklist=["bloomberg"])
@@ -104,6 +171,35 @@ def test_load_prompt_reads_prompt_files():
     assert "Return STRICT JSON" in text
 
 
+def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
+    import news_mcp.trends_resolution as trends_resolution
+
+    trends_resolution.resolve_entity_via_trends.cache_clear()
+    trends_resolution._provider.cache_clear()
+    monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
+
+    resolved = resolve_entity_via_trends("btc")
+
+    assert resolved["normalized"] == "Bitcoin"
+    assert resolved["canonical_label"] == "Bitcoin"
+    assert resolved["mid"] is None
+    assert resolved["candidates"] == []
+    assert resolved["source"] == "fallback"
+
+    trends_resolution.resolve_entity_via_trends.cache_clear()
+
+
+def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
+    clusters = [
+        {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9},
+        {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1},
+    ]
+
+    sorted_clusters = _sort_clusters_by_recency(clusters)
+
+    assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
+
+
 def test_build_extraction_prompt_is_stable_without_blacklist():
     cluster = {
         "headline": "Bloomberg reports Bitcoin rallies after US rate comments",