|
|
@@ -5,7 +5,7 @@ import hashlib
|
|
|
import logging
|
|
|
import sys
|
|
|
from collections import defaultdict
|
|
|
-from datetime import datetime, timezone
|
|
|
+from datetime import datetime, timezone, timedelta
|
|
|
from typing import Any, Dict
|
|
|
|
|
|
from news_mcp.config import (
|
|
|
@@ -23,7 +23,7 @@ from news_mcp.config import (
|
|
|
NEWS_CLUSTER_MAX_AGE_HOURS,
|
|
|
llm_concurrency,
|
|
|
)
|
|
|
-from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window
|
|
|
+from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window, _parse_ts
|
|
|
from news_mcp.enrichment.enrich import enrich_cluster
|
|
|
from news_mcp.enrichment.llm_enrich import classify_cluster_llm
|
|
|
from news_mcp.sources.news_feeds import fetch_news_articles
|
|
|
@@ -232,6 +232,41 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
|
|
|
return
|
|
|
|
|
|
articles = changed_articles
|
|
|
+
|
|
|
+ # Pre-filter: drop articles whose RSS timestamp is older than retention.
|
|
|
+ # This prevents stale feed items from being re-ingested after pruning.
|
|
|
+ if NEWS_RETENTION_DAYS > 0:
|
|
|
+ retention_cutoff = datetime.now(timezone.utc) - timedelta(days=NEWS_RETENTION_DAYS)
|
|
|
+ fresh_articles = []
|
|
|
+ for a in articles:
|
|
|
+ ts_str = a.get("timestamp", "")
|
|
|
+ if not ts_str:
|
|
|
+ fresh_articles.append(a)
|
|
|
+ continue
|
|
|
+ dt = _parse_ts(ts_str)
|
|
|
+ if dt is None:
|
|
|
+ fresh_articles.append(a)
|
|
|
+ continue
|
|
|
+ if dt >= retention_cutoff:
|
|
|
+ fresh_articles.append(a)
|
|
|
+ else:
|
|
|
+ logger.debug("drop stale article title=%s ts=%s", a.get("title", "")[:60], ts_str)
|
|
|
+ dropped = len(articles) - len(fresh_articles)
|
|
|
+ if dropped:
|
|
|
+ logger.info("refresh retention-filter dropped=%d remaining=%d retention_days=%.0f", dropped, len(fresh_articles), NEWS_RETENTION_DAYS)
|
|
|
+ articles = fresh_articles
|
|
|
+
|
|
|
+ if not articles:
|
|
|
+ logger.info("refresh no articles after retention filter topic=%s", topic)
|
|
|
+ store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
|
|
|
+ prune_result = store.prune_if_due(
|
|
|
+ pruning_enabled=NEWS_PRUNING_ENABLED,
|
|
|
+ retention_days=NEWS_RETENTION_DAYS,
|
|
|
+ interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
|
|
|
+ )
|
|
|
+ logger.info("refresh prune_result=%s", prune_result)
|
|
|
+ return
|
|
|
+
|
|
|
logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
|
|
|
|
|
|
# Pre-seed with recent clusters from the DB so new articles can merge
|