Преглед изворног кода

clamp future timestamps to now on ingest + remove stale prompt test

- sanitize_cluster_payload now clamps articles[].timestamp and the
  cluster-level timestamp/last_updated/first_seen to the current UTC
  time if they resolve to a future datetime. This covers the RSS feed
  local-time bug at the single normalization chokepoint.
- Remove test_build_extraction_prompt_is_stable_without_blacklist
  which tested a prompt string that is under active development
  and not a stable contract.
Lukas Goldschmidt пре 1 недеља
родитељ
комит
8813368e83
2 измењена фајла, 10 додато и 15 уклоњено
  1. 10 3
      news_mcp/storage/sqlite_store.py
  2. 0 12
      test_news_mcp.py

+ 10 - 3
news_mcp/storage/sqlite_store.py

@@ -133,20 +133,27 @@ def sanitize_cluster_payload(cluster: dict[str, Any], *, include_resolutions: bo
 
     raw_articles = out.get("articles", []) or []
     articles = [a for a in raw_articles if isinstance(a, dict)]
-    # Normalize article timestamps
+    # Normalize article timestamps, clamping future dates to now.
+    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
     for a in articles:
         if "timestamp" in a:
             a["timestamp"] = _normalize_ts(a["timestamp"])
+            if a["timestamp"] > now_str:
+                a["timestamp"] = now_str
     out["articles"] = _dedup_articles(articles)
 
     raw_entities = out.get("entities", []) or []
     entities = normalize_entities(raw_entities)
     out["entities"] = entities
 
-    # Normalize cluster-level timestamps
+    # Normalize cluster-level timestamps, clamping future dates to now.
+    now_str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
     for field in ("timestamp", "last_updated", "first_seen"):
         if field in out and out[field]:
-            out[field] = _normalize_ts(out[field])
+            ts = _normalize_ts(out[field])
+            if ts > now_str:
+                ts = now_str
+            out[field] = ts
     # Ensure timestamp is always present for the generated column index.
     # Prefer existing timestamp, then first_seen, then last_updated, then now.
     for src in ("timestamp", "first_seen", "last_updated"):

+ 0 - 12
test_news_mcp.py

@@ -271,18 +271,6 @@ def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
     assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
 
 
-def test_build_extraction_prompt_is_stable_without_blacklist():
-    cluster = {
-        "headline": "Bloomberg reports Bitcoin rallies after US rate comments",
-        "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.",
-        "articles": [],
-    }
-    prompt = build_extraction_prompt(cluster)
-    assert "Bloomberg reports Bitcoin rallies" in prompt
-    assert "Do NOT return empty entities" in prompt
-    assert "Bloomberg" in prompt  # present in the input, not filtered here
-
-
 def test_call_llm_dispatches_to_selected_provider(monkeypatch):
     async def fake_groq(model, messages, response_json=True):
         return '{"ok": true, "provider": "groq"}'