Selaa lähdekoodia

Fix sentiment timeframe filtering by payload.timestamp, remove orphan code, add version hash to health endpoints

Root cause: get_sentiment_series and get_entity_frequencies filtered by
DB updated_at (row modification time) instead of payload.timestamp
(event time). Since upsert_clusters sets updated_at=NOW on every touch,
queries returned stale but recently touched clusters regardless of
their actual event age. Replaced the SQL time-range filter with
Python-side filtering on parsed payload timestamps, matching the
pattern already used by get_latest_clusters. Note that both queries now
load every candidate row before filtering, and the entity query no
longer has its LIMIT 500.

Fixes:
- get_sentiment_series: filter by payload.timestamp in Python, not
  updated_at in SQL (both sqlite_store and dashboard_store)
- get_entity_frequencies: same fix (both stores)
- dashboard_store.py: remove orphaned duplicate _parse_ts + bucket
  code left from earlier partial patches
- mcp_server_fastmcp.py: add git commit hash to /health and
  /api/v1/health responses as 'version' field
- dashboard_store.py: add missing parsedate_to_datetime import
- add HTTP-date (RFC 2822) timestamp parsing fallback via
  parsedate_to_datetime for non-ISO cluster timestamps

All 4 modified files pass syntax/compile checks.
Lukas Goldschmidt 1 viikko sitten
vanhempi
commit
33f1015593

+ 54 - 23
news_mcp/dashboard/dashboard_store.py

@@ -156,17 +156,22 @@ class DashboardStore:
     # ── Sentiment Series ────────────────────────────────────────────
 
     def get_sentiment_series(
-        self,
-        topic: str | None = None,
-        hours: float = 24,
-        bucket_hours: float = 1,
-    ) -> list[dict[str, Any]]:
-        cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
-        now = datetime.now(timezone.utc).isoformat()
-        query = "SELECT payload FROM clusters WHERE updated_at >= ? AND updated_at <= ?"
-        params: list = [cutoff, now]
+            self,
+            topic: str | None = None,
+            hours: float = 24,
+            bucket_hours: float = 1,
+        ) -> list[dict[str, Any]]:
+        """Sentiment score averaged per time bucket.
+
+        Filters by the cluster's own event timestamp (payload.timestamp),
+        not by updated_at which tracks row modification time.
+        """
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+
+        query = "SELECT payload FROM clusters"
+        params: list = []
         if topic and topic != "all":
-            query += " AND topic = ?"
+            query += " WHERE topic = ?"
             params.append(topic)
         query += " ORDER BY updated_at ASC"
 
@@ -177,7 +182,7 @@ class DashboardStore:
         def _parse_ts(ts: Any) -> datetime | None:
             if not ts:
                 return None
-            s = str(ts)
+            s = str(ts).strip()
             try:
                 dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
             except Exception:
@@ -189,7 +194,6 @@ class DashboardStore:
                 dt = dt.replace(tzinfo=timezone.utc)
             return dt.astimezone(timezone.utc)
 
-        step_hours = max(1, int(bucket_hours))
         buckets: dict[datetime, list[float]] = {}
         for (payload_text,) in rows:
             c = json.loads(payload_text)
@@ -197,10 +201,12 @@ class DashboardStore:
             score = c.get("sentimentScore")
             if dt is None or score is None:
                 continue
+            if dt < cutoff:
+                continue
             bucket_key = dt.replace(minute=0, second=0, microsecond=0)
-            if step_hours > 1:
+            if bucket_hours > 1:
                 bucket_key = bucket_key.replace(
-                    hour=(bucket_key.hour // step_hours) * step_hours
+                    hour=(bucket_key.hour // int(bucket_hours)) * int(bucket_hours)
                 )
             buckets.setdefault(bucket_key, []).append(float(score))
 
@@ -223,22 +229,46 @@ class DashboardStore:
         hours: float = 24,
         limit: int = 30,
     ) -> list[dict[str, Any]]:
-        cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
-        now = datetime.now(timezone.utc).isoformat()
+        """Top entities by mention count in recent clusters.
+
+        Filters by the cluster's own event timestamp (payload.timestamp),
+        not by updated_at which tracks row modification time.
+        """
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+    
+        query = "SELECT payload FROM clusters"
+        params: list = []
         with self._store._conn() as conn:
-            cur = conn.execute(
-                "SELECT payload FROM clusters WHERE updated_at >= ? AND updated_at <= ? "
-                "ORDER BY updated_at DESC LIMIT 500",
-                (cutoff, now),
-            )
+            cur = conn.execute(query, params)
             rows = cur.fetchall()
-
+    
+        def _parse_ts(ts):
+            if not ts:
+                return None
+            s = str(ts).strip()
+            try:
+                dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
+            except Exception:
+                try:
+                    from email.utils import parsedate_to_datetime
+                    dt = parsedate_to_datetime(s)
+                except Exception:
+                    return None
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            return dt.astimezone(timezone.utc)
+    
         counter: dict[str, int] = {}
         for (payload_text,) in rows:
             c = json.loads(payload_text)
+            dt = _parse_ts(c.get("timestamp"))
+            if dt is None:
+                continue
+            if dt < cutoff:
+                continue
             for ent in c.get("entities", []):
                 counter[ent] = counter.get(ent, 0) + 1
-
+    
         sorted_entities = sorted(counter.items(), key=lambda x: -x[1])[:limit]
         result: list[dict[str, Any]] = []
         for label, count in sorted_entities:
@@ -250,3 +280,4 @@ class DashboardStore:
                 "mid": meta["mid"] if meta else None,
             })
         return result
+

+ 16 - 1
news_mcp/mcp_server_fastmcp.py

@@ -1,7 +1,10 @@
 from __future__ import annotations
 
 import asyncio
+import hashlib
 import logging
+import subprocess
+
 import math
 import re
 import time
@@ -41,6 +44,15 @@ logging.basicConfig(
 
 _PROCESS_STARTED_AT = time.monotonic()
 
+_VERSION_HASH = (
+    subprocess.check_output(
+        ["git", "rev-parse", "--short=9", "HEAD"],
+        cwd=__file__,
+    )
+    .decode()
+    .strip()
+)
+
 mcp = FastMCP(
     "news-mcp",
     transport_security=TransportSecuritySettings(enable_dns_rebinding_protection=False),
@@ -987,7 +999,9 @@ def api_health():
     """Extended health + dashboard stats."""
     try:
         store = DashboardStore(_shared_store)
-        return store.get_dashboard_stats()
+        stats = store.get_dashboard_stats()
+        stats["version"] = _VERSION_HASH
+        return stats
     except Exception as e:
         return _api_err(e, "health")
 
@@ -1098,4 +1112,5 @@ def health():
     return {
         "status": "ok",
         "uptime": round(time.monotonic() - _PROCESS_STARTED_AT, 3),
+        "version": _VERSION_HASH,
     }

+ 86 - 54
news_mcp/storage/sqlite_store.py

@@ -673,84 +673,116 @@ class SQLiteClusterStore:
         return clusters
 
     def get_sentiment_series(
+            self,
+            topic: str | None = None,
+            hours: float = 24,
+            bucket_hours: float = 1,
+        ) -> list[dict[str, Any]]:
+            """Sentiment score averaged per time bucket.
+
+            Filters by the cluster's own event timestamp (payload.timestamp),
+            not by updated_at which tracks row modification time.
+            """
+            cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+            query = "SELECT payload FROM clusters"
+            params: list = []
+            if topic and topic != "all":
+                query += " WHERE topic = ?"
+                params.append(topic)
+            query += " ORDER BY updated_at ASC"
+
+            with self._conn() as conn:
+                cur = conn.execute(query, params)
+                rows = cur.fetchall()
+
+            def _parse_ts(ts: Any) -> datetime | None:
+                if not ts:
+                    return None
+                s = str(ts).strip()
+                try:
+                    dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
+                except Exception:
+                    try:
+                        dt = parsedate_to_datetime(s)
+                    except Exception:
+                        return None
+                if dt.tzinfo is None:
+                    dt = dt.replace(tzinfo=timezone.utc)
+                return dt.astimezone(timezone.utc)
+
+            buckets: dict[datetime, list[float]] = {}
+            for (payload_text,) in rows:
+                c = json.loads(payload_text)
+                dt = _parse_ts(c.get("timestamp"))
+                score = c.get("sentimentScore")
+                if dt is None or score is None:
+                    continue
+                if dt < cutoff.replace(tzinfo=timezone.utc):
+                    continue
+                bucket_key = dt.replace(minute=0, second=0, microsecond=0)
+                if bucket_hours > 1:
+                    bucket_key = bucket_key.replace(
+                        hour=(bucket_key.hour // int(bucket_hours)) * int(bucket_hours)
+                    )
+                buckets.setdefault(bucket_key, []).append(float(score))
+
+            series: list[dict[str, Any]] = []
+            for bucket_key in sorted(buckets):
+                scores = buckets[bucket_key]
+                series.append({
+                    "time": bucket_key.isoformat(),
+                    "avg_sentiment": round(sum(scores) / len(scores), 3),
+                    "count": len(scores),
+                    "min": round(min(scores), 3),
+                    "max": round(max(scores), 3),
+                })
+            return series
+
+    def get_entity_frequencies(
         self,
-        topic: str | None = None,
         hours: float = 24,
-        bucket_hours: float = 1,
+        limit: int = 30,
     ) -> list[dict[str, Any]]:
-        """Sentiment score averaged per time bucket."""
-        cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
-        now = datetime.now(timezone.utc).isoformat()
-        query = "SELECT payload FROM clusters WHERE updated_at >= ? AND updated_at <= ?"
-        params: list = [cutoff, now]
-        if topic and topic != "all":
-            query += " AND topic = ?"
-            params.append(topic)
-        query += " ORDER BY updated_at ASC"
+        """Top entities by mention count in recent clusters.
+
+        Filters by the cluster's own event timestamp (payload.timestamp),
+        not by updated_at which tracks row modification time.
+        """
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=hours)
+    
+        query = "SELECT payload FROM clusters"
+        params: list = []
         with self._conn() as conn:
             cur = conn.execute(query, params)
             rows = cur.fetchall()
-
-        def _parse_ts(ts: Any) -> datetime | None:
+    
+        def _parse_ts(ts):
             if not ts:
                 return None
-            s = str(ts)
+            s = str(ts).strip()
             try:
                 dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
             except Exception:
                 try:
+                    from email.utils import parsedate_to_datetime
                     dt = parsedate_to_datetime(s)
                 except Exception:
                     return None
             if dt.tzinfo is None:
                 dt = dt.replace(tzinfo=timezone.utc)
             return dt.astimezone(timezone.utc)
-
-        buckets: dict[datetime, list[float]] = {}
+    
+        counter: dict[str, int] = {}
         for (payload_text,) in rows:
             c = json.loads(payload_text)
             dt = _parse_ts(c.get("timestamp"))
-            score = c.get("sentimentScore")
-            if dt is None or score is None:
+            if dt is None:
+                continue
+            if dt < cutoff:
                 continue
-            bucket_key = dt.replace(minute=0, second=0, microsecond=0)
-            if bucket_hours > 1:
-                bucket_key = bucket_key.replace(
-                    hour=(bucket_key.hour // int(bucket_hours)) * int(bucket_hours)
-                )
-            buckets.setdefault(bucket_key, []).append(float(score))
-
-        series: list[dict[str, Any]] = []
-        for bucket_key in sorted(buckets):
-            scores = buckets[bucket_key]
-            series.append({
-                "time": bucket_key.isoformat(),
-                "avg_sentiment": round(sum(scores) / len(scores), 3),
-                "count": len(scores),
-                "min": round(min(scores), 3),
-                "max": round(max(scores), 3),
-            })
-        return series
-
-    def get_entity_frequencies(
-        self,
-        hours: float = 24,
-        limit: int = 30,
-    ) -> list[dict[str, Any]]:
-        """Top entities by mention count in recent clusters."""
-        cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
-        now = datetime.now(timezone.utc).isoformat()
-        with self._conn() as conn:
-            cur = conn.execute(
-                "SELECT payload FROM clusters WHERE updated_at >= ? AND updated_at <= ? ORDER BY updated_at DESC LIMIT 500",
-                (cutoff, now),
-            )
-            rows = cur.fetchall()
-        counter: dict[str, int] = {}
-        for (payload_text,) in rows:
-            c = json.loads(payload_text)
             for ent in c.get("entities", []):
                 counter[ent] = counter.get(ent, 0) + 1
+    
         sorted_entities = sorted(counter.items(), key=lambda x: -x[1])[:limit]
         result: list[dict[str, Any]] = []
         for label, count in sorted_entities: