|
|
@@ -146,6 +146,14 @@ def sanitize_cluster_payload(cluster: dict[str, Any], *, include_resolutions: bo
|
|
|
for field in ("timestamp", "last_updated", "first_seen"):
|
|
|
if field in out and out[field]:
|
|
|
out[field] = _normalize_ts(out[field])
|
|
|
+ # Ensure timestamp is always present for the generated column index.
|
|
|
+ # Prefer existing timestamp, then first_seen, then last_updated, then now.
|
|
|
+ for src in ("timestamp", "first_seen", "last_updated"):
|
|
|
+ if out.get(src):
|
|
|
+ out.setdefault("timestamp", out[src])
|
|
|
+ break
|
|
|
+ if not out.get("timestamp"):
|
|
|
+ out["timestamp"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S+00:00")
|
|
|
|
|
|
if not include_resolutions:
|
|
|
return out
|
|
|
@@ -206,6 +214,44 @@ class SQLiteClusterStore:
|
|
|
"CREATE INDEX IF NOT EXISTS idx_clusters_updated_at ON clusters(updated_at)"
|
|
|
)
|
|
|
|
|
|
+ # Generated column for indexed event-time filtering (VIRTUAL for compatibility)
|
|
|
+ try:
|
|
|
+ conn.execute(
|
|
|
+ "ALTER TABLE clusters ADD COLUMN payload_ts "
|
|
|
+ "GENERATED ALWAYS AS (json_extract(payload, '$.timestamp')) VIRTUAL"
|
|
|
+ )
|
|
|
+ except sqlite3.OperationalError:
|
|
|
+ pass # column already exists
|
|
|
+ conn.execute(
|
|
|
+ "CREATE INDEX IF NOT EXISTS idx_clusters_payload_ts ON clusters(payload_ts)"
|
|
|
+ )
|
|
|
+
|
|
|
+ # Junction tables for SQL-level entity/keyword search
|
|
|
+ conn.execute(
|
|
|
+ """
|
|
|
+ CREATE TABLE IF NOT EXISTS cluster_entities (
|
|
|
+ cluster_id TEXT NOT NULL REFERENCES clusters(cluster_id) ON DELETE CASCADE,
|
|
|
+ entity TEXT NOT NULL,
|
|
|
+ PRIMARY KEY (cluster_id, entity)
|
|
|
+ )
|
|
|
+ """
|
|
|
+ )
|
|
|
+ conn.execute(
|
|
|
+ "CREATE INDEX IF NOT EXISTS idx_cluster_entities_entity ON cluster_entities(entity)"
|
|
|
+ )
|
|
|
+ conn.execute(
|
|
|
+ """
|
|
|
+ CREATE TABLE IF NOT EXISTS cluster_keywords (
|
|
|
+ cluster_id TEXT NOT NULL REFERENCES clusters(cluster_id) ON DELETE CASCADE,
|
|
|
+ keyword TEXT NOT NULL,
|
|
|
+ PRIMARY KEY (cluster_id, keyword)
|
|
|
+ )
|
|
|
+ """
|
|
|
+ )
|
|
|
+ conn.execute(
|
|
|
+ "CREATE INDEX IF NOT EXISTS idx_cluster_keywords_keyword ON cluster_keywords(keyword)"
|
|
|
+ )
|
|
|
+
|
|
|
try:
|
|
|
cur = conn.execute("PRAGMA table_info(entity_metadata)")
|
|
|
cols = [row[1] for row in cur.fetchall()]
|
|
|
@@ -267,6 +313,24 @@ class SQLiteClusterStore:
|
|
|
"ON CONFLICT(cluster_id) DO UPDATE SET topic=excluded.topic, payload=excluded.payload, updated_at=excluded.updated_at",
|
|
|
(cluster_id, topic, payload, now.isoformat()),
|
|
|
)
|
|
|
+ # Populate junction tables for SQL-level entity/keyword search.
|
|
|
+ # DELETE first so re-enrichment replaces stale entries.
|
|
|
+ conn.execute("DELETE FROM cluster_entities WHERE cluster_id=?", (cluster_id,))
|
|
|
+ conn.execute("DELETE FROM cluster_keywords WHERE cluster_id=?", (cluster_id,))
|
|
|
+ for entity in c.get("entities", []):
|
|
|
+ ent_norm = str(entity).strip().lower()
|
|
|
+ if ent_norm:
|
|
|
+ conn.execute(
|
|
|
+ "INSERT OR IGNORE INTO cluster_entities(cluster_id, entity) VALUES(?, ?)",
|
|
|
+ (cluster_id, ent_norm),
|
|
|
+ )
|
|
|
+ for kw in c.get("keywords", []):
|
|
|
+ kw_norm = str(kw).strip().lower()
|
|
|
+ if kw_norm:
|
|
|
+ conn.execute(
|
|
|
+ "INSERT OR IGNORE INTO cluster_keywords(cluster_id, keyword) VALUES(?, ?)",
|
|
|
+ (cluster_id, kw_norm),
|
|
|
+ )
|
|
|
|
|
|
def upsert_cluster_summary(
|
|
|
self,
|
|
|
@@ -299,39 +363,24 @@ class SQLiteClusterStore:
|
|
|
return json.loads(row[0])
|
|
|
|
|
|
def get_latest_clusters(self, topic: str, ttl_hours: float, limit: int) -> list[dict]:
|
|
|
- """Return newest clusters by their own event timestamp (payload.timestamp).
|
|
|
-
|
|
|
- payload.timestamp is guaranteed ISO 8601 UTC — use _read_ts, not raw
|
|
|
- JSON parsing with RFC 2822 fallbacks.
|
|
|
- """
|
|
|
- cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))).timestamp()
|
|
|
-
|
|
|
+ """Return newest clusters by event timestamp, filtered via SQL payload_ts index."""
|
|
|
+ cutoff = (datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))).isoformat()
|
|
|
with self._conn() as conn:
|
|
|
cur = conn.execute(
|
|
|
- "SELECT payload FROM clusters WHERE topic=? ORDER BY updated_at DESC",
|
|
|
- (topic,),
|
|
|
+ "SELECT payload FROM clusters WHERE topic=? AND payload_ts >= ? ORDER BY payload_ts DESC LIMIT ?",
|
|
|
+ (topic, cutoff, int(limit)),
|
|
|
)
|
|
|
- candidates = [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
-
|
|
|
- filtered = [c for c in candidates if (_read_ts(c.get("timestamp")) or 0.0) >= cutoff_ts]
|
|
|
- filtered.sort(key=lambda c: _read_ts(c.get("timestamp")) or 0.0, reverse=True)
|
|
|
- return filtered[: int(limit)]
|
|
|
+ return [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
|
def get_latest_clusters_all_topics(self, ttl_hours: float, limit: int) -> list[dict]:
|
|
|
- """Return newest clusters across all topics by event timestamp.
|
|
|
-
|
|
|
- payload.timestamp is guaranteed ISO 8601 UTC — use _read_ts, not raw
|
|
|
- JSON parsing with RFC 2822 fallbacks.
|
|
|
- """
|
|
|
- cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))).timestamp()
|
|
|
-
|
|
|
+ """Return newest clusters across all topics, filtered via SQL payload_ts index."""
|
|
|
+ cutoff = (datetime.now(timezone.utc) - timedelta(hours=float(ttl_hours))).isoformat()
|
|
|
with self._conn() as conn:
|
|
|
- cur = conn.execute("SELECT payload FROM clusters ORDER BY updated_at DESC")
|
|
|
- candidates = [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
-
|
|
|
- filtered = [c for c in candidates if (_read_ts(c.get("timestamp")) or 0.0) >= cutoff_ts]
|
|
|
- filtered.sort(key=lambda c: _read_ts(c.get("timestamp")) or 0.0, reverse=True)
|
|
|
- return filtered[: int(limit)]
|
|
|
+ cur = conn.execute(
|
|
|
+ "SELECT payload FROM clusters WHERE payload_ts >= ? ORDER BY payload_ts DESC LIMIT ?",
|
|
|
+ (cutoff, int(limit)),
|
|
|
+ )
|
|
|
+ return [json.loads(r[0]) for r in cur.fetchall()]
|
|
|
|
|
|
def get_cluster_by_id(self, cluster_id: str) -> dict | None:
|
|
|
with self._conn() as conn:
|
|
|
@@ -645,129 +694,6 @@ class SQLiteClusterStore:
|
|
|
"feeds": feeds,
|
|
|
}
|
|
|
|
|
|
- def get_clusters_page(
|
|
|
- self,
|
|
|
- topic: str | None = None,
|
|
|
- hours: float = 24,
|
|
|
- limit: int = 20,
|
|
|
- offset: int = 0,
|
|
|
- ) -> list[dict[str, Any]]:
|
|
|
- """Paginated cluster listing filtered by payload.timestamp (event time).
|
|
|
-
|
|
|
- payload.timestamp is guaranteed ISO 8601 UTC — filtered and sorted
|
|
|
- using _read_ts, not updated_at (row modification time).
|
|
|
- """
|
|
|
- cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=hours)).timestamp()
|
|
|
-
|
|
|
- query = "SELECT payload FROM clusters"
|
|
|
- params: list = []
|
|
|
- if topic and topic != "all":
|
|
|
- query += " WHERE topic = ?"
|
|
|
- params.append(topic)
|
|
|
-
|
|
|
- with self._conn() as conn:
|
|
|
- rows = conn.execute(query, params).fetchall()
|
|
|
-
|
|
|
- filtered = [json.loads(r[0]) for r in rows]
|
|
|
- filtered = [c for c in filtered if (_read_ts(c.get("timestamp")) or 0.0) >= cutoff_ts]
|
|
|
- filtered.sort(key=lambda c: _read_ts(c.get("timestamp")) or 0.0, reverse=True)
|
|
|
- page = filtered[offset:offset + limit]
|
|
|
-
|
|
|
- return [
|
|
|
- {
|
|
|
- "cluster_id": c.get("cluster_id", ""),
|
|
|
- "headline": c.get("headline", ""),
|
|
|
- "topic": c.get("topic", ""),
|
|
|
- "sentiment": c.get("sentiment", "neutral"),
|
|
|
- "sentimentScore": c.get("sentimentScore"),
|
|
|
- "importance": c.get("importance", 0),
|
|
|
- "entities": c.get("entities", []),
|
|
|
- "sources": c.get("sources", []),
|
|
|
- "timestamp": c.get("timestamp", ""),
|
|
|
- "keywords": c.get("keywords", []),
|
|
|
- "article_count": len(c.get("articles", [])),
|
|
|
- }
|
|
|
- for c in page
|
|
|
- ]
|
|
|
-
|
|
|
- def get_sentiment_series(
|
|
|
- self,
|
|
|
- topic: str | None = None,
|
|
|
- hours: float = 24,
|
|
|
- bucket_hours: float = 1,
|
|
|
- ) -> list[dict[str, Any]]:
|
|
|
- """Sentiment score averaged per time bucket.
|
|
|
-
|
|
|
- Filters by payload.timestamp (event time, ISO 8601 UTC guaranteed).
|
|
|
- """
|
|
|
- cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=hours)).timestamp()
|
|
|
- query = "SELECT payload FROM clusters"
|
|
|
- params: list = []
|
|
|
- if topic and topic != "all":
|
|
|
- query += " WHERE topic = ?"
|
|
|
- params.append(topic)
|
|
|
-
|
|
|
- with self._conn() as conn:
|
|
|
- rows = conn.execute(query, params).fetchall()
|
|
|
-
|
|
|
- buckets: dict[datetime, list[float]] = {}
|
|
|
- for (payload_text,) in rows:
|
|
|
- c = json.loads(payload_text)
|
|
|
- ts = _read_ts(c.get("timestamp"))
|
|
|
- score = c.get("sentimentScore")
|
|
|
- if ts is None or score is None:
|
|
|
- continue
|
|
|
- if ts < cutoff_ts:
|
|
|
- continue
|
|
|
- dt = datetime.fromtimestamp(ts, tz=timezone.utc)
|
|
|
- bucket_key = dt.replace(minute=0, second=0, microsecond=0)
|
|
|
- if bucket_hours > 1:
|
|
|
- bucket_key = bucket_key.replace(
|
|
|
- hour=(bucket_key.hour // int(bucket_hours)) * int(bucket_hours)
|
|
|
- )
|
|
|
- buckets.setdefault(bucket_key, []).append(float(score))
|
|
|
-
|
|
|
- return [
|
|
|
- {
|
|
|
- "time": bucket_key.isoformat(),
|
|
|
- "avg_sentiment": round(sum(scores) / len(scores), 3),
|
|
|
- "count": len(scores),
|
|
|
- "min": round(min(scores), 3),
|
|
|
- "max": round(max(scores), 3),
|
|
|
- }
|
|
|
- for bucket_key, scores in sorted(buckets.items())
|
|
|
- ]
|
|
|
-
|
|
|
- def get_entity_frequencies(
|
|
|
- self,
|
|
|
- hours: float = 24,
|
|
|
- limit: int = 30,
|
|
|
- ) -> list[dict[str, Any]]:
|
|
|
- """Top entities by mention count filtered by payload.timestamp (ISO 8601 UTC guaranteed)."""
|
|
|
- cutoff_ts = (datetime.now(timezone.utc) - timedelta(hours=hours)).timestamp()
|
|
|
-
|
|
|
- with self._conn() as conn:
|
|
|
- rows = conn.execute("SELECT payload FROM clusters").fetchall()
|
|
|
-
|
|
|
- counter: dict[str, int] = {}
|
|
|
- for (payload_text,) in rows:
|
|
|
- c = json.loads(payload_text)
|
|
|
- if (_read_ts(c.get("timestamp")) or 0.0) < cutoff_ts:
|
|
|
- continue
|
|
|
- for ent in c.get("entities", []):
|
|
|
- counter[ent] = counter.get(ent, 0) + 1
|
|
|
-
|
|
|
- result: list[dict[str, Any]] = []
|
|
|
- for label, count in sorted(counter.items(), key=lambda x: -x[1])[:limit]:
|
|
|
- meta = self.get_entity_metadata(label)
|
|
|
- result.append({
|
|
|
- "label": label,
|
|
|
- "count": count,
|
|
|
- "canonical_label": meta["canonical_label"] if meta else label,
|
|
|
- "mid": meta["mid"] if meta else None,
|
|
|
- })
|
|
|
- return result
|
|
|
-
|
|
|
def get_cluster_detail(self, cluster_id: str) -> dict[str, Any] | None:
|
|
|
"""Dashboard-optimized cluster detail fetch."""
|
|
|
with self._conn() as conn:
|