|
@@ -0,0 +1,110 @@
|
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
|
+"""Analyze FT-sourced clusters in the live DB.
|
|
|
|
|
+Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
|
|
|
|
|
+"""
|
|
|
|
|
+import sqlite3, json
|
|
|
|
|
+
|
|
|
|
|
+DB = "./data/news.sqlite"
|
|
|
|
|
+conn = sqlite3.connect(DB)
|
|
|
|
|
+
|
|
|
|
|
+print("=== FT clusters in DB ===")
|
|
|
|
|
+rows = conn.execute("""
|
|
|
|
|
+ SELECT cluster_id,
|
|
|
|
|
+ json_array_length(payload, '$.articles'),
|
|
|
|
|
+ json_extract(payload, '$.headline'),
|
|
|
|
|
+ json_extract(payload, '$.sources'),
|
|
|
|
|
+ json_extract(payload, '$.timestamp')
|
|
|
|
|
+ FROM clusters
|
|
|
|
|
+ WHERE payload LIKE '%ft.com%'
|
|
|
|
|
+ ORDER BY json_array_length(payload, '$.articles') DESC
|
|
|
|
|
+ LIMIT 30
|
|
|
|
|
+""").fetchall()
|
|
|
|
|
+print("FT clusters (top 30 by article count):")
|
|
|
|
|
+for r in rows:
|
|
|
|
|
+ src = (r[3] or "")[:30]
|
|
|
|
|
+ ts = (r[4] or "")[:16]
|
|
|
|
|
+ hl = (r[2] or "")[:55]
|
|
|
|
|
+ print(f" {r[0][:12]} arts={r[1]} {hl} src={src} ts={ts}")
|
|
|
|
|
+
|
|
|
|
|
+total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0]
|
|
|
|
|
+singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
|
|
|
|
|
+print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")
|
|
|
|
|
+
|
|
|
|
|
+# Duplicate headlines
|
|
|
|
|
+print("\n=== Potential duplicates (same headline in multiple clusters) ===")
|
|
|
|
|
+dupe_rows = conn.execute("""
|
|
|
|
|
+ SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
|
|
|
|
|
+ FROM clusters
|
|
|
|
|
+ WHERE payload LIKE '%ft.com%'
|
|
|
|
|
+ GROUP BY json_extract(payload, '$.headline')
|
|
|
|
|
+ HAVING cnt > 1
|
|
|
|
|
+ ORDER BY cnt DESC
|
|
|
|
|
+ LIMIT 10
|
|
|
|
|
+""").fetchall()
|
|
|
|
|
+if dupe_rows:
|
|
|
|
|
+ for r in dupe_rows:
|
|
|
|
|
+ print(f' "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}')
|
|
|
|
|
+else:
|
|
|
|
|
+ print(" No duplicates found")
|
|
|
|
|
+
|
|
|
|
|
+# BreakingTheNews comparison
|
|
|
|
|
+print("\n=== BreakingTheNews clusters (for comparison) ===")
|
|
|
|
|
+bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0]
|
|
|
|
|
+bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
|
|
|
|
|
+print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")
|
|
|
|
|
+
|
|
|
|
|
+# Cross-feed clusters
|
|
|
|
|
+print("\n=== Cross-feed clusters (FT + BT merged) ===")
|
|
|
|
|
+cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0]
|
|
|
|
|
+print(f"Cross-feed clusters: {cross}")
|
|
|
|
|
+
|
|
|
|
|
+# FoxNews clusters
|
|
|
|
|
+print("\n=== FoxNews clusters ===")
|
|
|
|
|
+fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0]
|
|
|
|
|
+fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
|
|
|
|
|
+print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")
|
|
|
|
|
+
|
|
|
|
|
+# All clusters summary
|
|
|
|
|
+print("\n=== All clusters summary ===")
|
|
|
|
|
+all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0]
|
|
|
|
|
+all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0]
|
|
|
|
|
+print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")
|
|
|
|
|
+
|
|
|
|
|
+# Feed state
|
|
|
|
|
+print("\n=== Feed state (active feeds) ===")
|
|
|
|
|
+feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall()
|
|
|
|
|
+for f in feeds:
|
|
|
|
|
+ print(f" {f[0][:55]} items={f[2]} enabled={f[3]} updated={f[4][:19]}")
|
|
|
|
|
+
|
|
|
|
|
+# Check for clusters covering same story from different feeds (by title similarity)
|
|
|
|
|
+print("\n=== Clusters with similar headlines across feeds (sample) ===")
|
|
|
|
|
+all_clusters = conn.execute("""
|
|
|
|
|
+ SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
|
|
|
|
|
+ FROM clusters
|
|
|
|
|
+ ORDER BY cluster_id
|
|
|
|
|
+ LIMIT 200
|
|
|
|
|
+""").fetchall()
|
|
|
|
|
+# Quick check: any FT headline that contains words also in BT headlines
|
|
|
|
|
+ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")]
|
|
|
|
|
+bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")]
|
|
|
|
|
+if ft_headlines and bt_headlines:
|
|
|
|
|
+ from difflib import SequenceMatcher
|
|
|
|
|
+ def norm(t):
|
|
|
|
|
+ import re
|
|
|
|
|
+ t = t.lower().strip()
|
|
|
|
|
+ t = re.sub(r"[^a-z0-9\s]", " ", t)
|
|
|
|
|
+ return re.sub(r"\s+", " ", t).strip()
|
|
|
|
|
+ matches = []
|
|
|
|
|
+ for fc, fh, fs in ft_headlines[:20]:
|
|
|
|
|
+ for bc, bh, bs in bt_headlines[:20]:
|
|
|
|
|
+ sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio()
|
|
|
|
|
+ if sim >= 0.50:
|
|
|
|
|
+ matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
|
|
|
|
|
+ matches.sort(reverse=True)
|
|
|
|
|
+ for sim, fh, bh, fc, bc in matches[:8]:
|
|
|
|
|
+ print(f" sim={sim:.2f} FT[{fc}]: {fh}")
|
|
|
|
|
+ print(f" BT[{bc}]: {bh}")
|
|
|
|
|
+ if not matches:
|
|
|
|
|
+ print(" No cross-feed headline matches >= 0.50 found")
|
|
|
|
|
+
|
|
|
|
|
+conn.close()
|