#!/usr/bin/env python3 """Analyze FT-sourced clusters in the live DB. Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py """ import sqlite3, json DB = "./data/news.sqlite" conn = sqlite3.connect(DB) print("=== FT clusters in DB ===") rows = conn.execute(""" SELECT cluster_id, json_array_length(payload, '$.articles'), json_extract(payload, '$.headline'), json_extract(payload, '$.sources'), json_extract(payload, '$.timestamp') FROM clusters WHERE payload LIKE '%ft.com%' ORDER BY json_array_length(payload, '$.articles') DESC LIMIT 30 """).fetchall() print("FT clusters (top 30 by article count):") for r in rows: src = (r[3] or "")[:30] ts = (r[4] or "")[:16] hl = (r[2] or "")[:55] print(f" {r[0][:12]} arts={r[1]} {hl} src={src} ts={ts}") total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0] singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0] print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}") # Duplicate headlines print("\n=== Potential duplicates (same headline in multiple clusters) ===") dupe_rows = conn.execute(""" SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12)) FROM clusters WHERE payload LIKE '%ft.com%' GROUP BY json_extract(payload, '$.headline') HAVING cnt > 1 ORDER BY cnt DESC LIMIT 10 """).fetchall() if dupe_rows: for r in dupe_rows: print(f' "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}') else: print(" No duplicates found") # BreakingTheNews comparison print("\n=== BreakingTheNews clusters (for comparison) ===") bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0] bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0] print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}") # Cross-feed clusters print("\n=== Cross-feed clusters (FT + BT merged) ===") cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0] print(f"Cross-feed clusters: {cross}") # FoxNews clusters print("\n=== FoxNews clusters ===") fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0] fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0] print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}") # All clusters summary print("\n=== All clusters summary ===") all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0] all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0] print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}") # Feed state print("\n=== Feed state (active feeds) ===") feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall() for f in feeds: print(f" {f[0][:55]} items={f[2]} enabled={f[3]} updated={f[4][:19]}") # Check for clusters covering same story from different feeds (by title similarity) print("\n=== Clusters with similar headlines across feeds (sample) ===") all_clusters = conn.execute(""" SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources') FROM clusters ORDER BY cluster_id LIMIT 200 """).fetchall() # Quick check: any FT headline that contains words also in BT headlines ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")] bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")] if ft_headlines and bt_headlines: from difflib import SequenceMatcher def norm(t): import re t = t.lower().strip() t = re.sub(r"[^a-z0-9\s]", " ", t) return re.sub(r"\s+", " ", t).strip() matches = [] for fc, fh, fs in ft_headlines[:20]: for bc, bh, bs in bt_headlines[:20]: sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio() if sim >= 0.50: matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8])) matches.sort(reverse=True) for sim, fh, bh, fc, bc in matches[:8]: print(f" sim={sim:.2f} FT[{fc}]: {fh}") print(f" BT[{bc}]: {bh}") if not matches: print(" No cross-feed headline matches >= 0.50 found") conn.close()