| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- #!/usr/bin/env python3
- """Analyze FT-sourced clusters in the live DB.
- Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
- """
- import sqlite3, json
- DB = "./data/news.sqlite"
- conn = sqlite3.connect(DB)
- print("=== FT clusters in DB ===")
- rows = conn.execute("""
- SELECT cluster_id,
- json_array_length(payload, '$.articles'),
- json_extract(payload, '$.headline'),
- json_extract(payload, '$.sources'),
- json_extract(payload, '$.timestamp')
- FROM clusters
- WHERE payload LIKE '%ft.com%'
- ORDER BY json_array_length(payload, '$.articles') DESC
- LIMIT 30
- """).fetchall()
- print("FT clusters (top 30 by article count):")
- for r in rows:
- src = (r[3] or "")[:30]
- ts = (r[4] or "")[:16]
- hl = (r[2] or "")[:55]
- print(f" {r[0][:12]} arts={r[1]} {hl} src={src} ts={ts}")
- total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0]
- singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
- print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")
- # Duplicate headlines
- print("\n=== Potential duplicates (same headline in multiple clusters) ===")
- dupe_rows = conn.execute("""
- SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
- FROM clusters
- WHERE payload LIKE '%ft.com%'
- GROUP BY json_extract(payload, '$.headline')
- HAVING cnt > 1
- ORDER BY cnt DESC
- LIMIT 10
- """).fetchall()
- if dupe_rows:
- for r in dupe_rows:
- print(f' "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}')
- else:
- print(" No duplicates found")
- # BreakingTheNews comparison
- print("\n=== BreakingTheNews clusters (for comparison) ===")
- bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0]
- bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
- print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")
- # Cross-feed clusters
- print("\n=== Cross-feed clusters (FT + BT merged) ===")
- cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0]
- print(f"Cross-feed clusters: {cross}")
- # FoxNews clusters
- print("\n=== FoxNews clusters ===")
- fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0]
- fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
- print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")
- # All clusters summary
- print("\n=== All clusters summary ===")
- all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0]
- all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0]
- print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")
- # Feed state
- print("\n=== Feed state (active feeds) ===")
- feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall()
- for f in feeds:
- print(f" {f[0][:55]} items={f[2]} enabled={f[3]} updated={f[4][:19]}")
- # Check for clusters covering same story from different feeds (by title similarity)
- print("\n=== Clusters with similar headlines across feeds (sample) ===")
- all_clusters = conn.execute("""
- SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
- FROM clusters
- ORDER BY cluster_id
- LIMIT 200
- """).fetchall()
- # Quick check: any FT headline that contains words also in BT headlines
- ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")]
- bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")]
- if ft_headlines and bt_headlines:
- from difflib import SequenceMatcher
- def norm(t):
- import re
- t = t.lower().strip()
- t = re.sub(r"[^a-z0-9\s]", " ", t)
- return re.sub(r"\s+", " ", t).strip()
- matches = []
- for fc, fh, fs in ft_headlines[:20]:
- for bc, bh, bs in bt_headlines[:20]:
- sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio()
- if sim >= 0.50:
- matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
- matches.sort(reverse=True)
- for sim, fh, bh, fc, bc in matches[:8]:
- print(f" sim={sim:.2f} FT[{fc}]: {fh}")
- print(f" BT[{bc}]: {bh}")
- if not matches:
- print(" No cross-feed headline matches >= 0.50 found")
- conn.close()
|