analyze_ft.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. #!/usr/bin/env python3
  2. """Analyze FT-sourced clusters in the live DB.
  3. Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
  4. """
  5. import sqlite3, json
  6. DB = "./data/news.sqlite"
  7. conn = sqlite3.connect(DB)
  8. print("=== FT clusters in DB ===")
  9. rows = conn.execute("""
  10. SELECT cluster_id,
  11. json_array_length(payload, '$.articles'),
  12. json_extract(payload, '$.headline'),
  13. json_extract(payload, '$.sources'),
  14. json_extract(payload, '$.timestamp')
  15. FROM clusters
  16. WHERE payload LIKE '%ft.com%'
  17. ORDER BY json_array_length(payload, '$.articles') DESC
  18. LIMIT 30
  19. """).fetchall()
  20. print("FT clusters (top 30 by article count):")
  21. for r in rows:
  22. src = (r[3] or "")[:30]
  23. ts = (r[4] or "")[:16]
  24. hl = (r[2] or "")[:55]
  25. print(f" {r[0][:12]} arts={r[1]} {hl} src={src} ts={ts}")
  26. total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0]
  27. singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
  28. print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")
  29. # Duplicate headlines
  30. print("\n=== Potential duplicates (same headline in multiple clusters) ===")
  31. dupe_rows = conn.execute("""
  32. SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
  33. FROM clusters
  34. WHERE payload LIKE '%ft.com%'
  35. GROUP BY json_extract(payload, '$.headline')
  36. HAVING cnt > 1
  37. ORDER BY cnt DESC
  38. LIMIT 10
  39. """).fetchall()
  40. if dupe_rows:
  41. for r in dupe_rows:
  42. print(f' "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}')
  43. else:
  44. print(" No duplicates found")
  45. # BreakingTheNews comparison
  46. print("\n=== BreakingTheNews clusters (for comparison) ===")
  47. bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0]
  48. bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
  49. print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")
  50. # Cross-feed clusters
  51. print("\n=== Cross-feed clusters (FT + BT merged) ===")
  52. cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0]
  53. print(f"Cross-feed clusters: {cross}")
  54. # FoxNews clusters
  55. print("\n=== FoxNews clusters ===")
  56. fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0]
  57. fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
  58. print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")
  59. # All clusters summary
  60. print("\n=== All clusters summary ===")
  61. all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0]
  62. all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0]
  63. print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")
  64. # Feed state
  65. print("\n=== Feed state (active feeds) ===")
  66. feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall()
  67. for f in feeds:
  68. print(f" {f[0][:55]} items={f[2]} enabled={f[3]} updated={f[4][:19]}")
  69. # Check for clusters covering same story from different feeds (by title similarity)
  70. print("\n=== Clusters with similar headlines across feeds (sample) ===")
  71. all_clusters = conn.execute("""
  72. SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
  73. FROM clusters
  74. ORDER BY cluster_id
  75. LIMIT 200
  76. """).fetchall()
  77. # Quick check: any FT headline that contains words also in BT headlines
  78. ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")]
  79. bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")]
  80. if ft_headlines and bt_headlines:
  81. from difflib import SequenceMatcher
  82. def norm(t):
  83. import re
  84. t = t.lower().strip()
  85. t = re.sub(r"[^a-z0-9\s]", " ", t)
  86. return re.sub(r"\s+", " ", t).strip()
  87. matches = []
  88. for fc, fh, fs in ft_headlines[:20]:
  89. for bc, bh, bs in bt_headlines[:20]:
  90. sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio()
  91. if sim >= 0.50:
  92. matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
  93. matches.sort(reverse=True)
  94. for sim, fh, bh, fc, bc in matches[:8]:
  95. print(f" sim={sim:.2f} FT[{fc}]: {fh}")
  96. print(f" BT[{bc}]: {bh}")
  97. if not matches:
  98. print(" No cross-feed headline matches >= 0.50 found")
  99. conn.close()