analyze_ft.sh 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. #!/bin/bash
  2. # Analyze FT-sourced clusters in the live DB
  3. # Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
  4. # Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
  5. echo "=== FT clusters in DB ==="
  6. python3 -c "
  7. import sqlite3, json
  8. conn = sqlite3.connect('./data/news.sqlite')
  9. # FT clusters
  10. rows = conn.execute('''
  11. SELECT cluster_id,
  12. json_array_length(payload, '\$.articles'),
  13. payload->>'\$.headline',
  14. payload->>'\$.sources',
  15. payload->>'\$.timestamp'
  16. FROM clusters
  17. WHERE payload LIKE '%ft.com%'
  18. ORDER BY json_array_length(payload, '\$.articles') DESC
  19. LIMIT 30
  20. ''').fetchall()
  21. print(f'FT clusters (top 30 by article count):')
  22. for r in rows:
  23. print(f' {r[0][:12]} arts={r[1]} {r[2][:55]} src={r[3][:30]} ts={r[4][:16]}')
  24. total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0]
  25. singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
  26. multi = total - singletons
  27. print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')
  28. # Check for duplicate headlines (same story, different clusters)
  29. print('\n=== Potential duplicates (same headline in multiple clusters) ===')
  30. dupe_rows = conn.execute('''
  31. SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12))
  32. FROM clusters
  33. WHERE payload LIKE '%ft.com%'
  34. GROUP BY payload->>'\$.headline'
  35. HAVING cnt > 1
  36. ORDER BY cnt DESC
  37. LIMIT 10
  38. ''').fetchall()
  39. for r in dupe_rows:
  40. print(f' \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}')
  41. # BreakingTheNews clusters for comparison
  42. print('\n=== BreakingTheNews clusters (for comparison) ===')
  43. bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0]
  44. bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
  45. print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')
  46. # Cross-feed clusters (have both FT and BT sources)
  47. print('\n=== Cross-feed clusters (FT + BT merged) ===')
  48. cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0]
  49. print(f'Cross-feed clusters: {cross}')
  50. # All clusters summary
  51. print('\n=== All clusters summary ===')
  52. all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0]
  53. all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0]
  54. print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')
  55. # Feed state
  56. print('\n=== Feed state ===')
  57. feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall()
  58. for f in feeds:
  59. print(f' {f[0][:50]} items={f[2]} enabled={f[3]} updated={f[4][:19]}')
  60. "
  61. echo ""
  62. echo "=== Recent poller clustering stats ==="
  63. python3 -c "
  64. import sqlite3, json
  65. conn = sqlite3.connect('./data/news.sqlite')
  66. # Check poll_stats table if it exists
  67. try:
  68. rows = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5').fetchall()
  69. if rows:
  70. cols = [d[0] for d in conn.execute('SELECT * FROM pragma_table_info(poll_stats)').fetchall()]
  71. print(f'poll_stats columns: {cols}')
  72. for r in rows:
  73. print(f' {r}')
  74. else:
  75. print('poll_stats table is empty')
  76. except Exception as e:
  77. print(f'poll_stats table: {e}')
  78. "