| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687 |
- #!/bin/bash
- # Analyze FT-sourced clusters in the live DB
- # Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
- # Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
- echo "=== FT clusters in DB ==="
- python3 -c "
- import sqlite3, json
- conn = sqlite3.connect('./data/news.sqlite')
- # FT clusters
- rows = conn.execute('''
- SELECT cluster_id,
- json_array_length(payload, '\$.articles'),
- payload->>'\$.headline',
- payload->>'\$.sources',
- payload->>'\$.timestamp'
- FROM clusters
- WHERE payload LIKE '%ft.com%'
- ORDER BY json_array_length(payload, '\$.articles') DESC
- LIMIT 30
- ''').fetchall()
- print(f'FT clusters (top 30 by article count):')
- for r in rows:
- print(f' {r[0][:12]} arts={r[1]} {r[2][:55]} src={r[3][:30]} ts={r[4][:16]}')
- total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0]
- singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
- multi = total - singletons
- print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')
- # Check for duplicate headlines (same story, different clusters)
- print('\n=== Potential duplicates (same headline in multiple clusters) ===')
- dupe_rows = conn.execute('''
- SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12))
- FROM clusters
- WHERE payload LIKE '%ft.com%'
- GROUP BY payload->>'\$.headline'
- HAVING cnt > 1
- ORDER BY cnt DESC
- LIMIT 10
- ''').fetchall()
- for r in dupe_rows:
- print(f' \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}')
- # BreakingTheNews clusters for comparison
- print('\n=== BreakingTheNews clusters (for comparison) ===')
- bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0]
- bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
- print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')
- # Cross-feed clusters (have both FT and BT sources)
- print('\n=== Cross-feed clusters (FT + BT merged) ===')
- cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0]
- print(f'Cross-feed clusters: {cross}')
- # All clusters summary
- print('\n=== All clusters summary ===')
- all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0]
- all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0]
- print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')
- # Feed state
- print('\n=== Feed state ===')
- feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall()
- for f in feeds:
- print(f' {f[0][:50]} items={f[2]} enabled={f[3]} updated={f[4][:19]}')
- "
- echo ""
- echo "=== Recent poller clustering stats ==="
- python3 -c "
- import sqlite3, json
- conn = sqlite3.connect('./data/news.sqlite')
- # Check poll_stats table if it exists
- try:
- rows = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5').fetchall()
- if rows:
- cols = [d[0] for d in conn.execute('SELECT * FROM pragma_table_info(poll_stats)').fetchall()]
- print(f'poll_stats columns: {cols}')
- for r in rows:
- print(f' {r}')
- else:
- print('poll_stats table is empty')
- except Exception as e:
- print(f'poll_stats table: {e}')
- "
|