소스 검색

debug scripts

Lukas Goldschmidt 6 일 전
부모
커밋
8fe316db24
2개의 변경된 파일, 197개의 추가작업 그리고 0개의 파일을 삭제
  1. 110 0
      scripts/analyze_ft.py
  2. 87 0
      scripts/analyze_ft.sh

+ 110 - 0
scripts/analyze_ft.py

@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""Analyze FT-sourced clusters in the live DB.
+Run: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
+"""
+import sqlite3, json
+
+DB = "./data/news.sqlite"
+conn = sqlite3.connect(DB)
+
+print("=== FT clusters in DB ===")
+rows = conn.execute("""
+    SELECT cluster_id, 
+           json_array_length(payload, '$.articles'), 
+           json_extract(payload, '$.headline'),
+           json_extract(payload, '$.sources'),
+           json_extract(payload, '$.timestamp')
+    FROM clusters 
+    WHERE payload LIKE '%ft.com%' 
+    ORDER BY json_array_length(payload, '$.articles') DESC 
+    LIMIT 30
+""").fetchall()
+print("FT clusters (top 30 by article count):")
+for r in rows:
+    src = (r[3] or "")[:30]
+    ts = (r[4] or "")[:16]
+    hl = (r[2] or "")[:55]
+    print(f"  {r[0][:12]}  arts={r[1]}  {hl}  src={src}  ts={ts}")
+
+total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'").fetchone()[0]
+singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
+print(f"\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {total - singletons}")
+
+# Duplicate headlines
+print("\n=== Potential duplicates (same headline in multiple clusters) ===")
+dupe_rows = conn.execute("""
+    SELECT json_extract(payload, '$.headline'), count(*) as cnt, group_concat(substr(cluster_id,1,12))
+    FROM clusters 
+    WHERE payload LIKE '%ft.com%'
+    GROUP BY json_extract(payload, '$.headline')
+    HAVING cnt > 1
+    ORDER BY cnt DESC
+    LIMIT 10
+""").fetchall()
+if dupe_rows:
+    for r in dupe_rows:
+        print(f'  "{(r[0] or "")[:60]}" appears in {r[1]} clusters: {r[2]}')
+else:
+    print("  No duplicates found")
+
+# BreakingTheNews comparison
+print("\n=== BreakingTheNews clusters (for comparison) ===")
+bt_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'").fetchone()[0]
+bt_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
+print(f"Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}")
+
+# Cross-feed clusters
+print("\n=== Cross-feed clusters (FT + BT merged) ===")
+cross = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'").fetchone()[0]
+print(f"Cross-feed clusters: {cross}")
+
+# FoxNews clusters
+print("\n=== FoxNews clusters ===")
+fox_total = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%'").fetchone()[0]
+fox_singletons = conn.execute("SELECT count(*) FROM clusters WHERE payload LIKE '%foxnews%' AND json_array_length(payload, '$.articles') = 1").fetchone()[0]
+print(f"Total FoxNews clusters: {fox_total}, singletons: {fox_singletons}, multi-article: {fox_total - fox_singletons}")
+
+# All clusters summary
+print("\n=== All clusters summary ===")
+all_total = conn.execute("SELECT count(*) FROM clusters").fetchone()[0]
+all_singletons = conn.execute("SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1").fetchone()[0]
+print(f"Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}")
+
+# Feed state
+print("\n=== Feed state (active feeds) ===")
+feeds = conn.execute("SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != ''").fetchall()
+for f in feeds:
+    print(f"  {f[0][:55]}  items={f[2]}  enabled={f[3]}  updated={f[4][:19]}")
+
+# Check for clusters covering same story from different feeds (by title similarity)
+print("\n=== Clusters with similar headlines across feeds (sample) ===")
+all_clusters = conn.execute("""
+    SELECT cluster_id, json_extract(payload, '$.headline'), json_extract(payload, '$.sources')
+    FROM clusters
+    ORDER BY cluster_id
+    LIMIT 200
+""").fetchall()
+# Quick check: any FT headline that contains words also in BT headlines
+ft_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "ft.com" in (src or "")]
+bt_headlines = [(cid, hl, src) for cid, hl, src in all_clusters if "breakingthenews" in (src or "")]
+if ft_headlines and bt_headlines:
+    from difflib import SequenceMatcher
+    def norm(t):
+        import re
+        t = t.lower().strip()
+        t = re.sub(r"[^a-z0-9\s]", " ", t)
+        return re.sub(r"\s+", " ", t).strip()
+    matches = []
+    for fc, fh, fs in ft_headlines[:20]:
+        for bc, bh, bs in bt_headlines[:20]:
+            sim = SequenceMatcher(None, norm(fh or ""), norm(bh or "")).ratio()
+            if sim >= 0.50:
+                matches.append((sim, fh[:50], bh[:50], fc[:8], bc[:8]))
+    matches.sort(reverse=True)
+    for sim, fh, bh, fc, bc in matches[:8]:
+        print(f"  sim={sim:.2f}  FT[{fc}]: {fh}")
+        print(f"           BT[{bc}]: {bh}")
+    if not matches:
+        print("  No cross-feed headline matches >= 0.50 found")
+
+conn.close()

+ 87 - 0
scripts/analyze_ft.sh

@@ -0,0 +1,87 @@
+#!/bin/bash
+# Analyze FT-sourced clusters in the live DB
+# Run on thinkcenter-2: docker exec -it news-mcp bash /app/scripts/analyze_ft.sh
+# Or: docker exec -it news-mcp python3 /app/scripts/analyze_ft.py
+
+echo "=== FT clusters in DB ==="
+python3 -c "
+import sqlite3, json
+conn = sqlite3.connect('./data/news.sqlite')
+
+# FT clusters
+rows = conn.execute('''
+    SELECT cluster_id, 
+           json_array_length(payload, '\$.articles'), 
+           payload->>'\$.headline',
+           payload->>'\$.sources',
+           payload->>'\$.timestamp'
+    FROM clusters 
+    WHERE payload LIKE '%ft.com%' 
+    ORDER BY json_array_length(payload, '\$.articles') DESC 
+    LIMIT 30
+''').fetchall()
+print(f'FT clusters (top 30 by article count):')
+for r in rows:
+    print(f'  {r[0][:12]}  arts={r[1]}  {r[2][:55]}  src={r[3][:30]}  ts={r[4][:16]}')
+
+total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%'\").fetchone()[0]
+singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
+multi = total - singletons
+print(f'\nTotal FT clusters: {total}, singletons: {singletons}, multi-article: {multi}')
+
+# Check for duplicate headlines (same story, different clusters)
+print('\n=== Potential duplicates (same headline in multiple clusters) ===')
+dupe_rows = conn.execute('''
+    SELECT payload->>'\$.headline', count(*) as cnt, group_concat(substr(cluster_id,1,12))
+    FROM clusters 
+    WHERE payload LIKE '%ft.com%'
+    GROUP BY payload->>'\$.headline'
+    HAVING cnt > 1
+    ORDER BY cnt DESC
+    LIMIT 10
+''').fetchall()
+for r in dupe_rows:
+    print(f'  \"{r[0][:60]}\" appears in {r[1]} clusters: {r[2]}')
+
+# BreakingTheNews clusters for comparison
+print('\n=== BreakingTheNews clusters (for comparison) ===')
+bt_total = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%'\").fetchone()[0]
+bt_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%breakingthenews%' AND json_array_length(payload, '$.articles') = 1\").fetchone()[0]
+print(f'Total BT clusters: {bt_total}, singletons: {bt_singletons}, multi-article: {bt_total - bt_singletons}')
+
+# Cross-feed clusters (have both FT and BT sources)
+print('\n=== Cross-feed clusters (FT + BT merged) ===')
+cross = conn.execute(\"SELECT count(*) FROM clusters WHERE payload LIKE '%ft.com%' AND payload LIKE '%breakingthenews%'\").fetchone()[0]
+print(f'Cross-feed clusters: {cross}')
+
+# All clusters summary
+print('\n=== All clusters summary ===')
+all_total = conn.execute('SELECT count(*) FROM clusters').fetchone()[0]
+all_singletons = conn.execute(\"SELECT count(*) FROM clusters WHERE json_array_length(payload, '$.articles') = 1\").fetchone()[0]
+print(f'Total clusters: {all_total}, singletons: {all_singletons}, multi-article: {all_total - all_singletons}')
+
+# Feed state
+print('\n=== Feed state ===')
+feeds = conn.execute('SELECT feed_key, last_hash, last_item_count, enabled, updated_at FROM feed_state WHERE last_hash != \"\"').fetchall()
+for f in feeds:
+    print(f'  {f[0][:50]}  items={f[2]}  enabled={f[3]}  updated={f[4][:19]}')
+"
+
+echo ""
+echo "=== Recent poller clustering stats ==="
+# Quoted heredoc: bash passes the Python source through untouched.
+python3 - <<'PY'
+import sqlite3
+
+conn = sqlite3.connect('./data/news.sqlite')
+# poll_stats may not exist; sqlite3 reports that as an sqlite3.Error subclass,
+# which is printed instead of aborting the script.
+try:
+    cur = conn.execute('SELECT * FROM poll_stats ORDER BY rowid DESC LIMIT 5')
+    rows = cur.fetchall()
+    if rows:
+        # cursor.description holds the column names of this SELECT. The first
+        # field of pragma_table_info is the numeric column index (cid), so
+        # reading d[0] from it listed indexes rather than names.
+        cols = [d[0] for d in cur.description]
+        print(f'poll_stats columns: {cols}')
+        for r in rows:
+            print(f'  {r}')
+    else:
+        print('poll_stats table is empty')
+except sqlite3.Error as e:
+    print(f'poll_stats table: {e}')
+finally:
+    conn.close()
+PY