Procházet zdrojové kódy

feat: article_identity module, site_config DB table, debug_dedup tool

1. Deduplicate _article_key: single source of truth in
   news_mcp/article_identity.py (article_key + article_content_hash).
   Both cluster.py and sqlite_store.py now import from there.
   Also adds article_content_hash() for future content-change detection.

2. DB-backed site_config table: all tunable parameters in one place.
   Seeded from .env overrides or defaults on first startup.
   REST API: GET /api/v1/config, POST /api/v1/config/update,
   POST /api/v1/config/reset. Dashboard-tuneable without restart.
   Categories: clustering, enrichment, retention.

3. debug_dedup MCP tool: given a URL (+ optional title), reports:
   - Whether the article is in seen_articles
   - Which cluster it belongs to
   - Similarity signals (title, jaccard, cosine) against up to 20 recent
     clusters (the 10 with the highest title similarity are returned),
     with match decisions and thresholds
Lukas Goldschmidt před 6 dny
rodič
revize
b22882c580

+ 48 - 0
news_mcp/article_identity.py

@@ -0,0 +1,48 @@
+"""Article identity and content hashing — single source of truth.
+
+Used by:
+  - news_mcp.dedup.cluster  (clustering identity, orphan merge, stable cluster IDs)
+  - news_mcp.storage.sqlite_store  (seen_articles, dedup, upsert)
+  - scripts/backfill_seen_articles.py  (backfill)
+"""
+
+from __future__ import annotations
+
+import hashlib
+from typing import Any
+from urllib.parse import urlparse
+
+
+def article_key(article: dict[str, Any]) -> str:
+    """Return the deterministic identity key for an article.
+
+    The key is the last non-empty path segment of the article URL
+    (e.g. '/content/uuid' → 'uuid', '/Article/Slug/66427393' → '66427393').
+    When the URL has no path segments, or cannot be parsed, the stripped
+    URL itself is the key; when there is no URL at all, the title is used
+    (an empty string if that is missing too).
+
+    The query string and fragment never contribute to the key.
+
+    Args:
+        article: Article mapping; only ``url`` and ``title`` are read.
+
+    Returns:
+        The identity key. This is the primary dedup identity — two articles
+        with the same key are considered the same article regardless of
+        source.
+    """
+    url = str(article.get("url") or "").strip()
+    if not url:
+        return str(article.get("title") or "")
+    try:
+        path = urlparse(url).path
+    except ValueError:
+        # urlparse rejects malformed URLs (e.g. an unbalanced IPv6
+        # bracket); fall back to the raw URL instead of failing.
+        return url
+    segments = [segment for segment in path.split("/") if segment]
+    return segments[-1] if segments else url
+
+
+def article_content_hash(article: dict[str, Any]) -> str:
+    """Fingerprint an article's title and summary with SHA-1.
+
+    Both fields are stripped, joined as ``title|summary`` and hashed as
+    UTF-8. A missing or falsy field counts as an empty string.
+
+    Intended for spotting in-place updates (e.g. a stub that gets fleshed
+    out) where the URL stays the same but the content changes.
+
+    Returns:
+        The 40-character hexadecimal SHA-1 digest.
+    """
+    fields = (str(article.get(name) or "").strip() for name in ("title", "summary"))
+    return hashlib.sha1("|".join(fields).encode("utf-8")).hexdigest()

+ 3 - 13
news_mcp/dedup/cluster.py

@@ -6,13 +6,13 @@ import re
 from datetime import datetime, timezone, timedelta
 from difflib import SequenceMatcher
 from typing import Any, Dict, List
-from urllib.parse import urlparse
 
 from news_mcp.config import (
     NEWS_EMBEDDINGS_ENABLED,
     NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
     NEWS_CLUSTER_MAX_AGE_HOURS,
 )
+from news_mcp.article_identity import article_key, article_content_hash
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 
@@ -33,18 +33,8 @@ def _title_similarity(a: str, b: str) -> float:
     return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
 
 
-def _article_key(article: Dict[str, Any]) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+# For internal use — canonical name is article_key(article) from article_identity
+_article_key = article_key
 
 
 def _cluster_text(a: Dict[str, Any]) -> str:

+ 109 - 0
news_mcp/mcp_server_fastmcp.py

@@ -349,6 +349,70 @@ async def toggle_feed(feed_url: str, enabled: bool) -> dict:
     return {"ok": True, "feed_key": feed_url.strip(), "enabled": enabled, "details": updated}
 
 
+@mcp.tool(description="Debug dedup: inspect whether an article URL was already processed, which cluster it belongs to, and what similarity signals it would produce against existing clusters.")
+async def debug_dedup(url: str, title: str | None = None) -> dict:
+    """Report the dedup status of an article URL.
+
+    Args:
+        url: Article URL; the identity key is derived from it.
+        title: Optional article title. When given, similarity signals are
+            computed against recent clusters.
+
+    Returns:
+        A dict with:
+          - url, article_key: the input URL and the identity key derived
+            from it
+          - seen: whether the article_key is in seen_articles
+          - cluster_id, first_seen, stored_url: present only when seen
+          - similarity_signals, title_threshold, jaccard_threshold: present
+            only when a title is given. Signals are computed for the
+            clusters returned by get_latest_clusters_all_topics(
+            ttl_hours=24, limit=20); the 10 entries with the highest title
+            similarity are returned.
+    """
+    from news_mcp.article_identity import article_key
+    from news_mcp.dedup.cluster import _signals, _is_match
+    from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
+
+    art = {"url": url, "title": title or ""}
+    akey = article_key(art)
+    result = {"url": url, "article_key": akey}
+
+    store = SQLiteClusterStore(DB_PATH)
+    with store._conn() as conn:
+        # Check seen_articles
+        row = conn.execute(
+            "SELECT cluster_id, first_seen, url FROM seen_articles WHERE article_key=?",
+            (akey,),
+        ).fetchone()
+    if row:
+        result["seen"] = True
+        result["cluster_id"] = row[0]
+        result["first_seen"] = row[1]
+        result["stored_url"] = row[2]
+    else:
+        result["seen"] = False
+
+    # Similarity signals need a title to compare with.
+    if title:
+        recent = store.get_latest_clusters_all_topics(ttl_hours=24, limit=20)
+        signals_list = []
+        for cluster in recent:
+            sigs = _signals(art, cluster)
+            matched, signal_name, signal_value = _is_match(
+                sigs, embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
+            )
+            signals_list.append({
+                # `or ""` also covers keys stored with a None value, which
+                # a .get() default would pass through and crash on slicing.
+                "cluster_id": str(cluster.get("cluster_id") or "")[:12],
+                "headline": str(cluster.get("headline") or "")[:60],
+                "title_sim": round(sigs["title"], 3),
+                "jaccard": round(sigs["jaccard"], 3),
+                # A falsy cosine / match value (None or exactly 0) is
+                # reported as None.
+                "cosine": round(sigs["cosine"], 3) if sigs["cosine"] else None,
+                "matched": matched,
+                "match_signal": signal_name,
+                "match_value": round(signal_value, 3) if signal_value else None,
+            })
+        # Sort by best title similarity
+        signals_list.sort(key=lambda entry: entry["title_sim"], reverse=True)
+        result["similarity_signals"] = signals_list[:10]
+        result["title_threshold"] = 0.75  # DEFAULT_TITLE_THRESHOLD
+        result["jaccard_threshold"] = 0.55  # DEFAULT_JACCARD_THRESHOLD
+
+    return result
+
+
 @mcp.tool(description="Investigate a topic and return the newest deduplicated news clusters with entities and thematic keywords, sorted by recency.")
 async def get_latest_events(topic: str | None = None, limit: int = 5, include_articles: bool = False):
     limit = max(1, min(int(limit), 20))
@@ -1380,6 +1444,51 @@ async def api_feed_toggle(feed_url: str = Form(), enabled: bool = Form()):
         return _api_err(e, f"toggle({feed_url})")
 
 
+# ------------------------------------------------------------------ #
+#  Site config (dashboard-tuneable parameters)
+# ------------------------------------------------------------------ #
+
+@app.get("/api/v1/config")
+def api_config():
+    """Return every site_config row under the "config" key.
+
+    Any failure is converted into an error response by _api_err.
+    """
+    try:
+        from news_mcp.site_config import get_site_config
+        with _shared_store._conn() as db:
+            payload = {"config": get_site_config(db)}
+    except Exception as exc:
+        return _api_err(exc, "config")
+    return payload
+
+
+@app.post("/api/v1/config/update")
+async def api_config_update(key: str = Form(), value: str = Form()):
+    """Update one config parameter at runtime.
+
+    Key and value are stripped of surrounding whitespace before they are
+    written. Responds with 404 when the key does not exist; any other
+    failure is converted into an error response by _api_err.
+    """
+    try:
+        from news_mcp.site_config import set_config_value
+        clean_key, clean_value = key.strip(), value.strip()
+        with _shared_store._conn() as db:
+            updated = set_config_value(db, clean_key, clean_value)
+            db.commit()
+        if updated:
+            return {"ok": True, "key": clean_key, "value": clean_value}
+        return JSONResponse(status_code=404, content={"error": f"Config key not found: {key}"})
+    except Exception as exc:
+        return _api_err(exc, f"config/update({key})")
+
+
+@app.post("/api/v1/config/reset")
+async def api_config_reset():
+    """Reset all config to .env/defaults.
+
+    Deletes every row of site_config, re-seeds the table and commits.
+    The response reports how many rows were seeded.
+    """
+    try:
+        from news_mcp.site_config import seed_site_config
+        with _shared_store._conn() as db:
+            db.execute("DELETE FROM site_config")
+            row_count = seed_site_config(db)
+            db.commit()
+        response = {"ok": True, "seeded": row_count}
+        return response
+    except Exception as exc:
+        return _api_err(exc, "config/reset")
+
+
 @app.get("/health")
 def health():
     return {

+ 167 - 0
news_mcp/site_config.py

@@ -0,0 +1,167 @@
+"""DB-backed site configuration.
+
+All tunable parameters live in the `site_config` SQLite table.
+On startup, if the table is empty, it is seeded from .env overrides
+or Python defaults. After that, values are read from the DB — allowing
+runtime updates via the REST API without restart.
+
+Categories:
+  clustering  — similarity thresholds and merge criteria
+  enrichment  — LLM behavior, rate limits, embedding settings
+  retention   — pruning, age windows, refresh intervals
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+# ---------------------------------------------------------------------------
+#  Default registry — (key, default_value, type, category, description)
+#  Every default is kept as a string, the same form it is stored in the
+#  site_config table; `type` names the conversion applied by _coerce().
+#  The type is one of: float, int, bool, str
+#  The category is one of: clustering, enrichment, retention
+# ---------------------------------------------------------------------------
+
+CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [
+    # Clustering thresholds
+    ("title_threshold",             "0.75",  "float", "clustering", "Min title similarity to merge (SequenceMatcher)"),
+    ("jaccard_threshold",           "0.55",  "float", "clustering", "Min Jaccard token overlap to merge"),
+    ("dual_title_floor",            "0.55",  "float", "clustering", "Dual-signal: min title for title+jaccard merge"),
+    ("dual_jaccard_floor",          "0.25",  "float", "clustering", "Dual-signal: min jaccard for title+jaccard merge"),
+    ("early_exit_title",            "0.95",  "float", "clustering", "Early-exit title signal (both must be met)"),
+    ("early_exit_jaccard",          "0.80",  "float", "clustering", "Early-exit jaccard signal"),
+    ("consensus_cosine_floor",      "0.80",  "float", "clustering", "Consensus path: min cosine"),
+    ("consensus_jaccard_floor",     "0.30",  "float", "clustering", "Consensus path: min jaccard (or title)"),
+    ("consensus_title_floor",       "0.55",  "float", "clustering", "Consensus path: min title (or jaccard)"),
+    ("crosstopic_title_threshold",  "0.90",  "float", "clustering", "Cross-topic merge: min title similarity"),
+    ("embedding_similarity_threshold", "0.885", "float", "clustering", "Cosine threshold for embedding-only merge"),
+    ("cluster_max_age_hours",       "6",     "float", "clustering", "Cross-cycle merge window (hours, 0=off)"),
+
+    # Enrichment/LLM
+    ("embeddings_enabled",          "true",  "bool",  "enrichment", "Enable Ollama embedding computation"),
+    ("ollama_base_url",             "http://192.168.0.200:11434", "str", "enrichment", "Ollama API base URL"),
+    ("ollama_embedding_model",      "nomic-embed-text", "str", "enrichment", "Ollama embedding model name"),
+    ("extract_provider",            "groq",  "str",   "enrichment", "LLM provider for extraction"),
+    ("extract_model",               "llama4-16e", "str", "enrichment", "LLM model for extraction"),
+    ("summary_provider",            "groq",  "str",   "enrichment", "LLM provider for summarisation"),
+    ("summary_model",               "llama4-16e", "str", "enrichment", "LLM model for summarisation"),
+    ("enrichment_max_per_refresh",  "0",     "int",   "enrichment", "Max enrichments per cycle (0=unlimited)"),
+    ("enrich_other_topics_only",    "false", "bool",  "enrichment", "Only enrich 'other' topic (legacy guard)"),
+
+    # Retention/pruning
+    ("pruning_enabled",             "true",  "bool",  "retention", "Enable periodic cluster pruning"),
+    ("retention_days",              "10",    "float", "retention", "Delete clusters older than N days"),
+    ("prune_interval_hours",        "12",    "float", "retention", "Run prune every N hours"),
+    ("refresh_interval_seconds",    "300",   "int",   "retention", "Polling cycle interval (seconds)"),
+    ("background_refresh_enabled",  "true",  "bool",  "retention", "Enable background polling"),
+    ("default_lookback_hours",      "24",    "float", "retention", "Default lookback for read queries"),
+]
+
+# ---------------------------------------------------------------------------
+#  .env override map: CONFIG_KEY → ENV_VAR_NAME
+#  When seeding, if the env var is set, its value overrides the default
+#  and the row is recorded with source 'env'.
+#  Keys without an entry here (e.g. dual_title_floor, early_exit_title)
+#  are always seeded from CONFIG_DEFAULTS.
+# ---------------------------------------------------------------------------
+
+ENV_OVERRIDES: dict[str, str] = {
+    "title_threshold":             "NEWS_TITLE_THRESHOLD",
+    "jaccard_threshold":           "NEWS_JACCARD_THRESHOLD",
+    "embedding_similarity_threshold": "NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
+    "cluster_max_age_hours":       "NEWS_CLUSTER_MAX_AGE_HOURS",
+    "embeddings_enabled":          "NEWS_EMBEDDINGS_ENABLED",
+    "ollama_base_url":             "OLLAMA_BASE_URL",
+    "ollama_embedding_model":      "OLLAMA_EMBEDDING_MODEL",
+    "extract_provider":            "NEWS_EXTRACT_PROVIDER",
+    "extract_model":               "NEWS_EXTRACT_MODEL",
+    "summary_provider":            "NEWS_SUMMARY_PROVIDER",
+    "summary_model":               "NEWS_SUMMARY_MODEL",
+    "enrichment_max_per_refresh":  "ENRICHMENT_MAX_PER_REFRESH",
+    "enrich_other_topics_only":    "ENRICH_OTHER_TOPICS_ONLY",
+    "pruning_enabled":             "NEWS_PRUNING_ENABLED",
+    "retention_days":              "NEWS_RETENTION_DAYS",
+    "prune_interval_hours":        "NEWS_PRUNE_INTERVAL_HOURS",
+    "refresh_interval_seconds":    "NEWS_REFRESH_INTERVAL_SECONDS",
+    "background_refresh_enabled":  "NEWS_BACKGROUND_REFRESH_ENABLED",
+    "default_lookback_hours":      "NEWS_DEFAULT_LOOKBACK_HOURS",
+}
+
+
+def _coerce(value: str, typ: str) -> Any:
+    """Convert a string value to its declared type.
+
+    Args:
+        value: Raw string, as stored in site_config.
+        typ: Declared type name: "float", "int", "bool" or "str".
+
+    Returns:
+        The converted value. Booleans are true for "true", "1" or "yes"
+        (case-insensitive, surrounding whitespace ignored) and false for
+        anything else. Any other ``typ`` returns ``value`` unchanged.
+
+    Raises:
+        ValueError: If ``value`` is not a valid float or int literal.
+    """
+    if typ == "float":
+        return float(value)
+    if typ == "int":
+        return int(value)
+    if typ == "bool":
+        # float() and int() already tolerate surrounding whitespace; do the
+        # same here so " true " is not silently read as False.
+        return value.strip().lower() in ("true", "1", "yes")
+    return value  # str
+
+
+def seed_site_config(conn) -> int:
+    """Create the site_config table and seed it if empty.
+
+    Each key in CONFIG_DEFAULTS becomes one row. Its value comes from the
+    environment variable named in ENV_OVERRIDES when that variable holds a
+    non-blank value (source 'env'), otherwise from the registry default
+    (source 'default'). This function does not commit; that is left to the
+    caller.
+
+    Args:
+        conn: Open SQLite connection.
+
+    Returns:
+        The number of rows inserted (0 if already seeded).
+    """
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS site_config (
+            key         TEXT PRIMARY KEY,
+            value       TEXT NOT NULL,
+            type        TEXT NOT NULL DEFAULT 'str',
+            category    TEXT NOT NULL DEFAULT 'general',
+            description TEXT NOT NULL DEFAULT '',
+            source      TEXT NOT NULL DEFAULT 'default'
+        )
+    """)
+
+    count = conn.execute("SELECT count(*) FROM site_config").fetchone()[0]
+    if count > 0:
+        return 0
+
+    rows = []
+    for key, default, typ, category, description in CONFIG_DEFAULTS:
+        env_var = ENV_OVERRIDES.get(key)
+        # A variable that is set but blank (e.g. `NEWS_TITLE_THRESHOLD=`)
+        # carries no usable override, so it is treated as unset rather than
+        # stored as an empty value for a typed parameter.
+        env_val = (os.getenv(env_var) or "").strip() if env_var else ""
+        if env_val:
+            rows.append((key, env_val, typ, category, description, "env"))
+        else:
+            rows.append((key, default, typ, category, description, "default"))
+    conn.executemany(
+        "INSERT INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
+        rows,
+    )
+    return len(rows)
+
+
+def get_site_config(conn) -> list[dict]:
+    """List every site_config row as a dict, ordered by category then key.
+
+    The table is created first if it does not exist yet, so the call
+    returns an empty list on a fresh database.
+
+    Each dict carries the keys: key, value, type, category, description,
+    source.
+    """
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS site_config (
+            key         TEXT PRIMARY KEY,
+            value       TEXT NOT NULL,
+            type        TEXT NOT NULL DEFAULT 'str',
+            category    TEXT NOT NULL DEFAULT 'general',
+            description TEXT NOT NULL DEFAULT '',
+            source      TEXT NOT NULL DEFAULT 'default'
+        )
+    """)
+    columns = ("key", "value", "type", "category", "description", "source")
+    cursor = conn.execute(
+        "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
+    )
+    return [dict(zip(columns, record)) for record in cursor.fetchall()]
+
+
+def get_config_value(conn, key: str) -> str | None:
+    """Look up the stored string value for ``key``.
+
+    Returns None when site_config has no row for the key.
+    """
+    found = conn.execute("SELECT value FROM site_config WHERE key=?", (key,)).fetchone()
+    if not found:
+        return None
+    return found[0]
+
+
+def set_config_value(conn, key: str, value: str) -> bool:
+    """Update a single config value after validating it against its type.
+
+    The row's declared type is looked up first and ``value`` is run through
+    _coerce(), so a string that cannot be converted (e.g. "abc" for a
+    float parameter) is rejected instead of being stored. On success the
+    row's source is set to 'api'. This function does not commit.
+
+    Args:
+        conn: Open SQLite connection.
+        key: Config key to update.
+        value: New value, in its string form.
+
+    Returns:
+        True if the key existed and was updated, False if there is no such
+        key.
+
+    Raises:
+        ValueError: If ``value`` does not convert to the row's declared
+            type; nothing is written in that case.
+    """
+    row = conn.execute("SELECT type FROM site_config WHERE key=?", (key,)).fetchone()
+    if row is None:
+        return False
+    declared_type = row[0]
+    try:
+        _coerce(value, declared_type)
+    except ValueError as err:
+        raise ValueError(f"Invalid {declared_type} value for {key!r}: {value!r}") from err
+    cur = conn.execute("UPDATE site_config SET value=?, source='api' WHERE key=?", (value, key))
+    return cur.rowcount > 0

+ 10 - 14
news_mcp/storage/sqlite_store.py

@@ -6,9 +6,9 @@ from dataclasses import dataclass
 from datetime import datetime, timezone, timedelta
 from pathlib import Path
 from typing import Any
-from urllib.parse import urlparse
 from email.utils import parsedate_to_datetime
 
+from news_mcp.article_identity import article_key
 from news_mcp.config import (
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNING_ENABLED,
@@ -87,19 +87,8 @@ class ClusterRow:
 
 META_LAST_PRUNE_AT = "last_prune_at"
 
-
-def _article_key(article: dict[str, Any]) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+# For internal use — canonical name is article_key(article) from article_identity
+_article_key = article_key
 
 
 def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
@@ -329,6 +318,13 @@ class SQLiteClusterStore:
                 """
             )
 
+            # Seed site_config from .env / defaults (no-op if already populated)
+            from news_mcp.site_config import seed_site_config
+            seeded = seed_site_config(conn)
+            if seeded:
+                import logging
+                logging.getLogger(__name__).info("site_config: seeded %d rows from env/defaults", seeded)
+
     def upsert_clusters(self, clusters: list[dict], topic: str) -> None:
         now = datetime.now(timezone.utc)
         with self._conn() as conn:

+ 5 - 14
scripts/backfill_seen_articles.py

@@ -11,21 +11,12 @@ import json
 import sqlite3
 import sys
 from datetime import datetime, timezone
-from urllib.parse import urlparse
 
+# Add parent dir so we can import news_mcp when run as a standalone script
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
-def _article_key(article: dict) -> str:
-    url = str(article.get("url") or "").strip()
-    if not url:
-        return str(article.get("title") or "")
-    try:
-        parsed = urlparse(url)
-        parts = [p for p in parsed.path.split("/") if p]
-        if parts:
-            return parts[-1]
-    except Exception:
-        pass
-    return url
+from news_mcp.article_identity import article_key
 
 
 def main(db_path: str = "./data/news.sqlite"):
@@ -53,7 +44,7 @@ def main(db_path: str = "./data/news.sqlite"):
             skipped += 1
             continue
         for art in payload.get("articles", []):
-            akey = _article_key(art)
+            akey = article_key(art)
             if not akey:
                 continue
             art_url = str(art.get("url") or "").strip()