"""DB-backed site configuration. All tunable parameters live in the `site_config` SQLite table. On startup, if the table is empty, it is seeded from .env overrides or Python defaults. After that, values are read from the DB — allowing runtime updates via the REST API without restart. Categories: clustering — similarity thresholds and merge criteria enrichment — LLM behavior, rate limits, embedding settings retention — pruning, age windows, refresh intervals """ from __future__ import annotations import os from typing import Any # --------------------------------------------------------------------------- # Default registry — (key, default_value, type, category, description) # The type is one of: float, int, bool, str # --------------------------------------------------------------------------- CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [ # Clustering thresholds ("title_threshold", "0.75", "float", "clustering", "Min title similarity to merge (SequenceMatcher)"), ("jaccard_threshold", "0.55", "float", "clustering", "Min Jaccard token overlap to merge"), ("dual_title_floor", "0.55", "float", "clustering", "Dual-signal: min title for title+jaccard merge"), ("dual_jaccard_floor", "0.25", "float", "clustering", "Dual-signal: min jaccard for title+jaccard merge"), ("early_exit_title", "0.95", "float", "clustering", "Early-exit title signal (both must be met)"), ("early_exit_jaccard", "0.80", "float", "clustering", "Early-exit jaccard signal"), ("consensus_cosine_floor", "0.80", "float", "clustering", "Consensus path: min cosine"), ("consensus_jaccard_floor", "0.30", "float", "clustering", "Consensus path: min jaccard (or title)"), ("consensus_title_floor", "0.55", "float", "clustering", "Consensus path: min title (or jaccard)"), ("crosstopic_title_threshold", "0.90", "float", "clustering", "Cross-topic merge: min title similarity"), ("embedding_similarity_threshold", "0.885", "float", "clustering", "Cosine threshold for embedding-only merge"), 
("cluster_max_age_hours", "6", "float", "clustering", "Cross-cycle merge window (hours, 0=off)"), # Enrichment/LLM ("embeddings_enabled", "true", "bool", "enrichment", "Enable Ollama embedding computation"), ("ollama_base_url", "http://192.168.0.200:11434", "str", "enrichment", "Ollama API base URL"), ("ollama_embedding_model", "nomic-embed-text", "str", "enrichment", "Ollama embedding model name"), ("extract_provider", "groq", "str", "enrichment", "LLM provider for extraction"), ("extract_model", "llama4-16e", "str", "enrichment", "LLM model for extraction"), ("summary_provider", "groq", "str", "enrichment", "LLM provider for summarisation"), ("summary_model", "llama4-16e", "str", "enrichment", "LLM model for summarisation"), ("enrichment_max_per_refresh", "0", "int", "enrichment", "Max enrichments per cycle (0=unlimited)"), ("enrich_other_topics_only", "false", "bool", "enrichment", "Only enrich 'other' topic (legacy guard)"), # Retention/pruning ("pruning_enabled", "true", "bool", "retention", "Enable periodic cluster pruning"), ("retention_days", "10", "float", "retention", "Delete clusters older than N days"), ("prune_interval_hours", "12", "float", "retention", "Run prune every N hours"), ("refresh_interval_seconds", "300", "int", "retention", "Polling cycle interval (seconds)"), ("background_refresh_enabled", "true", "bool", "retention", "Enable background polling"), ("default_lookback_hours", "24", "float", "retention", "Default lookback for read queries"), ] # --------------------------------------------------------------------------- # .env override map: CONFIG_KEY → ENV_VAR_NAME # When seeding, if the env var is set, its value overrides the default. 
# ---------------------------------------------------------------------------
ENV_OVERRIDES: dict[str, str] = {
    "title_threshold": "NEWS_TITLE_THRESHOLD",
    "jaccard_threshold": "NEWS_JACCARD_THRESHOLD",
    "embedding_similarity_threshold": "NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
    "cluster_max_age_hours": "NEWS_CLUSTER_MAX_AGE_HOURS",
    "embeddings_enabled": "NEWS_EMBEDDINGS_ENABLED",
    "ollama_base_url": "OLLAMA_BASE_URL",
    "ollama_embedding_model": "OLLAMA_EMBEDDING_MODEL",
    "extract_provider": "NEWS_EXTRACT_PROVIDER",
    "extract_model": "NEWS_EXTRACT_MODEL",
    "summary_provider": "NEWS_SUMMARY_PROVIDER",
    "summary_model": "NEWS_SUMMARY_MODEL",
    "enrichment_max_per_refresh": "ENRICHMENT_MAX_PER_REFRESH",
    "enrich_other_topics_only": "ENRICH_OTHER_TOPICS_ONLY",
    "pruning_enabled": "NEWS_PRUNING_ENABLED",
    "retention_days": "NEWS_RETENTION_DAYS",
    "prune_interval_hours": "NEWS_PRUNE_INTERVAL_HOURS",
    "refresh_interval_seconds": "NEWS_REFRESH_INTERVAL_SECONDS",
    "background_refresh_enabled": "NEWS_BACKGROUND_REFRESH_ENABLED",
    "default_lookback_hours": "NEWS_DEFAULT_LOOKBACK_HOURS",
}

# Single definition of the table schema so the functions that create the
# table on demand cannot drift apart.
_SITE_CONFIG_DDL = """
    CREATE TABLE IF NOT EXISTS site_config (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL,
        type TEXT NOT NULL DEFAULT 'str',
        category TEXT NOT NULL DEFAULT 'general',
        description TEXT NOT NULL DEFAULT '',
        source TEXT NOT NULL DEFAULT 'default'
    )
"""

# Spellings (compared case-insensitively, after trimming) that mean True.
_TRUTHY = frozenset({"true", "1", "yes", "on"})


def _ensure_site_config_table(conn) -> None:
    """Create the `site_config` table if it does not exist yet."""
    conn.execute(_SITE_CONFIG_DDL)


def _coerce(value: str, typ: str) -> Any:
    """Convert a stored string value to its declared type.

    Args:
        value: The raw string as stored in the table or read from the
            environment.
        typ: One of "float", "int", "bool" or "str". Any other name is
            treated like "str" and the value is returned unchanged.

    Returns:
        The converted value. For "bool", surrounding whitespace and case are
        ignored and "true", "1", "yes" and "on" map to True; everything
        else maps to False.

    Raises:
        ValueError: If a "float" or "int" value cannot be parsed.
    """
    if typ == "float":
        return float(value)
    if typ == "int":
        return int(value)
    if typ == "bool":
        # Trim first: values coming from .env files often carry stray
        # whitespace, which previously turned "true " into False.
        return value.strip().lower() in _TRUTHY
    return value  # str


def seed_site_config(conn) -> int:
    """Create the site_config table and seed it if empty.

    For every entry in CONFIG_DEFAULTS the value comes from the mapped
    environment variable (see ENV_OVERRIDES) when that variable is set and
    parses as the entry's declared type; otherwise the Python default is
    used. An override that does not parse (for example an empty string for
    a float) is skipped with a logged warning instead of being persisted.

    This function does not commit; the caller controls the transaction.

    NOTE(review): once the table holds any row nothing is inserted, so keys
    added to CONFIG_DEFAULTS after the first seed are not backfilled.

    Args:
        conn: SQLite connection (anything with a sqlite3-style `execute()`).

    Returns:
        The number of rows inserted (0 if already seeded).
    """
    # Local import: the file's import header lives outside this block.
    import logging

    _ensure_site_config_table(conn)

    existing = conn.execute("SELECT count(*) FROM site_config").fetchone()[0]
    if existing > 0:
        return 0

    log = logging.getLogger(__name__)
    inserted = 0
    for key, default, typ, category, description in CONFIG_DEFAULTS:
        value, source = default, "default"

        env_var = ENV_OVERRIDES.get(key)
        env_val = os.getenv(env_var) if env_var else None
        if env_val is not None:
            try:
                _coerce(env_val, typ)  # validation only; the raw text is stored
            except ValueError:
                log.warning(
                    "Ignoring %s=%r: not a valid %s; seeding %r with default %r",
                    env_var, env_val, typ, key, default,
                )
            else:
                value, source = env_val, "env"

        conn.execute(
            "INSERT INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
            (key, value, typ, category, description, source),
        )
        inserted += 1
    return inserted


def get_site_config(conn) -> list[dict]:
    """Return all config rows as dicts.

    The table is created first if it is missing, so this is safe to call
    before seeding (it then returns an empty list).

    Args:
        conn: SQLite connection (anything with a sqlite3-style `execute()`).

    Returns:
        One dict per row with the keys "key", "value", "type", "category",
        "description" and "source", ordered by category and then key.
    """
    _ensure_site_config_table(conn)
    columns = ("key", "value", "type", "category", "description", "source")
    rows = conn.execute(
        "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
    ).fetchall()
    return [dict(zip(columns, row)) for row in rows]


def get_config_value(conn, key: str) -> str | None:
    """Get a single config value by key.

    Unlike get_site_config, this does not create the table when it is
    missing.

    Args:
        conn: SQLite connection (anything with a sqlite3-style `execute()`).
        key: Config key to look up.

    Returns:
        The stored value, still in its string form, or None if the key is
        not found.
    """
    row = conn.execute("SELECT value FROM site_config WHERE key=?", (key,)).fetchone()
    return row[0] if row else None


def set_config_value(conn, key: str, value: str) -> bool:
    """Update a single config value and mark its source as 'api'.

    Only existing keys are updated; unknown keys are not inserted. This
    function does not commit.

    NOTE(review): the new value is not checked against the row's declared
    type here — callers are presumably expected to validate; confirm.

    Args:
        conn: SQLite connection (anything with a sqlite3-style `execute()`).
        key: Config key to update.
        value: New value in string form.

    Returns:
        True if the key existed (a row was updated), False otherwise.
    """
    cur = conn.execute("UPDATE site_config SET value=?, source='api' WHERE key=?", (value, key))
    return cur.rowcount > 0