|
@@ -0,0 +1,167 @@
|
|
|
|
|
+"""DB-backed site configuration.
|
|
|
|
|
+
|
|
|
|
|
+All tunable parameters live in the `site_config` SQLite table.
|
|
|
|
|
+On startup, if the table is empty, it is seeded from .env overrides
|
|
|
|
|
+or Python defaults. After that, values are read from the DB — allowing
|
|
|
|
|
+runtime updates via the REST API without restart.
|
|
|
|
|
+
|
|
|
|
|
+Categories:
|
|
|
|
|
+ clustering — similarity thresholds and merge criteria
|
|
|
|
|
+ enrichment — LLM behavior, rate limits, embedding settings
|
|
|
|
|
+ retention — pruning, age windows, refresh intervals
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+import os
|
|
|
|
|
+from typing import Any
|
|
|
|
|
+
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+# Default registry — (key, default_value, type, category, description)
|
|
|
|
|
+# The type is one of: float, int, bool, str
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+
|
|
|
|
|
# Registry rows are (key, default_value, type, category, description).
# Rows are grouped per category so the category string is written once per
# section; the resulting list has the same rows in the same order as a flat
# literal would.
CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [
    # Clustering thresholds (every entry in this section is a float).
    *(
        (name, fallback, "float", "clustering", blurb)
        for name, fallback, blurb in (
            ("title_threshold", "0.75", "Min title similarity to merge (SequenceMatcher)"),
            ("jaccard_threshold", "0.55", "Min Jaccard token overlap to merge"),
            ("dual_title_floor", "0.55", "Dual-signal: min title for title+jaccard merge"),
            ("dual_jaccard_floor", "0.25", "Dual-signal: min jaccard for title+jaccard merge"),
            ("early_exit_title", "0.95", "Early-exit title signal (both must be met)"),
            ("early_exit_jaccard", "0.80", "Early-exit jaccard signal"),
            ("consensus_cosine_floor", "0.80", "Consensus path: min cosine"),
            ("consensus_jaccard_floor", "0.30", "Consensus path: min jaccard (or title)"),
            ("consensus_title_floor", "0.55", "Consensus path: min title (or jaccard)"),
            ("crosstopic_title_threshold", "0.90", "Cross-topic merge: min title similarity"),
            ("embedding_similarity_threshold", "0.885", "Cosine threshold for embedding-only merge"),
            ("cluster_max_age_hours", "6", "Cross-cycle merge window (hours, 0=off)"),
        )
    ),
    # Enrichment/LLM
    *(
        (name, fallback, kind, "enrichment", blurb)
        for name, fallback, kind, blurb in (
            ("embeddings_enabled", "true", "bool", "Enable Ollama embedding computation"),
            ("ollama_base_url", "http://192.168.0.200:11434", "str", "Ollama API base URL"),
            ("ollama_embedding_model", "nomic-embed-text", "str", "Ollama embedding model name"),
            ("extract_provider", "groq", "str", "LLM provider for extraction"),
            ("extract_model", "llama4-16e", "str", "LLM model for extraction"),
            ("summary_provider", "groq", "str", "LLM provider for summarisation"),
            ("summary_model", "llama4-16e", "str", "LLM model for summarisation"),
            ("enrichment_max_per_refresh", "0", "int", "Max enrichments per cycle (0=unlimited)"),
            ("enrich_other_topics_only", "false", "bool", "Only enrich 'other' topic (legacy guard)"),
        )
    ),
    # Retention/pruning
    *(
        (name, fallback, kind, "retention", blurb)
        for name, fallback, kind, blurb in (
            ("pruning_enabled", "true", "bool", "Enable periodic cluster pruning"),
            ("retention_days", "10", "float", "Delete clusters older than N days"),
            ("prune_interval_hours", "12", "float", "Run prune every N hours"),
            ("refresh_interval_seconds", "300", "int", "Polling cycle interval (seconds)"),
            ("background_refresh_enabled", "true", "bool", "Enable background polling"),
            ("default_lookback_hours", "24", "float", "Default lookback for read queries"),
        )
    ),
]
|
|
|
|
|
+
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+# .env override map: CONFIG_KEY → ENV_VAR_NAME
|
|
|
|
|
+# When seeding, if the env var is set, its value overrides the default.
|
|
|
|
|
+# ---------------------------------------------------------------------------
|
|
|
|
|
+
|
|
|
|
|
# Maps a config key to the environment variable that may override its
# default at seed time. Keys absent from this mapping have no env override.
ENV_OVERRIDES: dict[str, str] = {
    # clustering
    "title_threshold": "NEWS_TITLE_THRESHOLD",
    "jaccard_threshold": "NEWS_JACCARD_THRESHOLD",
    "embedding_similarity_threshold": "NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
    "cluster_max_age_hours": "NEWS_CLUSTER_MAX_AGE_HOURS",
    # enrichment
    "embeddings_enabled": "NEWS_EMBEDDINGS_ENABLED",
    "ollama_base_url": "OLLAMA_BASE_URL",
    "ollama_embedding_model": "OLLAMA_EMBEDDING_MODEL",
    "extract_provider": "NEWS_EXTRACT_PROVIDER",
    "extract_model": "NEWS_EXTRACT_MODEL",
    "summary_provider": "NEWS_SUMMARY_PROVIDER",
    "summary_model": "NEWS_SUMMARY_MODEL",
    "enrichment_max_per_refresh": "ENRICHMENT_MAX_PER_REFRESH",
    "enrich_other_topics_only": "ENRICH_OTHER_TOPICS_ONLY",
    # retention
    "pruning_enabled": "NEWS_PRUNING_ENABLED",
    "retention_days": "NEWS_RETENTION_DAYS",
    "prune_interval_hours": "NEWS_PRUNE_INTERVAL_HOURS",
    "refresh_interval_seconds": "NEWS_REFRESH_INTERVAL_SECONDS",
    "background_refresh_enabled": "NEWS_BACKGROUND_REFRESH_ENABLED",
    "default_lookback_hours": "NEWS_DEFAULT_LOOKBACK_HOURS",
}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _coerce(value: str, typ: str) -> Any:
|
|
|
|
|
+ """Convert a string value to its declared type."""
|
|
|
|
|
+ if typ == "float":
|
|
|
|
|
+ return float(value)
|
|
|
|
|
+ if typ == "int":
|
|
|
|
|
+ return int(value)
|
|
|
|
|
+ if typ == "bool":
|
|
|
|
|
+ return value.lower() in ("true", "1", "yes")
|
|
|
|
|
+ return value # str
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def seed_site_config(conn) -> int:
    """Create the site_config table and seed any registry keys it lacks.

    Every entry of CONFIG_DEFAULTS is inserted with ``INSERT OR IGNORE``:

    * rows that already exist are left untouched, so values changed at
      runtime are never overwritten;
    * keys added to CONFIG_DEFAULTS after the table was first seeded are
      inserted on the next call instead of staying missing;
    * concurrent seeders cannot trip the primary-key constraint.

    For each inserted row, the value comes from the environment variable
    named in ENV_OVERRIDES when that variable is set (source ``"env"``),
    otherwise from the registry default (source ``"default"``).

    This function does not commit.
    NOTE(review): presumably the caller commits the transaction; confirm.

    Args:
        conn: Connection exposing a sqlite3-style ``execute`` method.

    Returns:
        The number of rows inserted (0 if already seeded).
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS site_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL,
            type TEXT NOT NULL DEFAULT 'str',
            category TEXT NOT NULL DEFAULT 'general',
            description TEXT NOT NULL DEFAULT '',
            source TEXT NOT NULL DEFAULT 'default'
        )
    """)

    inserted = 0
    for key, default, typ, category, description in CONFIG_DEFAULTS:
        # An environment override, when set, wins over the registry default.
        env_var = ENV_OVERRIDES.get(key)
        env_val = os.getenv(env_var) if env_var else None
        if env_val is not None:
            value, source = env_val, "env"
        else:
            value, source = default, "default"

        cur = conn.execute(
            "INSERT OR IGNORE INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
            (key, value, typ, category, description, source),
        )
        # An ignored insert (key already present) reports no changed rows.
        if cur.rowcount > 0:
            inserted += 1
    return inserted
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_site_config(conn) -> list[dict]:
    """Fetch every site_config row as a dict.

    The table is created first if it does not exist yet, so calling this on
    a fresh database yields an empty list instead of an error.

    Args:
        conn: Connection exposing a sqlite3-style ``execute`` method.

    Returns:
        One dict per row with the keys ``key``, ``value``, ``type``,
        ``category``, ``description`` and ``source``, ordered by category
        and then by key.
    """
    conn.execute("""
        CREATE TABLE IF NOT EXISTS site_config (
            key TEXT PRIMARY KEY,
            value TEXT NOT NULL,
            type TEXT NOT NULL DEFAULT 'str',
            category TEXT NOT NULL DEFAULT 'general',
            description TEXT NOT NULL DEFAULT '',
            source TEXT NOT NULL DEFAULT 'default'
        )
    """)
    # Field names in the same order as the SELECT column list below.
    fields = ("key", "value", "type", "category", "description", "source")
    cursor = conn.execute(
        "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
    )
    return [dict(zip(fields, record)) for record in cursor.fetchall()]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def get_config_value(conn, key: str) -> str | None:
    """Look up the stored value for one config key.

    Args:
        conn: Connection exposing a sqlite3-style ``execute`` method.
        key: Config key to look up.

    Returns:
        The stored value, or None when no row has that key.
    """
    cursor = conn.execute("SELECT value FROM site_config WHERE key=?", (key,))
    match = cursor.fetchone()
    if not match:
        return None
    return match[0]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def set_config_value(conn, key: str, value: str) -> bool:
    """Overwrite the value of an existing config key.

    The row's ``source`` column is set to ``'api'`` in the same statement.
    No row is created when the key is absent.

    Args:
        conn: Connection exposing a sqlite3-style ``execute`` method.
        key: Config key to update.
        value: New value to store.

    Returns:
        True if the key existed (a row was updated), False otherwise.
    """
    result = conn.execute(
        "UPDATE site_config SET value=?, source='api' WHERE key=?",
        (value, key),
    )
    rows_changed = result.rowcount
    return rows_changed > 0
|