# site_config.py
"""DB-backed site configuration.

All tunable parameters live in the `site_config` SQLite table.
On startup, if the table is empty, it is seeded from .env overrides
or Python defaults. After that, values are read from the DB, allowing
runtime updates via the REST API without a restart.

Categories:
    clustering - similarity thresholds and merge criteria
    enrichment - LLM behavior, rate limits, embedding settings
    retention  - pruning, age windows, refresh intervals
"""
from __future__ import annotations

import logging
import os
from typing import Any
  14. # ---------------------------------------------------------------------------
  15. # Default registry — (key, default_value, type, category, description)
  16. # The type is one of: float, int, bool, str
  17. # ---------------------------------------------------------------------------
  18. CONFIG_DEFAULTS: list[tuple[str, str, str, str, str]] = [
  19. # Clustering thresholds
  20. ("title_threshold", "0.75", "float", "clustering", "Min title similarity to merge (SequenceMatcher)"),
  21. ("jaccard_threshold", "0.55", "float", "clustering", "Min Jaccard token overlap to merge"),
  22. ("dual_title_floor", "0.55", "float", "clustering", "Dual-signal: min title for title+jaccard merge"),
  23. ("dual_jaccard_floor", "0.25", "float", "clustering", "Dual-signal: min jaccard for title+jaccard merge"),
  24. ("early_exit_title", "0.95", "float", "clustering", "Early-exit title signal (both must be met)"),
  25. ("early_exit_jaccard", "0.80", "float", "clustering", "Early-exit jaccard signal"),
  26. ("consensus_cosine_floor", "0.80", "float", "clustering", "Consensus path: min cosine"),
  27. ("consensus_jaccard_floor", "0.30", "float", "clustering", "Consensus path: min jaccard (or title)"),
  28. ("consensus_title_floor", "0.55", "float", "clustering", "Consensus path: min title (or jaccard)"),
  29. ("crosstopic_title_threshold", "0.90", "float", "clustering", "Cross-topic merge: min title similarity"),
  30. ("embedding_similarity_threshold", "0.885", "float", "clustering", "Cosine threshold for embedding-only merge"),
  31. ("cluster_max_age_hours", "6", "float", "clustering", "Cross-cycle merge window (hours, 0=off)"),
  32. # Enrichment/LLM
  33. ("embeddings_enabled", "true", "bool", "enrichment", "Enable Ollama embedding computation"),
  34. ("ollama_base_url", "http://192.168.0.200:11434", "str", "enrichment", "Ollama API base URL"),
  35. ("ollama_embedding_model", "nomic-embed-text", "str", "enrichment", "Ollama embedding model name"),
  36. ("extract_provider", "groq", "str", "enrichment", "LLM provider for extraction"),
  37. ("extract_model", "llama4-16e", "str", "enrichment", "LLM model for extraction"),
  38. ("summary_provider", "groq", "str", "enrichment", "LLM provider for summarisation"),
  39. ("summary_model", "llama4-16e", "str", "enrichment", "LLM model for summarisation"),
  40. ("enrichment_max_per_refresh", "0", "int", "enrichment", "Max enrichments per cycle (0=unlimited)"),
  41. ("enrich_other_topics_only", "false", "bool", "enrichment", "Only enrich 'other' topic (legacy guard)"),
  42. # Retention/pruning
  43. ("pruning_enabled", "true", "bool", "retention", "Enable periodic cluster pruning"),
  44. ("retention_days", "10", "float", "retention", "Delete clusters older than N days"),
  45. ("prune_interval_hours", "12", "float", "retention", "Run prune every N hours"),
  46. ("refresh_interval_seconds", "300", "int", "retention", "Polling cycle interval (seconds)"),
  47. ("background_refresh_enabled", "true", "bool", "retention", "Enable background polling"),
  48. ("default_lookback_hours", "24", "float", "retention", "Default lookback for read queries"),
  49. ]
  50. # ---------------------------------------------------------------------------
  51. # .env override map: CONFIG_KEY → ENV_VAR_NAME
  52. # When seeding, if the env var is set, its value overrides the default.
  53. # ---------------------------------------------------------------------------
  54. ENV_OVERRIDES: dict[str, str] = {
  55. "title_threshold": "NEWS_TITLE_THRESHOLD",
  56. "jaccard_threshold": "NEWS_JACCARD_THRESHOLD",
  57. "embedding_similarity_threshold": "NEWS_EMBEDDING_SIMILARITY_THRESHOLD",
  58. "cluster_max_age_hours": "NEWS_CLUSTER_MAX_AGE_HOURS",
  59. "embeddings_enabled": "NEWS_EMBEDDINGS_ENABLED",
  60. "ollama_base_url": "OLLAMA_BASE_URL",
  61. "ollama_embedding_model": "OLLAMA_EMBEDDING_MODEL",
  62. "extract_provider": "NEWS_EXTRACT_PROVIDER",
  63. "extract_model": "NEWS_EXTRACT_MODEL",
  64. "summary_provider": "NEWS_SUMMARY_PROVIDER",
  65. "summary_model": "NEWS_SUMMARY_MODEL",
  66. "enrichment_max_per_refresh": "ENRICHMENT_MAX_PER_REFRESH",
  67. "enrich_other_topics_only": "ENRICH_OTHER_TOPICS_ONLY",
  68. "pruning_enabled": "NEWS_PRUNING_ENABLED",
  69. "retention_days": "NEWS_RETENTION_DAYS",
  70. "prune_interval_hours": "NEWS_PRUNE_INTERVAL_HOURS",
  71. "refresh_interval_seconds": "NEWS_REFRESH_INTERVAL_SECONDS",
  72. "background_refresh_enabled": "NEWS_BACKGROUND_REFRESH_ENABLED",
  73. "default_lookback_hours": "NEWS_DEFAULT_LOOKBACK_HOURS",
  74. }
  75. def _coerce(value: str, typ: str) -> Any:
  76. """Convert a string value to its declared type."""
  77. if typ == "float":
  78. return float(value)
  79. if typ == "int":
  80. return int(value)
  81. if typ == "bool":
  82. return value.lower() in ("true", "1", "yes")
  83. return value # str
  84. def seed_site_config(conn) -> int:
  85. """Create the site_config table and seed it if empty.
  86. Returns the number of rows inserted (0 if already seeded).
  87. """
  88. conn.execute("""
  89. CREATE TABLE IF NOT EXISTS site_config (
  90. key TEXT PRIMARY KEY,
  91. value TEXT NOT NULL,
  92. type TEXT NOT NULL DEFAULT 'str',
  93. category TEXT NOT NULL DEFAULT 'general',
  94. description TEXT NOT NULL DEFAULT '',
  95. source TEXT NOT NULL DEFAULT 'default'
  96. )
  97. """)
  98. count = conn.execute("SELECT count(*) FROM site_config").fetchone()[0]
  99. if count > 0:
  100. return 0
  101. inserted = 0
  102. for key, default, typ, category, description in CONFIG_DEFAULTS:
  103. # Check .env override
  104. env_var = ENV_OVERRIDES.get(key)
  105. env_val = os.getenv(env_var) if env_var else None
  106. if env_val is not None:
  107. value = env_val
  108. source = "env"
  109. else:
  110. value = default
  111. source = "default"
  112. conn.execute(
  113. "INSERT INTO site_config(key, value, type, category, description, source) VALUES(?,?,?,?,?,?)",
  114. (key, value, typ, category, description, source),
  115. )
  116. inserted += 1
  117. return inserted
  118. def get_site_config(conn) -> list[dict]:
  119. """Return all config rows as dicts."""
  120. conn.execute("""
  121. CREATE TABLE IF NOT EXISTS site_config (
  122. key TEXT PRIMARY KEY,
  123. value TEXT NOT NULL,
  124. type TEXT NOT NULL DEFAULT 'str',
  125. category TEXT NOT NULL DEFAULT 'general',
  126. description TEXT NOT NULL DEFAULT '',
  127. source TEXT NOT NULL DEFAULT 'default'
  128. )
  129. """)
  130. rows = conn.execute(
  131. "SELECT key, value, type, category, description, source FROM site_config ORDER BY category, key"
  132. ).fetchall()
  133. return [
  134. {"key": r[0], "value": r[1], "type": r[2], "category": r[3], "description": r[4], "source": r[5]}
  135. for r in rows
  136. ]
  137. def get_config_value(conn, key: str) -> str | None:
  138. """Get a single config value by key. Returns None if not found."""
  139. row = conn.execute("SELECT value FROM site_config WHERE key=?", (key,)).fetchone()
  140. return row[0] if row else None
  141. def set_config_value(conn, key: str, value: str) -> bool:
  142. """Update a single config value. Returns True if the key existed."""
  143. cur = conn.execute("UPDATE site_config SET value=?, source='api' WHERE key=?", (value, key))
  144. return cur.rowcount > 0