|
@@ -30,12 +30,12 @@ from TTS.api import TTS
|
|
|
|
|
|
|
|
# ─── Paths & constants ────────────────────────────────────────────────────────

# Reference speaker WAVs live here; computed embeddings are cached on disk.
VOICE_DIR = Path("/voices")
CACHE_DIR = Path("/cache")

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
SAMPLE_RATE = 24000

# Below this fraction of free VRAM the server pins work to the CPU instead.
VRAM_HEADROOM = 0.20  # fall back to CPU when VRAM < 20% free
# Per-chunk character budget; XTTS hard-limit ~400 tokens ≈ 250 chars.
MAX_CHUNK_LEN = 200

VOICE_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)
|
|
@@ -47,39 +47,263 @@ _device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
# Load XTTS once at startup and move it to the chosen device.
tts = TTS(MODEL_NAME).to(_device)
print(f"Model loaded on {_device}.")

# Serialise all model access so concurrent requests don't race on .to() calls
_model_lock = threading.Lock()

app = FastAPI()

# In-memory cache: voice name → speaker-embedding data (filled on first use).
embedding_cache: dict = {}
|
|
|
|
|
|
|
|
-# ─── Text helpers ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
# ─── Acronym / symbol tables ──────────────────────────────────────────────────
#
# Keys are matched as whole tokens by the compiled pattern below; values are
# phonetic spellings that XTTS reads out letter by letter.  Hyphens between
# letters reliably force individual-letter pronunciation.
#
# German table: spell every letter with its German name.
# English table: only entries XTTS gets wrong in English context — mostly
# German acronyms showing up in mixed-language text — plus units/symbols.

ACRONYMS_DE: dict[str, str] = {
    # Technology / computing
    "KI": "Ka-I",
    "IT": "I-Te",
    "PC": "Pe-Tse",
    "API": "A-Pe-I",
    "URL": "U-Er-El",
    "HTTP": "Ha-Te-Te-Pe",
    "AI": "Ei-Ei",  # English loanword in German text
    "ML": "Em-El",
    "UI": "U-I",
    "GPU": "Ge-Pe-U",
    "CPU": "Tse-Pe-U",
    # Geography / politics
    "EU": "E-U",
    "US": "U-Es",
    "USA": "U-Es-A",
    "UK": "U-Ka",
    "UN": "U-En",
    "NATO": "NATO",  # spoken as a word in German too
    "BRD": "Be-Er-De",
    "DDR": "De-De-Er",
    "SPD": "Es-Pe-De",
    "CDU": "Tse-De-U",
    "CSU": "Tse-Es-U",
    "FDP": "Ef-De-Pe",
    "AfD": "A-Ef-De",
    "ÖVP": "Ö-Fau-Pe",
    "FPÖ": "Ef-Pe-Ö",
    # Business / finance
    "AG": "A-Ge",
    "GmbH": "Ge-Em-Be-Ha",
    "CEO": "Tse-E-O",
    "CFO": "Tse-Ef-O",
    "CTO": "Tse-Te-O",
    "HR": "Ha-Er",
    "PR": "Pe-Er",
    "BIP": "Be-I-Pe",
    "EZB": "E-Tse-Be",
    "IWF": "I-Ve-Ef",
    "WTO": "Ve-Te-O",
    # Media / broadcasting
    "ARD": "A-Er-De",
    "ZDF": "Tse-De-Ef",
    "ORF": "O-Er-Ef",
    "SRF": "Es-Er-Ef",
    "WDR": "Ve-De-Er",
    "NDR": "En-De-Er",
    "MDR": "Em-De-Er",
    # Units / symbols (plain text substitution)
    "€": "Euro",
    "$": "Dollar",
    "£": "Pfund",
    "%": "Prozent",
    "°C": "Grad Celsius",
    "°F": "Grad Fahrenheit",
    "km": "Kilometer",
    "kg": "Kilogramm",
    # Common German abbreviations
    "bzw.": "beziehungsweise",
    "ca.": "circa",
    "usw.": "und so weiter",
    "z.B.": "zum Beispiel",
    "d.h.": "das heißt",
    "u.a.": "unter anderem",
    "etc.": "etcetera",
    "Nr.": "Nummer",
    "vs.": "versus",
    "Dr.": "Doktor",
    "Prof.": "Professor",
    "Hrsg.": "Herausgeber",
    "Jh.": "Jahrhundert",
    "Mrd.": "Milliarden",
    "Mio.": "Millionen",
}

ACRONYMS_EN: dict[str, str] = {
    # German acronyms that appear in English/mixed text
    "KI": "Kay Eye",
    "EU": "E-U",
    "BRD": "B-R-D",
    "DDR": "D-D-R",
    "GmbH": "G-m-b-H",
    "EZB": "E-Z-B",
    "ARD": "A-R-D",
    "ZDF": "Z-D-F",
    "ORF": "O-R-F",
    "SRF": "S-R-F",
    "WDR": "W-D-R",
    "NDR": "N-D-R",
    "MDR": "M-D-R",
    # Units / symbols
    "€": "euros",
    "$": "dollars",
    "£": "pounds",
    "%": "percent",
    "°C": "degrees Celsius",
    "°F": "degrees Fahrenheit",
    "km": "kilometers",
    "kg": "kilograms",
    # Abbreviations
    "vs.": "versus",
    "etc.": "et cetera",
    "Dr.": "Doctor",
    "Prof.": "Professor",
    "Nr.": "Number",
    "Mrd.": "billion",
    "Mio.": "million",
}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _build_acronym_pattern(table: dict[str, str]) -> re.Pattern:
|
|
|
|
|
+ """
|
|
|
|
|
+ Compile a single regex matching all keys as whole tokens.
|
|
|
|
|
+ Longer keys take priority (sorted descending by length).
|
|
|
|
|
+ Pure-symbol keys (€, $, °C) are matched without word boundaries.
|
|
|
|
|
+ """
|
|
|
|
|
+ word_keys = sorted([k for k in table if re.match(r'\w', k)], key=len, reverse=True)
|
|
|
|
|
+ special_keys = sorted([k for k in table if not re.match(r'\w', k)], key=len, reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ parts = [r'\b' + re.escape(k) + r'\b' for k in word_keys]
|
|
|
|
|
+ parts += [re.escape(k) for k in special_keys]
|
|
|
|
|
+
|
|
|
|
|
+ return re.compile('|'.join(parts)) if parts else re.compile(r'(?!)')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Patterns are compiled once at import time and reused for every request.
_PATTERN_DE = _build_acronym_pattern(ACRONYMS_DE)
_PATTERN_EN = _build_acronym_pattern(ACRONYMS_EN)


def expand_acronyms(text: str, lang: str) -> str:
    """Replace acronyms/symbols with phonetic expansions for the given language."""
    # Any "de*" language code selects the German table; everything else
    # falls back to the English one.
    german = lang.startswith("de")
    table = ACRONYMS_DE if german else ACRONYMS_EN
    pattern = _PATTERN_DE if german else _PATTERN_EN
    return pattern.sub(lambda match: table[match.group(0)], text)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ─── Markdown → natural speech ────────────────────────────────────────────────
|
|
|
|
|
+#
|
|
|
|
|
+# XTTS has no SSML support, but punctuation shapes prosody directly:
|
|
|
|
|
+# Period → short stop / breath
|
|
|
|
|
+# Ellipsis "..." → longer, contemplative pause
|
|
|
|
|
+# Comma → brief breath
|
|
|
|
|
+#
|
|
|
|
|
+# Mapping:
|
|
|
|
|
+# H1 → "..." before + text + "." + "..." after (longest pause)
|
|
|
|
|
+# H2 / H3 → "." before + text + "." (medium pause)
|
|
|
|
|
+# H4–H6 → text + "." (small pause)
|
|
|
|
|
+# **bold** → ", " + text + "," (emphasis breath)
|
|
|
|
|
+# *italic* → ", " + text + ","
|
|
|
|
|
+# Bullets → ", " + text + "." (list breath)
|
|
|
|
|
+# Blank line → "." (paragraph stop)
|
|
|
|
|
+# Code block → plain text, fences stripped
|
|
|
|
|
+# Link → label text only
|
|
|
|
|
+# HR --- → "..." (section break)
|
|
|
|
|
+
|
|
|
|
|
+_RE_HR = re.compile(r'^\s*[-*_]{3,}\s*$', re.MULTILINE)
|
|
|
|
|
+_RE_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
|
|
|
|
|
+_RE_INLINE_CODE = re.compile(r'`[^`]+`')
|
|
|
|
|
+_RE_H1 = re.compile(r'^#\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_H2 = re.compile(r'^#{2,3}\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_H_DEEP = re.compile(r'^#{4,6}\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_BOLD_ITALIC = re.compile(r'\*{3}(.+?)\*{3}|_{3}(.+?)_{3}')
|
|
|
|
|
+_RE_BOLD = re.compile(r'\*{2}(.+?)\*{2}|_{2}(.+?)_{2}')
|
|
|
|
|
+_RE_ITALIC = re.compile(r'\*(.+?)\*|_(.+?)_')
|
|
|
|
|
+_RE_LINK = re.compile(r'\[([^\]]+)\]\([^)]*\)')
|
|
|
|
|
+_RE_BULLET = re.compile(r'^\s*[-*+]\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_NUMBERED = re.compile(r'^\s*\d+\.\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_BLOCKQUOTE = re.compile(r'^\s*>\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_MULTI_SPACE = re.compile(r' +')
|
|
|
|
|
+_RE_MULTI_DOTS = re.compile(r'\.{4,}')
|
|
|
|
|
+_RE_CONTROL = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def markdown_to_speech_text(text: str) -> str:
    """
    Convert markdown to plain text shaped for natural TTS prosody.

    Uses only punctuation cues — no spoken labels.  Substitution order
    matters: fences are stripped before inline markup, headings before
    paragraph collapsing, and punctuation clean-up runs last so artifacts
    from earlier steps (",,", ".,", ",.") are normalised away.

    Relies on the module-level _RE_* patterns defined above.
    """
    # 1. Normalise line endings + strip control chars
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = _RE_CONTROL.sub('', text)

    # 2. Code blocks → plain text (strip fences, keep content).
    #    split/rsplit drops the opening ```lang line and the closing ``` line.
    #    NOTE(review): a one-line fence (no interior newline) keeps its
    #    backticks — presumably rare enough not to matter; confirm.
    text = _RE_CODE_BLOCK.sub(
        lambda m: m.group(0).split('\n', 1)[-1].rsplit('\n', 1)[0], text
    )
    text = _RE_INLINE_CODE.sub(lambda m: m.group(0).strip('`'), text)

    # 3. Horizontal rules → long section-break pause
    text = _RE_HR.sub('\n...\n', text)

    # 4. Headings — longest pause for H1, medium for H2/H3, small for H4+
    text = _RE_H1.sub(r'\n...\n\1.\n...\n', text)
    text = _RE_H2.sub(r'\n.\n\1.\n', text)
    text = _RE_H_DEEP.sub(r'\n\1.\n', text)

    # 5. Blockquotes → comma-padded inline
    text = _RE_BLOCKQUOTE.sub(r', \1,', text)

    # 6. Inline emphasis — extract text, add comma-pauses.
    #    Each pattern has two alternatives (*…* vs _…_), so exactly one of
    #    group(1)/group(2) is non-None; order (***, **, *) prevents the
    #    single-char pattern from eating multi-char markers.
    text = _RE_BOLD_ITALIC.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)
    text = _RE_BOLD.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)
    text = _RE_ITALIC.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)

    # 7. Links → label text only
    text = _RE_LINK.sub(r'\1', text)

    # 8. List items → comma breath before, period after
    text = _RE_BULLET.sub(r', \1.', text)
    text = _RE_NUMBERED.sub(r', \1.', text)

    # 9. Paragraph breaks → full stop + implicit pause
    text = re.sub(r'\n{2,}', '.\n', text)

    # 10. Remaining single newlines → space
    text = text.replace('\n', ' ')

    # 11. Clean up punctuation artifacts left by the above substitutions
    text = re.sub(r',\s*,', ',', text)          # double commas
    text = re.sub(r'\.\s*\.(?!\.)', '.', text)  # double periods (not ellipsis)
    text = _RE_MULTI_DOTS.sub('...', text)      # normalise over-long ellipses
    text = re.sub(r'\s*\.\s*,', '.', text)      # ., → .
    text = re.sub(r',\s*\.', '.', text)         # ,. → .
    text = re.sub(r'\.\s*\.\.\.', '...', text)  # .... → ...
    text = _RE_MULTI_SPACE.sub(' ', text)

    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# ─── Text chunking ────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
"""
|
|
"""
|
|
|
- Split on sentence boundaries. Falls back to word-boundary splitting
|
|
|
|
|
- for sentences that are still too long (e.g. no punctuation at all).
|
|
|
|
|
|
|
+ Split on sentence boundaries; falls back to word-boundary splits for
|
|
|
|
|
+ sentences that exceed max_len (e.g. no punctuation, very long clauses).
|
|
|
"""
|
|
"""
|
|
|
- text = clean_text(text)
|
|
|
|
|
- # Split on sentence-ending punctuation followed by whitespace or end
|
|
|
|
|
sentences = re.split(r'(?<=[.!?…])\s+', text)
|
|
sentences = re.split(r'(?<=[.!?…])\s+', text)
|
|
|
-
|
|
|
|
|
chunks: list[str] = []
|
|
chunks: list[str] = []
|
|
|
current = ""
|
|
current = ""
|
|
|
|
|
|
|
@@ -88,7 +312,6 @@ def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
if not s:
|
|
if not s:
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- # Single sentence longer than max_len → split on word boundary
|
|
|
|
|
if len(s) > max_len:
|
|
if len(s) > max_len:
|
|
|
if current:
|
|
if current:
|
|
|
chunks.append(current)
|
|
chunks.append(current)
|
|
@@ -116,7 +339,14 @@ def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
if current:
|
|
if current:
|
|
|
chunks.append(current)
|
|
chunks.append(current)
|
|
|
|
|
|
|
|
- return [c for c in chunks if c]
|
|
|
|
|
|
|
+ return [c for c in chunks if c.strip()]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def prepare_text(text: str, lang: str) -> list[str]:
    """Full pipeline: markdown → prosody text → acronym expansion → chunks."""
    prosody = markdown_to_speech_text(text)
    spoken = expand_acronyms(prosody, lang)
    return chunk_text(spoken)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ─── Voice / embedding helpers ────────────────────────────────────────────────
|
|
# ─── Voice / embedding helpers ────────────────────────────────────────────────
|
|
@@ -155,8 +385,8 @@ def get_embedding(voice_name: str):
|
|
|
if voice_name in embedding_cache:
|
|
if voice_name in embedding_cache:
|
|
|
return embedding_cache[voice_name]
|
|
return embedding_cache[voice_name]
|
|
|
|
|
|
|
|
- wav_file = ensure_wav(voice_name)
|
|
|
|
|
- file_hash = sha256_file(wav_file)
|
|
|
|
|
|
|
+ wav_file = ensure_wav(voice_name)
|
|
|
|
|
+ file_hash = sha256_file(wav_file)
|
|
|
cache_file = CACHE_DIR / f"{voice_name}.pkl"
|
|
cache_file = CACHE_DIR / f"{voice_name}.pkl"
|
|
|
|
|
|
|
|
if cache_file.exists():
|
|
if cache_file.exists():
|
|
@@ -182,7 +412,7 @@ def get_embedding(voice_name: str):
|
|
|
return data
|
|
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
-# ─── Core inference helper ────────────────────────────────────────────────────
|
|
|
|
|
|
|
+# ─── Core inference ───────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
def _vram_low() -> bool:
|
|
def _vram_low() -> bool:
|
|
|
if not torch.cuda.is_available():
|
|
if not torch.cuda.is_available():
|
|
@@ -191,8 +421,10 @@ def _vram_low() -> bool:
|
|
|
return (free / total) < VRAM_HEADROOM
|
|
return (free / total) < VRAM_HEADROOM
|
|
|
|
|
|
|
|
|
|
|
|
|
-def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> np.ndarray:
|
|
|
|
|
- """Run inference for one chunk; falls back to CPU on OOM."""
|
|
|
|
|
|
|
+def _infer_chunk(
|
|
|
|
|
+ chunk: str, lang: str, gpt_cond_latent, speaker_embedding
|
|
|
|
|
+) -> np.ndarray:
|
|
|
|
|
+ """Synthesise one text chunk; auto-falls back to CPU on CUDA OOM."""
|
|
|
model = tts.synthesizer.tts_model
|
|
model = tts.synthesizer.tts_model
|
|
|
|
|
|
|
|
def _run(m, lat, emb):
|
|
def _run(m, lat, emb):
|
|
@@ -207,9 +439,14 @@ def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> n
|
|
|
|
|
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
try:
|
|
try:
|
|
|
- return _run(model, gpt_cond_latent, speaker_embedding)
|
|
|
|
|
|
|
+ result = _run(model, gpt_cond_latent, speaker_embedding)
|
|
|
|
|
+ # Release XTTS activation memory after every chunk so it doesn't
|
|
|
|
|
+ # accumulate across a long document and starve the next request.
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
+ return result
|
|
|
except torch.cuda.OutOfMemoryError:
|
|
except torch.cuda.OutOfMemoryError:
|
|
|
- print(f"⚠ CUDA OOM on chunk – falling back to CPU ({os.cpu_count()} cores)")
|
|
|
|
|
|
|
+ print(f"⚠ CUDA OOM – falling back to CPU ({os.cpu_count()} cores)")
|
|
|
torch.cuda.empty_cache()
|
|
torch.cuda.empty_cache()
|
|
|
model.to("cpu")
|
|
model.to("cpu")
|
|
|
try:
|
|
try:
|
|
@@ -219,7 +456,6 @@ def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> n
|
|
|
speaker_embedding.to("cpu"),
|
|
speaker_embedding.to("cpu"),
|
|
|
)
|
|
)
|
|
|
finally:
|
|
finally:
|
|
|
- # Always move back, even if CPU inference also fails
|
|
|
|
|
model.to("cuda")
|
|
model.to("cuda")
|
|
|
torch.cuda.empty_cache()
|
|
torch.cuda.empty_cache()
|
|
|
return result
|
|
return result
|
|
@@ -232,10 +468,21 @@ def root():
|
|
|
return {"status": "XTTS server running", "device": _device}
|
|
return {"status": "XTTS server running", "device": _device}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/health")
def health():
    """Health endpoint: reports serving device and, on CUDA, VRAM usage."""
    payload = {"status": "ok", "device": _device}
    if torch.cuda.is_available():
        free, total = torch.cuda.mem_get_info()
        payload["vram_free_mb"] = round(free / 1024 ** 2)
        payload["vram_total_mb"] = round(total / 1024 ** 2)
        payload["vram_used_pct"] = round((1 - free / total) * 100, 1)
    return payload
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
@app.get("/voices")
|
|
@app.get("/voices")
|
|
|
def list_voices():
|
|
def list_voices():
|
|
|
- seen = set()
|
|
|
|
|
- voices = []
|
|
|
|
|
|
|
+ seen: set = set()
|
|
|
|
|
+ voices: list = []
|
|
|
for f in VOICE_DIR.iterdir():
|
|
for f in VOICE_DIR.iterdir():
|
|
|
if f.suffix in {".wav", ".mp3"} and f.stem not in seen:
|
|
if f.suffix in {".wav", ".mp3"} and f.stem not in seen:
|
|
|
voices.append(f.stem)
|
|
voices.append(f.stem)
|
|
@@ -251,31 +498,34 @@ def synthesize(text: str, voice: str = "default", lang: str = "en"):
|
|
|
|
|
|
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
|
|
|
|
|
|
- # If VRAM is already scarce, pin embeddings on CPU for this whole request
|
|
|
|
|
|
|
+ # Pin everything to CPU for this request if VRAM is already low
|
|
|
use_cpu = _vram_low()
|
|
use_cpu = _vram_low()
|
|
|
if use_cpu and torch.cuda.is_available():
|
|
if use_cpu and torch.cuda.is_available():
|
|
|
print("⚠ Low VRAM – pinning entire request to CPU")
|
|
print("⚠ Low VRAM – pinning entire request to CPU")
|
|
|
- gpt_cond_latent = gpt_cond_latent.to("cpu")
|
|
|
|
|
|
|
+ gpt_cond_latent = gpt_cond_latent.to("cpu")
|
|
|
speaker_embedding = speaker_embedding.to("cpu")
|
|
speaker_embedding = speaker_embedding.to("cpu")
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
tts.synthesizer.tts_model.to("cpu")
|
|
tts.synthesizer.tts_model.to("cpu")
|
|
|
|
|
|
|
|
- chunks = chunk_text(text)
|
|
|
|
|
|
|
+ chunks = prepare_text(text, lang)
|
|
|
wav_all = []
|
|
wav_all = []
|
|
|
|
|
|
|
|
for i, chunk in enumerate(chunks):
|
|
for i, chunk in enumerate(chunks):
|
|
|
- print(f" chunk {i+1}/{len(chunks)}: {chunk[:60]!r}")
|
|
|
|
|
|
|
+ print(f" chunk {i+1}/{len(chunks)}: {chunk[:80]!r}")
|
|
|
try:
|
|
try:
|
|
|
wav_chunk = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
wav_chunk = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
raise HTTPException(500, f"Inference failed on chunk {i+1}: {e}")
|
|
raise HTTPException(500, f"Inference failed on chunk {i+1}: {e}")
|
|
|
wav_all.append(wav_chunk)
|
|
wav_all.append(wav_chunk)
|
|
|
|
|
|
|
|
- # Restore model to GPU if we moved it
|
|
|
|
|
if use_cpu and torch.cuda.is_available():
|
|
if use_cpu and torch.cuda.is_available():
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
tts.synthesizer.tts_model.to("cuda")
|
|
tts.synthesizer.tts_model.to("cuda")
|
|
|
|
|
|
|
|
|
|
+ # Final sweep — catches anything the per-chunk clears missed
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
+
|
|
|
wav = np.concatenate(wav_all, axis=0)
|
|
wav = np.concatenate(wav_all, axis=0)
|
|
|
buf = io.BytesIO()
|
|
buf = io.BytesIO()
|
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
@@ -286,16 +536,16 @@ def synthesize(text: str, voice: str = "default", lang: str = "en"):
|
|
|
@app.get("/tts_stream")
|
|
@app.get("/tts_stream")
|
|
|
@app.get("/api/tts_stream")
|
|
@app.get("/api/tts_stream")
|
|
|
def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
|
- """Stream WAV chunks as they are synthesised — lower latency for long texts."""
|
|
|
|
|
|
|
+ """Stream WAV chunks as synthesised — lower latency for long texts."""
|
|
|
if not text.strip():
|
|
if not text.strip():
|
|
|
raise HTTPException(400, "text parameter is empty")
|
|
raise HTTPException(400, "text parameter is empty")
|
|
|
|
|
|
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
|
- chunks = chunk_text(text)
|
|
|
|
|
|
|
+ chunks = prepare_text(text, lang)
|
|
|
|
|
|
|
|
def audio_generator():
|
|
def audio_generator():
|
|
|
for i, chunk in enumerate(chunks):
|
|
for i, chunk in enumerate(chunks):
|
|
|
- print(f" [stream] chunk {i+1}/{len(chunks)}: {chunk[:60]!r}")
|
|
|
|
|
|
|
+ print(f" [stream] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}")
|
|
|
try:
|
|
try:
|
|
|
wav = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
wav = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -305,5 +555,9 @@ def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
|
buf.seek(0)
|
|
buf.seek(0)
|
|
|
yield buf.read()
|
|
yield buf.read()
|
|
|
|
|
+ # Clear after each streamed chunk — long documents would otherwise
|
|
|
|
|
+ # accumulate VRAM and cause the next request to fall back to CPU.
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
- return StreamingResponse(audio_generator(), media_type="audio/wav")
|
|
|
|
|
|
|
+ return StreamingResponse(audio_generator(), media_type="audio/wav")
|