|
|
@@ -1,13 +1,330 @@
|
|
|
-from fastapi import FastAPI, Query
|
|
|
-from fastapi.responses import FileResponse
|
|
|
+import os
|
|
|
+import io
|
|
|
+import hashlib
|
|
|
+import pickle
|
|
|
+import subprocess
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+import torch
|
|
|
+
|
|
|
+# FIX for PyTorch >=2.6 security change
|
|
|
+from torch.serialization import add_safe_globals
|
|
|
+from TTS.tts.configs.xtts_config import XttsConfig
|
|
|
+
|
|
|
+import TTS.tts.configs.xtts_config
|
|
|
+import TTS.tts.models.xtts
|
|
|
+
|
|
|
# PyTorch >= 2.6 switched torch.load to weights_only=True by default; the XTTS
# checkpoint pickles these config classes, so they must be allow-listed before
# the model is loaded below.
add_safe_globals([
    TTS.tts.configs.xtts_config.XttsConfig,
    TTS.tts.models.xtts.XttsAudioConfig
])
|
|
|
+
|
|
|
+from fastapi import FastAPI, HTTPException
|
|
|
+from fastapi.responses import StreamingResponse
|
|
|
from TTS.api import TTS
|
|
|
-import tempfile
|
|
|
+
|
|
|
# Reference voice clips, one per voice: <name>.wav and/or <name>.mp3.
VOICE_DIR = Path("/voices")
# Persisted speaker-embedding pickles, keyed by voice name.
CACHE_DIR = Path("/cache")

# Created at import time so first requests don't race on directory creation.
# NOTE(review): parents are not created — assumes / is writable or the dirs
# are container mounts; confirm deployment.
VOICE_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

# Model is loaded once at import time (slow); GPU when available.
print("Loading XTTS model...")
tts = TTS(MODEL_NAME).to("cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded.")

app = FastAPI()

# In-process cache: voice name -> (gpt_cond_latent, speaker_embedding).
embedding_cache = {}
|
|
|
+
|
|
|
+
|
|
|
def sha256(path):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is streamed in 8 KiB chunks so arbitrarily large voice
    clips can be hashed without loading them into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        while block := handle.read(8192):
            digest.update(block)
    return digest.hexdigest()
|
|
|
+
|
|
|
+
|
|
|
def ensure_wav(voice_name):
    """Return the path to the reference WAV for *voice_name*.

    Converts <voice>.mp3 to <voice>.wav when no WAV exists, and
    re-converts when the MP3 has been modified after the WAV.
    Raises HTTPException(404) when neither file is present.
    """
    wav_path = VOICE_DIR / f"{voice_name}.wav"
    mp3_path = VOICE_DIR / f"{voice_name}.mp3"

    if wav_path.exists():
        # A newer MP3 means the user replaced the source clip: refresh the WAV.
        if mp3_path.exists() and mp3_path.stat().st_mtime > wav_path.stat().st_mtime:
            print(f"MP3 newer than WAV → reconverting {voice_name}")
            convert_to_wav(mp3_path, wav_path)
        return wav_path

    if mp3_path.exists():
        print(f"Converting MP3 → WAV for {voice_name}")
        convert_to_wav(mp3_path, wav_path)
        return wav_path

    raise HTTPException(404, f"Voice '{voice_name}' not found")
|
|
|
+
|
|
|
+
|
|
|
def convert_to_wav(src, dst):
    """Transcode *src* into a 22.05 kHz mono WAV at *dst* via ffmpeg.

    Runs ffmpeg silently (-y overwrites any existing output) and raises
    CalledProcessError if the conversion fails.
    """
    command = [
        "ffmpeg", "-y",
        "-i", str(src),
        "-ar", "22050",   # resample to 22.05 kHz
        "-ac", "1",       # downmix to mono
        str(dst),
    ]
    subprocess.run(
        command,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
|
|
+
|
|
|
def load_cached_embedding(cache_file):
    """Deserialize the cached embedding record stored at *cache_file*.

    NOTE(review): pickle is only safe because /cache is written exclusively
    by this service; never point this at untrusted files.
    """
    return pickle.loads(Path(cache_file).read_bytes())
|
|
|
+
|
|
|
+
|
|
|
def save_cached_embedding(cache_file, data):
    """Serialize *data* (a {"hash", "data"} record) to *cache_file*."""
    Path(cache_file).write_bytes(pickle.dumps(data))
|
|
|
+
|
|
|
def get_embedding(voice_name):
    """Return (gpt_cond_latent, speaker_embedding) for *voice_name*.

    Lookup order:
      1. in-process dict (``embedding_cache``)
      2. on-disk pickle in CACHE_DIR, accepted only when its stored SHA-256
         matches the current reference WAV
      3. freshly computed from the reference WAV, then persisted

    Raises HTTPException(404) (from ensure_wav) when no voice file exists.
    """
    if voice_name in embedding_cache:
        return embedding_cache[voice_name]

    # ensure_wav already performs the wav/mp3 lookup (converting if needed)
    # and raises the 404 itself, so the previous duplicate existence scan
    # and the commented-out conversion line have been removed.
    wav_file = ensure_wav(voice_name)

    file_hash = sha256(wav_file)
    cache_file = CACHE_DIR / f"{voice_name}.pkl"

    if cache_file.exists():
        cached = load_cached_embedding(cache_file)
        # Trust the on-disk cache only while the source WAV is unchanged.
        if cached["hash"] == file_hash:
            print(f"Using cached embedding for {voice_name}")
            embedding_cache[voice_name] = cached["data"]
            return cached["data"]

    print(f"Computing embedding for {voice_name}")

    model = tts.synthesizer.tts_model
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=str(wav_file)
    )

    data = (gpt_cond_latent, speaker_embedding)

    save_cached_embedding(
        cache_file,
        {"hash": file_hash, "data": data},
    )

    embedding_cache[voice_name] = data

    return data
|
|
|
+
|
|
|
@app.get("/")
def root():
    """Health-check endpoint: confirms the server (and model) are up."""
    payload = {"status": "XTTS server running"}
    return payload
|
|
|
+
|
|
|
@app.get("/voices")
def list_voices():
    """Return the name (file stem) of every .wav/.mp3 clip in VOICE_DIR."""
    names = [
        entry.stem
        for entry in VOICE_DIR.iterdir()
        if entry.suffix in (".wav", ".mp3")
    ]
    return {"voices": names}
|
|
|
+
|
|
|
@app.get("/tts")
@app.get("/api/tts")
def synthesize(
    text: str,
    voice: str = "default",
    lang: str = "en",
):
    """Synthesize *text* in the cloned voice *voice*, returning one WAV.

    The input is split into sentence-aligned chunks of ~150 characters,
    each chunk is run through XTTS inference, and the resulting audio is
    concatenated before encoding. A per-chunk CUDA-OOM fallback reruns
    the failing chunk on CPU, then moves the model back to the GPU.
    """
    # torch and io are already imported at module level; the previous
    # redundant function-local imports of them have been removed.
    import numpy as np
    import soundfile as sf
    import re

    def chunk_text(text, max_len=150):
        # Split on sentence boundaries, then greedily pack sentences into
        # chunks of roughly max_len characters (a single over-long sentence
        # still becomes its own chunk).
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current = ""

        for s in sentences:
            if len(current) + len(s) > max_len:
                if current:
                    chunks.append(current.strip())
                current = s
            else:
                current += " " + s

        if current:
            chunks.append(current.strip())

        return chunks

    gpt_cond_latent, speaker_embedding = get_embedding(voice)

    text_chunks = chunk_text(text, max_len=150)

    wav_all = []

    for chunk in text_chunks:

        try:
            out = tts.synthesizer.tts_model.inference(
                chunk,
                lang,
                gpt_cond_latent,
                speaker_embedding,
            )

        except torch.cuda.OutOfMemoryError:
            # Retry this one chunk on CPU, then restore the model to CUDA.
            # NOTE(review): moving the model back right after an OOM may
            # OOM again on the next chunk — confirm this is acceptable.
            print("⚠ CUDA OOM – retrying chunk on CPU")

            torch.cuda.empty_cache()

            cpu_model = tts.synthesizer.tts_model.to("cpu")

            out = cpu_model.inference(
                chunk,
                lang,
                gpt_cond_latent.to("cpu"),
                speaker_embedding.to("cpu"),
            )

            tts.synthesizer.tts_model.to("cuda")

        wav_chunk = out["wav"]

        # assumes out["wav"] is a 1-D array of samples — TODO confirm for
        # the installed TTS version.
        if len(wav_chunk.shape) == 1:
            wav_chunk = np.expand_dims(wav_chunk, 1)

        wav_all.append(wav_chunk)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    wav = np.concatenate(wav_all, axis=0)

    buf = io.BytesIO()

    # 24000 Hz is XTTS v2's output sample rate.
    sf.write(buf, wav, 24000, format="WAV")

    buf.seek(0)

    return StreamingResponse(buf, media_type="audio/wav")
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
@app.get("/tts_stream")
@app.get("/api/tts_stream")
def synthesize_stream(
    text: str,
    voice: str = "default",
    lang: str = "en",
):
    """Stream synthesized audio for *text*, one WAV per text chunk.

    Same chunking and CUDA-OOM fallback as /tts, but each chunk's audio
    is yielded as soon as it is ready instead of being concatenated.

    NOTE(review): every yielded chunk is a complete WAV file with its own
    header; many players stop after the first header, so the concatenated
    stream may not decode as one file — verify against intended clients.
    """
    # torch and io are already imported at module level; the previous
    # redundant function-local imports of them have been removed.
    import numpy as np
    import soundfile as sf
    import re

    def chunk_text(text, max_len=150):
        # Split on sentence boundaries, then greedily pack sentences into
        # chunks of roughly max_len characters.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        chunks = []
        current = ""

        for s in sentences:
            if len(current) + len(s) > max_len:
                if current:
                    chunks.append(current.strip())
                current = s
            else:
                current += " " + s

        if current:
            chunks.append(current.strip())

        return chunks

    gpt_cond_latent, speaker_embedding = get_embedding(voice)

    text_chunks = chunk_text(text)

    def audio_generator():

        for chunk in text_chunks:

            try:
                out = tts.synthesizer.tts_model.inference(
                    chunk,
                    lang,
                    gpt_cond_latent,
                    speaker_embedding,
                )

            except torch.cuda.OutOfMemoryError:
                # Retry this one chunk on CPU, then restore the model to CUDA.
                print("CUDA OOM – retrying on CPU")

                torch.cuda.empty_cache()

                cpu_model = tts.synthesizer.tts_model.to("cpu")

                out = cpu_model.inference(
                    chunk,
                    lang,
                    gpt_cond_latent.to("cpu"),
                    speaker_embedding.to("cpu"),
                )

                tts.synthesizer.tts_model.to("cuda")

            wav = out["wav"]

            buf = io.BytesIO()

            # 24000 Hz is XTTS v2's output sample rate.
            sf.write(buf, wav, 24000, format="WAV")

            buf.seek(0)

            yield buf.read()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return StreamingResponse(audio_generator(), media_type="audio/wav")
|