|
@@ -30,12 +30,12 @@ from TTS.api import TTS
|
|
|
|
|
|
|
|
# ─── Paths & constants ────────────────────────────────────────────────────────

# Reference speaker WAVs live here; computed embeddings are cached on disk.
VOICE_DIR = Path("/voices")
CACHE_DIR = Path("/cache")

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
SAMPLE_RATE = 24000

# Below this fraction of free VRAM the server pins work to the CPU instead.
VRAM_HEADROOM = 0.20  # fall back to CPU when VRAM < 20% free
# Per-chunk character budget; XTTS hard-limit ~400 tokens ≈ 250 chars.
MAX_CHUNK_LEN = 200

VOICE_DIR.mkdir(exist_ok=True)
CACHE_DIR.mkdir(exist_ok=True)
|
|
@@ -47,39 +47,263 @@ _device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
# Load XTTS once at startup and move it to the chosen device.
tts = TTS(MODEL_NAME).to(_device)
print(f"Model loaded on {_device}.")

# Serialise all model access so concurrent requests don't race on .to() calls
_model_lock = threading.Lock()

app = FastAPI()

# In-memory cache: voice name → speaker-embedding data (filled on first use).
embedding_cache: dict = {}
|
|
|
|
|
|
|
|
-# ─── Text helpers ─────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
# ─── Acronym / symbol tables ──────────────────────────────────────────────────
#
# Keys are matched as whole tokens by the compiled pattern below; values are
# phonetic spellings that XTTS reads out letter by letter.  Hyphens between
# letters reliably force individual-letter pronunciation.
#
# German table: spell every letter with its German name.
# English table: only entries XTTS gets wrong in English context — mostly
# German acronyms showing up in mixed-language text — plus units/symbols.

ACRONYMS_DE: dict[str, str] = {
    # Technology / computing
    "KI": "Ka-I",
    "IT": "I-Te",
    "PC": "Pe-Tse",
    "API": "A-Pe-I",
    "URL": "U-Er-El",
    "HTTP": "Ha-Te-Te-Pe",
    "AI": "Ei-Ei",  # English loanword in German text
    "ML": "Em-El",
    "UI": "U-I",
    "GPU": "Ge-Pe-U",
    "CPU": "Tse-Pe-U",
    # Geography / politics
    "EU": "E-U",
    "US": "U-Es",
    "USA": "U-Es-A",
    "UK": "U-Ka",
    "UN": "U-En",
    "NATO": "NATO",  # spoken as a word in German too
    "BRD": "Be-Er-De",
    "DDR": "De-De-Er",
    "SPD": "Es-Pe-De",
    "CDU": "Tse-De-U",
    "CSU": "Tse-Es-U",
    "FDP": "Ef-De-Pe",
    "AfD": "A-Ef-De",
    "ÖVP": "Ö-Fau-Pe",
    "FPÖ": "Ef-Pe-Ö",
    # Business / finance
    "AG": "A-Ge",
    "GmbH": "Ge-Em-Be-Ha",
    "CEO": "Tse-E-O",
    "CFO": "Tse-Ef-O",
    "CTO": "Tse-Te-O",
    "HR": "Ha-Er",
    "PR": "Pe-Er",
    "BIP": "Be-I-Pe",
    "EZB": "E-Tse-Be",
    "IWF": "I-Ve-Ef",
    "WTO": "Ve-Te-O",
    # Media / broadcasting
    "ARD": "A-Er-De",
    "ZDF": "Tse-De-Ef",
    "ORF": "O-Er-Ef",
    "SRF": "Es-Er-Ef",
    "WDR": "Ve-De-Er",
    "NDR": "En-De-Er",
    "MDR": "Em-De-Er",
    # Units / symbols (plain text substitution)
    "€": "Euro",
    "$": "Dollar",
    "£": "Pfund",
    "%": "Prozent",
    "°C": "Grad Celsius",
    "°F": "Grad Fahrenheit",
    "km": "Kilometer",
    "kg": "Kilogramm",
    # Common German abbreviations
    "bzw.": "beziehungsweise",
    "ca.": "circa",
    "usw.": "und so weiter",
    "z.B.": "zum Beispiel",
    "d.h.": "das heißt",
    "u.a.": "unter anderem",
    "etc.": "etcetera",
    "Nr.": "Nummer",
    "vs.": "versus",
    "Dr.": "Doktor",
    "Prof.": "Professor",
    "Hrsg.": "Herausgeber",
    "Jh.": "Jahrhundert",
    "Mrd.": "Milliarden",
    "Mio.": "Millionen",
}

ACRONYMS_EN: dict[str, str] = {
    # German acronyms that appear in English/mixed text
    "KI": "Kay Eye",
    "EU": "E-U",
    "BRD": "B-R-D",
    "DDR": "D-D-R",
    "GmbH": "G-m-b-H",
    "EZB": "E-Z-B",
    "ARD": "A-R-D",
    "ZDF": "Z-D-F",
    "ORF": "O-R-F",
    "SRF": "S-R-F",
    "WDR": "W-D-R",
    "NDR": "N-D-R",
    "MDR": "M-D-R",
    # Units / symbols
    "€": "euros",
    "$": "dollars",
    "£": "pounds",
    "%": "percent",
    "°C": "degrees Celsius",
    "°F": "degrees Fahrenheit",
    "km": "kilometers",
    "kg": "kilograms",
    # Abbreviations
    "vs.": "versus",
    "etc.": "et cetera",
    "Dr.": "Doctor",
    "Prof.": "Professor",
    "Nr.": "Number",
    "Mrd.": "billion",
    "Mio.": "million",
}
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _build_acronym_pattern(table: dict[str, str]) -> re.Pattern:
|
|
|
|
|
+ """
|
|
|
|
|
+ Compile a single regex matching all keys as whole tokens.
|
|
|
|
|
+ Longer keys take priority (sorted descending by length).
|
|
|
|
|
+ Pure-symbol keys (€, $, °C) are matched without word boundaries.
|
|
|
|
|
+ """
|
|
|
|
|
+ word_keys = sorted([k for k in table if re.match(r'\w', k)], key=len, reverse=True)
|
|
|
|
|
+ special_keys = sorted([k for k in table if not re.match(r'\w', k)], key=len, reverse=True)
|
|
|
|
|
+
|
|
|
|
|
+ parts = [r'\b' + re.escape(k) + r'\b' for k in word_keys]
|
|
|
|
|
+ parts += [re.escape(k) for k in special_keys]
|
|
|
|
|
+
|
|
|
|
|
+ return re.compile('|'.join(parts)) if parts else re.compile(r'(?!)')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Patterns are compiled once at import time and reused for every request.
_PATTERN_DE = _build_acronym_pattern(ACRONYMS_DE)
_PATTERN_EN = _build_acronym_pattern(ACRONYMS_EN)


def expand_acronyms(text: str, lang: str) -> str:
    """Replace acronyms/symbols with phonetic expansions for the given language."""
    # Any "de*" language code selects the German table; everything else
    # falls back to the English one.
    german = lang.startswith("de")
    table = ACRONYMS_DE if german else ACRONYMS_EN
    pattern = _PATTERN_DE if german else _PATTERN_EN
    return pattern.sub(lambda match: table[match.group(0)], text)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+# ─── Markdown → natural speech ────────────────────────────────────────────────
|
|
|
|
|
+#
|
|
|
|
|
+# XTTS has no SSML support, but punctuation shapes prosody directly:
|
|
|
|
|
+# Period → short stop / breath
|
|
|
|
|
+# Ellipsis "..." → longer, contemplative pause
|
|
|
|
|
+# Comma → brief breath
|
|
|
|
|
+#
|
|
|
|
|
+# Mapping:
|
|
|
|
|
+# H1 → "..." before + text + "." + "..." after (longest pause)
|
|
|
|
|
+# H2 / H3 → "." before + text + "." (medium pause)
|
|
|
|
|
+# H4–H6 → text + "." (small pause)
|
|
|
|
|
+# **bold** → ", " + text + "," (emphasis breath)
|
|
|
|
|
+# *italic* → ", " + text + ","
|
|
|
|
|
+# Bullets → ", " + text + "." (list breath)
|
|
|
|
|
+# Blank line → "." (paragraph stop)
|
|
|
|
|
+# Code block → plain text, fences stripped
|
|
|
|
|
+# Link → label text only
|
|
|
|
|
+# HR --- → "..." (section break)
|
|
|
|
|
+
|
|
|
|
|
+_RE_HR = re.compile(r'^\s*[-*_]{3,}\s*$', re.MULTILINE)
|
|
|
|
|
+_RE_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
|
|
|
|
|
+_RE_INLINE_CODE = re.compile(r'`[^`]+`')
|
|
|
|
|
+_RE_H1 = re.compile(r'^#\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_H2 = re.compile(r'^#{2,3}\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_H_DEEP = re.compile(r'^#{4,6}\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_BOLD_ITALIC = re.compile(r'\*{3}(.+?)\*{3}|_{3}(.+?)_{3}')
|
|
|
|
|
+_RE_BOLD = re.compile(r'\*{2}(.+?)\*{2}|_{2}(.+?)_{2}')
|
|
|
|
|
+_RE_ITALIC = re.compile(r'\*(.+?)\*|_(.+?)_')
|
|
|
|
|
+_RE_LINK = re.compile(r'\[([^\]]+)\]\([^)]*\)')
|
|
|
|
|
+_RE_BULLET = re.compile(r'^\s*[-*+]\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_NUMBERED = re.compile(r'^\s*\d+\.\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_BLOCKQUOTE = re.compile(r'^\s*>\s+(.+)$', re.MULTILINE)
|
|
|
|
|
+_RE_MULTI_SPACE = re.compile(r' +')
|
|
|
|
|
+_RE_MULTI_DOTS = re.compile(r'\.{4,}')
|
|
|
|
|
+_RE_CONTROL = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def markdown_to_speech_text(text: str) -> str:
    """
    Convert markdown to plain text shaped for natural TTS prosody.

    Uses only punctuation cues — no spoken labels.  Substitution order
    matters: fences are stripped before inline markup, headings before
    paragraph collapsing, and punctuation clean-up runs last so artifacts
    from earlier steps (",,", ".,", ",.") are normalised away.

    Relies on the module-level _RE_* patterns defined above.
    """
    # 1. Normalise line endings + strip control chars
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = _RE_CONTROL.sub('', text)

    # 2. Code blocks → plain text (strip fences, keep content).
    #    split/rsplit drops the opening ```lang line and the closing ``` line.
    #    NOTE(review): a one-line fence (no interior newline) keeps its
    #    backticks — presumably rare enough not to matter; confirm.
    text = _RE_CODE_BLOCK.sub(
        lambda m: m.group(0).split('\n', 1)[-1].rsplit('\n', 1)[0], text
    )
    text = _RE_INLINE_CODE.sub(lambda m: m.group(0).strip('`'), text)

    # 3. Horizontal rules → long section-break pause
    text = _RE_HR.sub('\n...\n', text)

    # 4. Headings — longest pause for H1, medium for H2/H3, small for H4+
    text = _RE_H1.sub(r'\n...\n\1.\n...\n', text)
    text = _RE_H2.sub(r'\n.\n\1.\n', text)
    text = _RE_H_DEEP.sub(r'\n\1.\n', text)

    # 5. Blockquotes → comma-padded inline
    text = _RE_BLOCKQUOTE.sub(r', \1,', text)

    # 6. Inline emphasis — extract text, add comma-pauses.
    #    Each pattern has two alternatives (*…* vs _…_), so exactly one of
    #    group(1)/group(2) is non-None; order (***, **, *) prevents the
    #    single-char pattern from eating multi-char markers.
    text = _RE_BOLD_ITALIC.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)
    text = _RE_BOLD.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)
    text = _RE_ITALIC.sub(lambda m: ', ' + (m.group(1) or m.group(2)) + ',', text)

    # 7. Links → label text only
    text = _RE_LINK.sub(r'\1', text)

    # 8. List items → comma breath before, period after
    text = _RE_BULLET.sub(r', \1.', text)
    text = _RE_NUMBERED.sub(r', \1.', text)

    # 9. Paragraph breaks → full stop + implicit pause
    text = re.sub(r'\n{2,}', '.\n', text)

    # 10. Remaining single newlines → space
    text = text.replace('\n', ' ')

    # 11. Clean up punctuation artifacts left by the above substitutions
    text = re.sub(r',\s*,', ',', text)          # double commas
    text = re.sub(r'\.\s*\.(?!\.)', '.', text)  # double periods (not ellipsis)
    text = _RE_MULTI_DOTS.sub('...', text)      # normalise over-long ellipses
    text = re.sub(r'\s*\.\s*,', '.', text)      # ., → .
    text = re.sub(r',\s*\.', '.', text)         # ,. → .
    text = re.sub(r'\.\s*\.\.\.', '...', text)  # .... → ...
    text = _RE_MULTI_SPACE.sub(' ', text)

    return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# ─── Text chunking ────────────────────────────────────────────────────────────
|
|
|
|
|
+
|
|
|
def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
"""
|
|
"""
|
|
|
- Split on sentence boundaries. Falls back to word-boundary splitting
|
|
|
|
|
- for sentences that are still too long (e.g. no punctuation at all).
|
|
|
|
|
|
|
+ Split on sentence boundaries; falls back to word-boundary splits for
|
|
|
|
|
+ sentences that exceed max_len (e.g. no punctuation, very long clauses).
|
|
|
"""
|
|
"""
|
|
|
- text = clean_text(text)
|
|
|
|
|
- # Split on sentence-ending punctuation followed by whitespace or end
|
|
|
|
|
sentences = re.split(r'(?<=[.!?…])\s+', text)
|
|
sentences = re.split(r'(?<=[.!?…])\s+', text)
|
|
|
-
|
|
|
|
|
chunks: list[str] = []
|
|
chunks: list[str] = []
|
|
|
current = ""
|
|
current = ""
|
|
|
|
|
|
|
@@ -88,7 +312,6 @@ def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
if not s:
|
|
if not s:
|
|
|
continue
|
|
continue
|
|
|
|
|
|
|
|
- # Single sentence longer than max_len → split on word boundary
|
|
|
|
|
if len(s) > max_len:
|
|
if len(s) > max_len:
|
|
|
if current:
|
|
if current:
|
|
|
chunks.append(current)
|
|
chunks.append(current)
|
|
@@ -116,7 +339,14 @@ def chunk_text(text: str, max_len: int = MAX_CHUNK_LEN) -> list[str]:
|
|
|
if current:
|
|
if current:
|
|
|
chunks.append(current)
|
|
chunks.append(current)
|
|
|
|
|
|
|
|
- return [c for c in chunks if c]
|
|
|
|
|
|
|
+ return [c for c in chunks if c.strip()]
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def prepare_text(text: str, lang: str) -> list[str]:
    """Full pipeline: markdown → prosody text → acronym expansion → chunks."""
    prosody = markdown_to_speech_text(text)
    spoken = expand_acronyms(prosody, lang)
    return chunk_text(spoken)
|
|
|
|
|
|
|
|
|
|
|
|
|
# ─── Voice / embedding helpers ────────────────────────────────────────────────
|
|
# ─── Voice / embedding helpers ────────────────────────────────────────────────
|
|
@@ -155,8 +385,8 @@ def get_embedding(voice_name: str):
|
|
|
if voice_name in embedding_cache:
|
|
if voice_name in embedding_cache:
|
|
|
return embedding_cache[voice_name]
|
|
return embedding_cache[voice_name]
|
|
|
|
|
|
|
|
- wav_file = ensure_wav(voice_name)
|
|
|
|
|
- file_hash = sha256_file(wav_file)
|
|
|
|
|
|
|
+ wav_file = ensure_wav(voice_name)
|
|
|
|
|
+ file_hash = sha256_file(wav_file)
|
|
|
cache_file = CACHE_DIR / f"{voice_name}.pkl"
|
|
cache_file = CACHE_DIR / f"{voice_name}.pkl"
|
|
|
|
|
|
|
|
if cache_file.exists():
|
|
if cache_file.exists():
|
|
@@ -182,7 +412,7 @@ def get_embedding(voice_name: str):
|
|
|
return data
|
|
return data
|
|
|
|
|
|
|
|
|
|
|
|
|
-# ─── Core inference helper ────────────────────────────────────────────────────
|
|
|
|
|
|
|
+# ─── Core inference ───────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
def _vram_low() -> bool:
|
|
def _vram_low() -> bool:
|
|
|
if not torch.cuda.is_available():
|
|
if not torch.cuda.is_available():
|
|
@@ -191,8 +421,10 @@ def _vram_low() -> bool:
|
|
|
return (free / total) < VRAM_HEADROOM
|
|
return (free / total) < VRAM_HEADROOM
|
|
|
|
|
|
|
|
|
|
|
|
|
-def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> np.ndarray:
|
|
|
|
|
- """Run inference for one chunk; falls back to CPU on OOM."""
|
|
|
|
|
|
|
+def _infer_chunk(
|
|
|
|
|
+ chunk: str, lang: str, gpt_cond_latent, speaker_embedding
|
|
|
|
|
+) -> np.ndarray:
|
|
|
|
|
+ """Synthesise one text chunk; auto-falls back to CPU on CUDA OOM."""
|
|
|
model = tts.synthesizer.tts_model
|
|
model = tts.synthesizer.tts_model
|
|
|
|
|
|
|
|
def _run(m, lat, emb):
|
|
def _run(m, lat, emb):
|
|
@@ -207,9 +439,14 @@ def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> n
|
|
|
|
|
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
try:
|
|
try:
|
|
|
- return _run(model, gpt_cond_latent, speaker_embedding)
|
|
|
|
|
|
|
+ result = _run(model, gpt_cond_latent, speaker_embedding)
|
|
|
|
|
+ # Release XTTS activation memory after every chunk so it doesn't
|
|
|
|
|
+ # accumulate across a long document and starve the next request.
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
+ return result
|
|
|
except torch.cuda.OutOfMemoryError:
|
|
except torch.cuda.OutOfMemoryError:
|
|
|
- print(f"⚠ CUDA OOM on chunk – falling back to CPU ({os.cpu_count()} cores)")
|
|
|
|
|
|
|
+ print(f"⚠ CUDA OOM – falling back to CPU ({os.cpu_count()} cores)")
|
|
|
torch.cuda.empty_cache()
|
|
torch.cuda.empty_cache()
|
|
|
model.to("cpu")
|
|
model.to("cpu")
|
|
|
try:
|
|
try:
|
|
@@ -219,7 +456,6 @@ def _infer_chunk(chunk: str, lang: str, gpt_cond_latent, speaker_embedding) -> n
|
|
|
speaker_embedding.to("cpu"),
|
|
speaker_embedding.to("cpu"),
|
|
|
)
|
|
)
|
|
|
finally:
|
|
finally:
|
|
|
- # Always move back, even if CPU inference also fails
|
|
|
|
|
model.to("cuda")
|
|
model.to("cuda")
|
|
|
torch.cuda.empty_cache()
|
|
torch.cuda.empty_cache()
|
|
|
return result
|
|
return result
|
|
@@ -232,10 +468,21 @@ def root():
|
|
|
return {"status": "XTTS server running", "device": _device}
|
|
return {"status": "XTTS server running", "device": _device}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/health")
def health():
    """Health endpoint: reports serving device and, on CUDA, VRAM usage."""
    payload = {"status": "ok", "device": _device}
    if torch.cuda.is_available():
        free, total = torch.cuda.mem_get_info()
        payload["vram_free_mb"] = round(free / 1024 ** 2)
        payload["vram_total_mb"] = round(total / 1024 ** 2)
        payload["vram_used_pct"] = round((1 - free / total) * 100, 1)
    return payload
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
@app.get("/voices")
|
|
@app.get("/voices")
|
|
|
def list_voices():
|
|
def list_voices():
|
|
|
- seen = set()
|
|
|
|
|
- voices = []
|
|
|
|
|
|
|
+ seen: set = set()
|
|
|
|
|
+ voices: list = []
|
|
|
for f in VOICE_DIR.iterdir():
|
|
for f in VOICE_DIR.iterdir():
|
|
|
if f.suffix in {".wav", ".mp3"} and f.stem not in seen:
|
|
if f.suffix in {".wav", ".mp3"} and f.stem not in seen:
|
|
|
voices.append(f.stem)
|
|
voices.append(f.stem)
|
|
@@ -251,31 +498,34 @@ def synthesize(text: str, voice: str = "default", lang: str = "en"):
|
|
|
|
|
|
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
|
|
|
|
|
|
- # If VRAM is already scarce, pin embeddings on CPU for this whole request
|
|
|
|
|
|
|
+ # Pin everything to CPU for this request if VRAM is already low
|
|
|
use_cpu = _vram_low()
|
|
use_cpu = _vram_low()
|
|
|
if use_cpu and torch.cuda.is_available():
|
|
if use_cpu and torch.cuda.is_available():
|
|
|
print("⚠ Low VRAM – pinning entire request to CPU")
|
|
print("⚠ Low VRAM – pinning entire request to CPU")
|
|
|
- gpt_cond_latent = gpt_cond_latent.to("cpu")
|
|
|
|
|
|
|
+ gpt_cond_latent = gpt_cond_latent.to("cpu")
|
|
|
speaker_embedding = speaker_embedding.to("cpu")
|
|
speaker_embedding = speaker_embedding.to("cpu")
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
tts.synthesizer.tts_model.to("cpu")
|
|
tts.synthesizer.tts_model.to("cpu")
|
|
|
|
|
|
|
|
- chunks = chunk_text(text)
|
|
|
|
|
|
|
+ chunks = prepare_text(text, lang)
|
|
|
wav_all = []
|
|
wav_all = []
|
|
|
|
|
|
|
|
for i, chunk in enumerate(chunks):
|
|
for i, chunk in enumerate(chunks):
|
|
|
- print(f" chunk {i+1}/{len(chunks)}: {chunk[:60]!r}")
|
|
|
|
|
|
|
+ print(f" chunk {i+1}/{len(chunks)}: {chunk[:80]!r}")
|
|
|
try:
|
|
try:
|
|
|
wav_chunk = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
wav_chunk = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
raise HTTPException(500, f"Inference failed on chunk {i+1}: {e}")
|
|
raise HTTPException(500, f"Inference failed on chunk {i+1}: {e}")
|
|
|
wav_all.append(wav_chunk)
|
|
wav_all.append(wav_chunk)
|
|
|
|
|
|
|
|
- # Restore model to GPU if we moved it
|
|
|
|
|
if use_cpu and torch.cuda.is_available():
|
|
if use_cpu and torch.cuda.is_available():
|
|
|
with _model_lock:
|
|
with _model_lock:
|
|
|
tts.synthesizer.tts_model.to("cuda")
|
|
tts.synthesizer.tts_model.to("cuda")
|
|
|
|
|
|
|
|
|
|
+ # Final sweep — catches anything the per-chunk clears missed
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
+
|
|
|
wav = np.concatenate(wav_all, axis=0)
|
|
wav = np.concatenate(wav_all, axis=0)
|
|
|
buf = io.BytesIO()
|
|
buf = io.BytesIO()
|
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
@@ -286,16 +536,16 @@ def synthesize(text: str, voice: str = "default", lang: str = "en"):
|
|
|
@app.get("/tts_stream")
|
|
@app.get("/tts_stream")
|
|
|
@app.get("/api/tts_stream")
|
|
@app.get("/api/tts_stream")
|
|
|
def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
|
- """Stream WAV chunks as they are synthesised — lower latency for long texts."""
|
|
|
|
|
|
|
+ """Stream WAV chunks as synthesised — lower latency for long texts."""
|
|
|
if not text.strip():
|
|
if not text.strip():
|
|
|
raise HTTPException(400, "text parameter is empty")
|
|
raise HTTPException(400, "text parameter is empty")
|
|
|
|
|
|
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
gpt_cond_latent, speaker_embedding = get_embedding(voice)
|
|
|
- chunks = chunk_text(text)
|
|
|
|
|
|
|
+ chunks = prepare_text(text, lang)
|
|
|
|
|
|
|
|
def audio_generator():
|
|
def audio_generator():
|
|
|
for i, chunk in enumerate(chunks):
|
|
for i, chunk in enumerate(chunks):
|
|
|
- print(f" [stream] chunk {i+1}/{len(chunks)}: {chunk[:60]!r}")
|
|
|
|
|
|
|
+ print(f" [stream] chunk {i+1}/{len(chunks)}: {chunk[:80]!r}")
|
|
|
try:
|
|
try:
|
|
|
wav = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
wav = _infer_chunk(chunk, lang, gpt_cond_latent, speaker_embedding)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -305,5 +555,9 @@ def synthesize_stream(text: str, voice: str = "default", lang: str = "en"):
|
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
sf.write(buf, wav, SAMPLE_RATE, format="WAV")
|
|
|
buf.seek(0)
|
|
buf.seek(0)
|
|
|
yield buf.read()
|
|
yield buf.read()
|
|
|
|
|
+ # Clear after each streamed chunk — long documents would otherwise
|
|
|
|
|
+ # accumulate VRAM and cause the next request to fall back to CPU.
|
|
|
|
|
+ if torch.cuda.is_available():
|
|
|
|
|
+ torch.cuda.empty_cache()
|
|
|
|
|
|
|
|
- return StreamingResponse(audio_generator(), media_type="audio/wav")
|
|
|
|
|
|
|
+ return StreamingResponse(audio_generator(), media_type="audio/wav")
|