Browse Source

Initial commit

Lukas Goldschmidt 6 days ago
commit
e8f88eb57c
3 changed files with 188 additions and 0 deletions
  1. 14 0
      .gitignore
  2. 3 0
      README.md
  3. 171 0
      server.py

+ 14 - 0
.gitignore

@@ -0,0 +1,14 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/

+ 3 - 0
README.md

@@ -0,0 +1,3 @@
+# Whisper STT Server
+
+This first Flask version works for me.

+ 171 - 0
server.py

@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+"""
+Minimal whisper.cpp HTTP server (one-shot transcription)
+
+POST /transcribe
+  multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
+Returns:
+  { "text": "..." }
+
+If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.
+
+Notes:
+- This version forces English (-l en). Change to "de" if needed.
+- It converts any input to 16kHz mono WAV via ffmpeg for robustness.
+"""
+
+import os
+import shutil
+import subprocess
+import tempfile
+
+from flask import Flask, request, jsonify
+
+# Flask application object; the route handlers below register on it.
+app = Flask(__name__)
+
+# Adjust these paths for your machine:
+WHISPER_BIN = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")  # e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli
+MODEL_PATH = os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
+
+# Language code passed to whisper via "-l". Set to None to omit the flag and
+# let whisper auto-detect (forcing a language is recommended for reliability).
+FORCE_LANG = "en"   # "en" or "de" etc.
+
+# ffmpeg output settings for stable whisper input: WAV_AR is passed as "-ar"
+# (sample rate, 16 kHz) and WAV_AC as "-ac" (channel count, mono).
+WAV_AR = "16000"
+WAV_AC = "1"
+
+
+def _run(cmd, timeout=300):
+    """Execute *cmd* and hand back ``(returncode, stdout, stderr)``.
+
+    Both output streams are captured as bytes and decoded as UTF-8, with
+    undecodable bytes dropped. ``subprocess.TimeoutExpired`` propagates to the
+    caller when the command runs longer than *timeout* seconds.
+    """
+    proc = subprocess.run(
+        cmd,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        timeout=timeout,
+    )
+    stdout_text, stderr_text = (
+        stream.decode("utf-8", errors="ignore")
+        for stream in (proc.stdout, proc.stderr)
+    )
+    return proc.returncode, stdout_text, stderr_text
+
+
+@app.get("/health")
+def health():
+    """Report whether the whisper binary, the model file and ffmpeg are usable.
+
+    Responds with JSON ``{"ok": bool, "problems": [str, ...]}``; ``ok`` is true
+    only when no problem was recorded.
+    """
+    issues = []
+
+    # Both configured paths must exist on disk.
+    for label, path in (("WHISPER_BIN", WHISPER_BIN), ("MODEL_PATH", MODEL_PATH)):
+        if not os.path.exists(path):
+            issues.append(f"{label} not found: {path}")
+
+    # ffmpeg must be launchable and exit cleanly when asked for its version.
+    try:
+        exit_code = _run(["ffmpeg", "-version"], timeout=10)[0]
+    except Exception as exc:
+        issues.append(f"ffmpeg check failed: {exc}")
+    else:
+        if exit_code != 0:
+            issues.append("ffmpeg not working")
+
+    return jsonify({"ok": not issues, "problems": issues})
+
+
+def _safe_suffix(filename):
+    """Return a short ASCII-alphanumeric extension for *filename*, or ".bin"."""
+    ext = os.path.splitext(filename)[1]
+    # The filename is client-controlled: drop anything that is not a plain
+    # letter or digit and cap the length so it is always a valid path part.
+    cleaned = "".join(ch for ch in ext if ch.isalnum() and ord(ch) < 128)[:16]
+    return f".{cleaned}" if cleaned else ".bin"
+
+
+@app.post("/transcribe")
+def transcribe():
+    """Transcribe an uploaded audio file with whisper.cpp.
+
+    Expects multipart/form-data with a ``file`` field. The upload is converted
+    to WAV by ffmpeg (using WAV_AR / WAV_AC) and then passed to WHISPER_BIN.
+
+    Responses (JSON body, HTTP status):
+      - ``{"text": "..."}``, 200 when a transcript was produced.
+      - ``{"text": "", "note", "returncode", "stdout", "stderr", "cmd"}``, 200
+        when the transcript is empty, so the caller can see why.
+      - ``{"error": ...}``, 400 when the ``file`` field or its filename is missing.
+      - ``{"error": ...}``, 500 when ffmpeg fails or an unexpected exception occurs.
+      - ``{"error": "timeout"}``, 504 when a subprocess exceeds its timeout.
+    """
+    if "file" not in request.files:
+        return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400
+
+    up = request.files["file"]
+    if not up.filename:
+        return jsonify({"error": "empty filename"}), 400
+
+    # All intermediates live in one private directory. This avoids reusing the
+    # name of a deleted temp file (a race in a shared temp dir) and lets
+    # cleanup remove whatever whisper wrote without listing every suffix.
+    workdir = tempfile.mkdtemp(prefix="whisper-stt-")
+    in_path = os.path.join(workdir, "input" + _safe_suffix(up.filename))
+    wav_path = os.path.join(workdir, "audio.wav")
+    # Base path for whisper output; "-of" makes it write <out_base>.txt.
+    out_base = os.path.join(workdir, "transcript")
+
+    try:
+        # Saving happens inside the try so a failed write is reported as JSON
+        # and the work directory is still removed.
+        up.save(in_path)
+
+        # 1) Convert input -> wav
+        ffmpeg_cmd = [
+            "ffmpeg",
+            "-hide_banner",
+            "-loglevel", "error",
+            "-y",
+            "-i", in_path,
+            "-ar", WAV_AR,
+            "-ac", WAV_AC,
+            "-c:a", "pcm_s16le",
+            wav_path,
+        ]
+        ff_code, _, ff_err = _run(ffmpeg_cmd, timeout=120)
+        if ff_code != 0:
+            return jsonify({
+                "error": "ffmpeg conversion failed",
+                "returncode": ff_code,
+                "stderr": ff_err[-4000:],
+                "cmd": ffmpeg_cmd,
+            }), 500
+
+        # 2) Run whisper.cpp
+        # whisper.cpp flags vary slightly by build; these are commonly supported in whisper.cpp "main"
+        whisper_cmd = [
+            WHISPER_BIN,
+            "-m", MODEL_PATH,
+            "-f", wav_path,
+            "-otxt",
+            "-of", out_base,
+            "-nt",
+        ]
+        if FORCE_LANG:
+            whisper_cmd += ["-l", FORCE_LANG]
+
+        w_code, w_out, w_err = _run(whisper_cmd, timeout=300)
+
+        txt_path = out_base + ".txt"
+        text = ""
+        if os.path.exists(txt_path):
+            # errors="replace": one undecodable byte in the output should not
+            # turn an otherwise usable transcript into a 500.
+            with open(txt_path, "r", encoding="utf-8", errors="replace") as fh:
+                text = fh.read().strip()
+
+        # If empty, return diagnostics (don't hide it behind a 500)
+        if not text:
+            return jsonify({
+                "text": "",
+                "note": "empty transcript; returning diagnostics",
+                "returncode": w_code,
+                "stdout": w_out[-4000:],
+                "stderr": w_err[-4000:],
+                "cmd": whisper_cmd,
+            }), 200
+
+        return jsonify({"text": text}), 200
+
+    except subprocess.TimeoutExpired:
+        return jsonify({"error": "timeout"}), 504
+    except Exception as e:
+        return jsonify({"error": f"server exception: {e}"}), 500
+    finally:
+        # Best-effort cleanup: a failure to delete must not mask the response.
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    # Listen on LAN: host 0.0.0.0 binds every network interface, port 5005.
+    # NOTE(review): no authentication is visible in this file, so any host that
+    # can reach this port can submit audio - confirm that is intended.
+    app.run(host="0.0.0.0", port=5005)