4 months ago · e8f88eb57c
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,14 @@
 
				+# Byte-compiled / optimized / DLL files
			
 
				+__pycache__/
			
 
				+*.py[codz]
			
 
				+*$py.class
			
 
				+
			
 
				+# Environments
			
 
				+.env
			
 
				+.envrc
			
 
				+.venv
			
 
				+env/
			
 
				+venv/
			
 
				+ENV/
			
 
				+env.bak/
			
 
				+venv.bak/
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,3 @@
 
				+# Whisper STT Server
			
 
				+
			
 
				+This first Flask version works for me.
			
--- a/server.py
+++ b/server.py
@@ -0,0 +1,171 @@
 
				+#!/usr/bin/env python3
			
 
				+"""
			
 
				+Minimal whisper.cpp HTTP server (one-shot transcription)
			
 
				+
			
 
				+POST /transcribe
			
 
				+  multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
			
 
				+Returns:
			
 
				+  { "text": "..." }
			
 
				+
			
 
				+If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.
			
 
				+
			
 
				+Notes:
			
 
				+- This version forces English (-l en). Change to "de" if needed.
			
 
				+- It converts any input to 16kHz mono WAV via ffmpeg for robustness.
			
 
				+"""
			
 
				+
			
 
				+from flask import Flask, request, jsonify
			
 
				+import subprocess
			
 
				+import os
			
 
				+import tempfile
			
 
				+
			
 
				+app = Flask(__name__)
			
 
				+
			
 
				+# Adjust these paths for your machine:
			
 
				+WHISPER_BIN = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")  # e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli
			
 
				+MODEL_PATH = os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
			
 
				+
			
 
				+# Language to force; set to None to let whisper auto-detect (I recommend forcing for reliability)
			
 
				+FORCE_LANG = "en"   # "en" or "de" etc.
			
 
				+
			
 
				+# ffmpeg settings for stable whisper input
			
 
				+WAV_AR = "16000"
			
 
				+WAV_AC = "1"
			
 
				+
			
 
				+
			
 
				+def _run(cmd, timeout=300):
			
 
				+    """Run subprocess and return (returncode, stdout_str, stderr_str)."""
			
 
				+    r = subprocess.run(
			
 
				+        cmd,
			
 
				+        stdout=subprocess.PIPE,
			
 
				+        stderr=subprocess.PIPE,
			
 
				+        timeout=timeout,
			
 
				+    )
			
 
				+    return (
			
 
				+        r.returncode,
			
 
				+        r.stdout.decode("utf-8", errors="ignore"),
			
 
				+        r.stderr.decode("utf-8", errors="ignore"),
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+@app.get("/health")
			
 
				+def health():
			
 
				+    ok = True
			
 
				+    problems = []
			
 
				+
			
 
				+    if not os.path.exists(WHISPER_BIN):
			
 
				+        ok = False
			
 
				+        problems.append(f"WHISPER_BIN not found: {WHISPER_BIN}")
			
 
				+    if not os.path.exists(MODEL_PATH):
			
 
				+        ok = False
			
 
				+        problems.append(f"MODEL_PATH not found: {MODEL_PATH}")
			
 
				+
			
 
				+    # Check ffmpeg availability
			
 
				+    try:
			
 
				+        code, out, err = _run(["ffmpeg", "-version"], timeout=10)
			
 
				+        if code != 0:
			
 
				+            ok = False
			
 
				+            problems.append("ffmpeg not working")
			
 
				+    except Exception as e:
			
 
				+        ok = False
			
 
				+        problems.append(f"ffmpeg check failed: {e}")
			
 
				+
			
 
				+    return jsonify({"ok": ok, "problems": problems})
			
 
				+
			
 
				+
			
 
				+@app.post("/transcribe")
			
 
				+def transcribe():
			
 
				+    if "file" not in request.files:
			
 
				+        return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400
			
 
				+
			
 
				+    up = request.files["file"]
			
 
				+    if not up.filename:
			
 
				+        return jsonify({"error": "empty filename"}), 400
			
 
				+
			
 
				+    # Store uploaded audio to temp
			
 
				+    in_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(up.filename)[1] or ".bin")
			
 
				+    up.save(in_tmp.name)
			
 
				+    in_tmp.close()
			
 
				+
			
 
				+    # Convert to WAV 16kHz mono (whisper-friendly)
			
 
				+    wav_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
			
 
				+    wav_tmp.close()
			
 
				+
			
 
				+    # Where whisper.cpp will write output .txt (we set -of explicitly)
			
 
				+    out_base = tempfile.NamedTemporaryFile(delete=False)
			
 
				+    out_base.close()
			
 
				+    os.unlink(out_base.name)  # we only want the path, whisper will create files
			
 
				+
			
 
				+    try:
			
 
				+        # 1) Convert input -> wav
			
 
				+        ffmpeg_cmd = [
			
 
				+            "ffmpeg",
			
 
				+            "-hide_banner",
			
 
				+            "-loglevel", "error",
			
 
				+            "-y",
			
 
				+            "-i", in_tmp.name,
			
 
				+            "-ar", WAV_AR,
			
 
				+            "-ac", WAV_AC,
			
 
				+            "-c:a", "pcm_s16le",
			
 
				+            wav_tmp.name,
			
 
				+        ]
			
 
				+        ff_code, ff_out, ff_err = _run(ffmpeg_cmd, timeout=120)
			
 
				+        if ff_code != 0:
			
 
				+            return jsonify({
			
 
				+                "error": "ffmpeg conversion failed",
			
 
				+                "returncode": ff_code,
			
 
				+                "stderr": ff_err[-4000:],
			
 
				+                "cmd": ffmpeg_cmd,
			
 
				+            }), 500
			
 
				+
			
 
				+        # 2) Run whisper.cpp
			
 
				+        # whisper.cpp flags vary slightly by build; these are commonly supported in whisper.cpp "main"
			
 
				+        whisper_cmd = [
			
 
				+            WHISPER_BIN,
			
 
				+            "-m", MODEL_PATH,
			
 
				+            "-f", wav_tmp.name,
			
 
				+            "-otxt",
			
 
				+            "-of", out_base.name,
			
 
				+	    "-nt"
			
 
				+        ]
			
 
				+        if FORCE_LANG:
			
 
				+            whisper_cmd += ["-l", FORCE_LANG]
			
 
				+
			
 
				+        w_code, w_out, w_err = _run(whisper_cmd, timeout=300)
			
 
				+
			
 
				+        txt_path = out_base.name + ".txt"
			
 
				+        text = ""
			
 
				+        if os.path.exists(txt_path):
			
 
				+            with open(txt_path, "r", encoding="utf-8") as fh:
			
 
				+                text = fh.read().strip()
			
 
				+
			
 
				+        # If empty, return diagnostics (don’t hide it behind a 500)
			
 
				+        if not text:
			
 
				+            return jsonify({
			
 
				+                "text": "",
			
 
				+                "note": "empty transcript; returning diagnostics",
			
 
				+                "returncode": w_code,
			
 
				+                "stdout": w_out[-4000:],
			
 
				+                "stderr": w_err[-4000:],
			
 
				+                "cmd": whisper_cmd,
			
 
				+            }), 200
			
 
				+
			
 
				+        return jsonify({"text": text}), 200
			
 
				+
			
 
				+    except subprocess.TimeoutExpired:
			
 
				+        return jsonify({"error": "timeout"}), 504
			
 
				+    except Exception as e:
			
 
				+        return jsonify({"error": f"server exception: {e}"}), 500
			
 
				+    finally:
			
 
				+        # cleanup
			
 
				+        for p in [in_tmp.name, wav_tmp.name, out_base.name + ".txt", out_base.name + ".json", out_base.name + ".srt", out_base.name + ".vtt"]:
			
 
				+            try:
			
 
				+                if os.path.exists(p):
			
 
				+                    os.unlink(p)
			
 
				+            except Exception:
			
 
				+                pass
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    # Listen on LAN
			
 
				+    app.run(host="0.0.0.0", port=5005)