#!/usr/bin/env python3
"""
Minimal whisper.cpp HTTP server (one-shot transcription)

POST /transcribe
  multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)

Returns:
  { "text": "..." }

If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.

Notes:
- This version forces English (-l en). Change to "de" if needed.
- It converts any input to 16kHz mono WAV via ffmpeg for robustness.
"""

import os
import shutil
import subprocess
import tempfile

from flask import Flask, request, jsonify

app = Flask(__name__)

# Adjust these paths for your machine:
# e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli
WHISPER_BIN = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
MODEL_PATH = os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")

# Language to force; set to None to let whisper auto-detect
# (forcing is recommended for reliability).
FORCE_LANG = "en"  # "en" or "de" etc.

# ffmpeg settings for stable whisper input
WAV_AR = "16000"
WAV_AC = "1"


def _run(cmd, timeout=300):
    """Run a subprocess and return (returncode, stdout_str, stderr_str).

    Args:
        cmd: Argument list passed to ``subprocess.run`` (no shell involved).
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        Tuple of the exit code and the captured stdout / stderr decoded as
        UTF-8, with undecodable bytes dropped.

    Raises:
        subprocess.TimeoutExpired: The command exceeded ``timeout``.
        OSError: The executable could not be started (e.g. not found).
    """
    r = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=timeout,
    )
    return (
        r.returncode,
        r.stdout.decode("utf-8", errors="ignore"),
        r.stderr.decode("utf-8", errors="ignore"),
    )


@app.get("/health")
def health():
    """Report whether the whisper binary, the model file and ffmpeg are usable.

    Returns:
        JSON ``{"ok": bool, "problems": [str, ...]}``; ``ok`` is False as soon
        as any check fails and ``problems`` lists every failed check.
    """
    problems = []

    if not os.path.exists(WHISPER_BIN):
        problems.append(f"WHISPER_BIN not found: {WHISPER_BIN}")
    if not os.path.exists(MODEL_PATH):
        problems.append(f"MODEL_PATH not found: {MODEL_PATH}")

    # Check ffmpeg availability
    try:
        code, _, _ = _run(["ffmpeg", "-version"], timeout=10)
        if code != 0:
            problems.append("ffmpeg not working")
    except (OSError, subprocess.SubprocessError) as e:
        # OSError: ffmpeg missing / not executable; SubprocessError: timeout.
        problems.append(f"ffmpeg check failed: {e}")

    return jsonify({"ok": not problems, "problems": problems})


@app.post("/transcribe")
def transcribe():
    """Transcribe the uploaded audio file with whisper.cpp.

    Expects multipart/form-data with a ``file`` field. The upload is converted
    to 16 kHz mono PCM WAV with ffmpeg and then passed to ``WHISPER_BIN``.

    Returns:
        ``({"text": ...}, 200)`` on success. When the transcript is empty the
        response is still 200 but also carries whisper's return code, the tail
        of its stdout/stderr and the command line, to help debugging.
        400 when the ``file`` field or its filename is missing, 500 when ffmpeg
        fails or an unexpected exception occurs, 504 when a subprocess times
        out.
    """
    if "file" not in request.files:
        return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400

    up = request.files["file"]
    if not up.filename:
        return jsonify({"error": "empty filename"}), 400

    # Every per-request file lives in one private directory. This avoids
    # reusing the name of an already-deleted temp file for whisper's output
    # and lets cleanup remove whatever the tools wrote in a single step.
    workdir = tempfile.mkdtemp(prefix="whisper-")
    ext = os.path.splitext(up.filename)[1] or ".bin"
    in_path = os.path.join(workdir, "input" + ext)
    wav_path = os.path.join(workdir, "audio.wav")
    # whisper.cpp appends the extension itself (we set -of explicitly).
    out_base = os.path.join(workdir, "transcript")

    try:
        # Saving inside the try block so a failed write is reported as JSON
        # and the work directory is still removed.
        up.save(in_path)

        # 1) Convert input -> WAV 16kHz mono (whisper-friendly)
        ffmpeg_cmd = [
            "ffmpeg", "-hide_banner", "-loglevel", "error",
            "-y",
            "-i", in_path,
            "-ar", WAV_AR,
            "-ac", WAV_AC,
            "-c:a", "pcm_s16le",
            wav_path,
        ]
        ff_code, _, ff_err = _run(ffmpeg_cmd, timeout=120)
        if ff_code != 0:
            return jsonify({
                "error": "ffmpeg conversion failed",
                "returncode": ff_code,
                "stderr": ff_err[-4000:],
                "cmd": ffmpeg_cmd,
            }), 500

        # 2) Run whisper.cpp
        # whisper.cpp flags vary slightly by build; these are commonly
        # supported in whisper.cpp "main".
        whisper_cmd = [
            WHISPER_BIN,
            "-m", MODEL_PATH,
            "-f", wav_path,
            "-otxt",
            "-of", out_base,
            "-nt",
        ]
        if FORCE_LANG:
            whisper_cmd += ["-l", FORCE_LANG]

        w_code, w_out, w_err = _run(whisper_cmd, timeout=300)

        txt_path = out_base + ".txt"
        text = ""
        if os.path.exists(txt_path):
            # errors="replace": a transcript containing invalid UTF-8 would
            # otherwise raise UnicodeDecodeError and be lost behind a 500.
            with open(txt_path, "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()

        # If empty, return diagnostics (don't hide it behind a 500)
        if not text:
            return jsonify({
                "text": "",
                "note": "empty transcript; returning diagnostics",
                "returncode": w_code,
                "stdout": w_out[-4000:],
                "stderr": w_err[-4000:],
                "cmd": whisper_cmd,
            }), 200

        return jsonify({"text": text}), 200

    except subprocess.TimeoutExpired:
        return jsonify({"error": "timeout"}), 504
    except Exception as e:
        # Top-level request boundary: report any failure as JSON.
        return jsonify({"error": f"server exception: {e}"}), 500
    finally:
        # Best-effort cleanup of the upload, the WAV and all whisper outputs.
        shutil.rmtree(workdir, ignore_errors=True)


if __name__ == "__main__":
    # Listen on LAN
    app.run(host="0.0.0.0", port=5005)