| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- #!/usr/bin/env python3
- """
- Minimal whisper.cpp HTTP server (one-shot transcription)
- POST /transcribe
- multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
- Returns:
- { "text": "..." }
- If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.
- Notes:
- - This version forces English (-l en). Change to "de" if needed.
- - It converts any input to 16kHz mono WAV via ffmpeg for robustness.
- """
- from flask import Flask, request, jsonify
- import subprocess
- import os
- import tempfile
app = Flask(__name__)

# Locations of the whisper.cpp CLI binary (e.g. ~/whisper.cpp/main or
# ~/whisper.cpp/build/bin/whisper-cli) and of the ggml model file.
# The defaults below can be overridden per machine with the WHISPER_BIN and
# WHISPER_MODEL_PATH environment variables instead of editing this file.
WHISPER_BIN = os.path.expanduser(
    os.environ.get("WHISPER_BIN", "~/whisper.cpp/build/bin/whisper-cli")
)
MODEL_PATH = os.path.expanduser(
    os.environ.get("WHISPER_MODEL_PATH", "~/whisper.cpp/models/ggml-small.bin")
)

# Language code handed to whisper.cpp through its -l option ("en", "de", ...).
# Forcing a language is recommended for reliability; use "auto" to let
# whisper.cpp detect the spoken language itself.
# Override with the WHISPER_LANG environment variable.
FORCE_LANG = os.environ.get("WHISPER_LANG", "en")

# ffmpeg output settings for stable whisper input: sample rate (Hz) and
# channel count, kept as strings because they are passed as CLI arguments.
WAV_AR = "16000"
WAV_AC = "1"
- def _run(cmd, timeout=300):
- """Run subprocess and return (returncode, stdout_str, stderr_str)."""
- r = subprocess.run(
- cmd,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- timeout=timeout,
- )
- return (
- r.returncode,
- r.stdout.decode("utf-8", errors="ignore"),
- r.stderr.decode("utf-8", errors="ignore"),
- )
@app.get("/health")
def health():
    """Report whether the server's external dependencies look usable.

    Checks that WHISPER_BIN and MODEL_PATH exist on disk and that
    ``ffmpeg -version`` runs with exit code 0.

    Returns:
        A JSON body ``{"ok": bool, "problems": [str, ...]}``. The outcome is
        carried in the body only; no explicit status code is set.
    """
    issues = []

    if not os.path.exists(WHISPER_BIN):
        issues.append(f"WHISPER_BIN not found: {WHISPER_BIN}")
    if not os.path.exists(MODEL_PATH):
        issues.append(f"MODEL_PATH not found: {MODEL_PATH}")

    # ffmpeg has to be launchable and must exit cleanly.
    try:
        status, _stdout, _stderr = _run(["ffmpeg", "-version"], timeout=10)
    except Exception as exc:
        issues.append(f"ffmpeg check failed: {exc}")
    else:
        if status != 0:
            issues.append("ffmpeg not working")

    # Healthy exactly when nothing was recorded above.
    return jsonify({"ok": not issues, "problems": issues})
@app.post("/transcribe")
def transcribe():
    """Transcribe one uploaded audio file with whisper.cpp.

    Expects ``multipart/form-data`` with the audio in the ``file`` field.
    The upload is converted by ffmpeg to 16-bit PCM WAV (WAV_AR Hz, WAV_AC
    channels) and the result is passed to WHISPER_BIN with MODEL_PATH.

    Returns:
        A ``(json, status)`` pair:

        * 200 ``{"text": ...}`` on success. An empty transcript is also a
          200, with ``note``, ``returncode``, ``stdout``, ``stderr`` and
          ``cmd`` added as diagnostics.
        * 400 when the ``file`` field is missing or has an empty filename.
        * 500 when ffmpeg fails or an unexpected exception occurs.
        * 504 when ffmpeg or whisper.cpp exceeds its timeout.
    """
    import shutil  # local import: only needed for temp-dir cleanup here

    if "file" not in request.files:
        return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400
    up = request.files["file"]
    if not up.filename:
        return jsonify({"error": "empty filename"}), 400

    # The client-supplied filename is untrusted. Keep only a short,
    # alphanumeric extension (a format hint for ffmpeg); otherwise use ".bin".
    ext = os.path.splitext(up.filename)[1]
    if not (1 < len(ext) <= 16 and ext[1:].isalnum()):
        ext = ".bin"

    workdir = None
    try:
        # One private directory holds every intermediate file, so there is
        # no create-then-unlink path race and cleanup cannot miss an output.
        workdir = tempfile.mkdtemp(prefix="whisper-")
        in_path = os.path.join(workdir, "input" + ext)
        wav_path = os.path.join(workdir, "audio.wav")
        out_base = os.path.join(workdir, "transcript")  # whisper appends ".txt"

        up.save(in_path)

        # 1) Convert input -> wav
        ffmpeg_cmd = [
            "ffmpeg",
            "-hide_banner",
            "-loglevel", "error",
            "-y",
            "-i", in_path,
            "-ar", WAV_AR,
            "-ac", WAV_AC,
            "-c:a", "pcm_s16le",
            wav_path,
        ]
        ff_code, _ff_out, ff_err = _run(ffmpeg_cmd, timeout=120)
        if ff_code != 0:
            return jsonify({
                "error": "ffmpeg conversion failed",
                "returncode": ff_code,
                "stderr": ff_err[-4000:],
                "cmd": ffmpeg_cmd,
            }), 500

        # 2) Run whisper.cpp
        # Flags vary slightly by build; these are commonly supported.
        whisper_cmd = [
            WHISPER_BIN,
            "-m", MODEL_PATH,
            "-f", wav_path,
            "-otxt",
            "-of", out_base,
            "-nt",
            # Without -l whisper.cpp falls back to its own default language,
            # so request auto-detection explicitly when none is forced.
            "-l", FORCE_LANG or "auto",
        ]
        w_code, w_out, w_err = _run(whisper_cmd, timeout=300)

        text = ""
        try:
            # errors="replace": a split multi-byte sequence in the transcript
            # must not turn an otherwise good result into a 500.
            with open(out_base + ".txt", "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()
        except FileNotFoundError:
            # whisper wrote no output file; reported through diagnostics below.
            pass

        # If empty, return diagnostics (deliberately not hidden behind a 500).
        if not text:
            return jsonify({
                "text": "",
                "note": "empty transcript; returning diagnostics",
                "returncode": w_code,
                "stdout": w_out[-4000:],
                "stderr": w_err[-4000:],
                "cmd": whisper_cmd,
            }), 200
        return jsonify({"text": text}), 200
    except subprocess.TimeoutExpired:
        return jsonify({"error": "timeout"}), 504
    except Exception as e:
        return jsonify({"error": f"server exception: {e}"}), 500
    finally:
        # Best-effort cleanup: never let it mask the actual response.
        if workdir is not None:
            shutil.rmtree(workdir, ignore_errors=True)
if __name__ == "__main__":
    # Bind to all interfaces so other machines on the LAN can reach the server.
    listen = {"host": "0.0.0.0", "port": 5005}
    app.run(**listen)
|