#!/usr/bin/env python3
# lucky/whisper-server
"""
Minimal whisper.cpp HTTP server (one-shot transcription)

POST /transcribe
  multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
Returns:
  { "text": "..." }

If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.

Notes:
- This version forces English (-l en). Change to "de" if needed.
- It converts any input to 16kHz mono WAV via ffmpeg for robustness.
"""

import os
import shutil
import subprocess
import tempfile

from flask import Flask, request, jsonify

# Flask application object; the route decorators in this file register on it.
app = Flask(__name__)

# Locations of the whisper.cpp command-line binary and the ggml model file.
# Both can be overridden through the WHISPER_BIN / MODEL_PATH environment
# variables. The defaults assume a checkout in ~/whisper.cpp; depending on the
# build, the binary may instead be ~/whisper.cpp/main.

WHISPER_BIN = os.environ.get(
    "WHISPER_BIN",
    os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
)

MODEL_PATH = os.environ.get(
    "MODEL_PATH",
    os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
)

# Language code handed to whisper through "-l". A falsy value (None or "")
# makes /transcribe leave the flag out altogether.
# NOTE(review): leaving "-l" out may not enable auto-detection -- whisper.cpp's
# CLI help appears to list "en" as the default and "auto" for detection;
# confirm with `whisper-cli --help` before relying on None here.
FORCE_LANG = "en"   # "en" or "de" etc.

# ffmpeg output parameters used when converting uploads to WAV:
# sample rate in Hz (passed to "-ar") and channel count (passed to "-ac").
WAV_AR = "16000"
WAV_AC = "1"


def _run(cmd, timeout=300):
    """Run *cmd* to completion and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: Argument list passed to ``subprocess.run`` (no shell involved).
        timeout: Seconds to wait before the child is killed.

    Returns:
        A 3-tuple of the exit status and the captured stdout / stderr decoded
        as UTF-8; bytes that cannot be decoded are dropped.

    Raises:
        subprocess.TimeoutExpired: If the child runs longer than *timeout*.
        OSError: If the executable cannot be started (e.g. it does not exist).
    """
    result = subprocess.run(
        cmd,
        # Never let the child inherit the server's stdin: ffmpeg reads
        # interactive commands from it by default, which can swallow terminal
        # input or stall when the server runs in the background.
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=timeout,
    )
    return (
        result.returncode,
        result.stdout.decode("utf-8", errors="ignore"),
        result.stderr.decode("utf-8", errors="ignore"),
    )


@app.get("/health")
def health():
    """Report whether the whisper binary, the model file and ffmpeg are usable.

    Responds with JSON ``{"ok": bool, "problems": [str, ...]}``. ``ok`` is
    true exactly when ``problems`` is empty. No explicit status code is set,
    so failures are signalled in the body only.
    """
    # Each entry pairs a required path with the message reported if it is missing.
    required_paths = (
        (WHISPER_BIN, f"WHISPER_BIN not found: {WHISPER_BIN}"),
        (MODEL_PATH, f"MODEL_PATH not found: {MODEL_PATH}"),
    )
    problems = [message for path, message in required_paths if not os.path.exists(path)]

    # ffmpeg is looked up on PATH; "-version" is a cheap way to see if it starts.
    try:
        exit_code, _stdout, _stderr = _run(["ffmpeg", "-version"], timeout=10)
    except Exception as exc:
        problems.append(f"ffmpeg check failed: {exc}")
    else:
        if exit_code != 0:
            problems.append("ffmpeg not working")

    return jsonify({"ok": not problems, "problems": problems})


@app.post("/transcribe")
def transcribe():
    """Transcribe one uploaded audio file with whisper.cpp.

    Expects multipart/form-data with a ``file`` field. The upload is converted
    by ffmpeg to a WAV file (sample rate ``WAV_AR``, ``WAV_AC`` channels,
    16-bit PCM) and handed to ``WHISPER_BIN``, which is asked to write a
    ``.txt`` transcript; that file is read back and returned.

    All intermediate files live in one private temporary directory that is
    removed before the response is returned, whatever the outcome.

    Responses (all JSON):
        200 ``{"text": "..."}`` on success.
        200 with ``"text": ""`` plus ``note`` / ``returncode`` / ``stdout`` /
            ``stderr`` / ``cmd`` when the transcript is empty or missing.
        400 when the ``file`` field is missing or has no filename.
        500 when ffmpeg exits non-zero, or on any unexpected exception.
        504 when ffmpeg or whisper exceeds its timeout.
    """
    if "file" not in request.files:
        return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400

    up = request.files["file"]
    if not up.filename:
        return jsonify({"error": "empty filename"}), 400

    # Only the extension of the client-supplied name is reused; the rest of
    # the on-disk name is fixed, so the client cannot steer where we write.
    suffix = os.path.splitext(up.filename)[1] or ".bin"

    workdir = None
    try:
        # One private directory holds the upload, the WAV and whatever output
        # files whisper creates, so there is no predictable-name race in the
        # shared temp dir and cleanup is a single rmtree.
        workdir = tempfile.mkdtemp(prefix="whisper-")
        in_path = os.path.join(workdir, "input" + suffix)
        wav_path = os.path.join(workdir, "converted.wav")
        out_base = os.path.join(workdir, "transcript")  # whisper appends ".txt"

        up.save(in_path)

        # 1) Convert input -> wav
        ffmpeg_cmd = [
            "ffmpeg",
            "-hide_banner",
            "-loglevel", "error",
            "-y",
            "-i", in_path,
            "-ar", WAV_AR,
            "-ac", WAV_AC,
            "-c:a", "pcm_s16le",
            wav_path,
        ]
        ff_code, _ff_out, ff_err = _run(ffmpeg_cmd, timeout=120)
        if ff_code != 0:
            return jsonify({
                "error": "ffmpeg conversion failed",
                "returncode": ff_code,
                "stderr": ff_err[-4000:],
                "cmd": ffmpeg_cmd,
            }), 500

        # 2) Run whisper.cpp
        # Flags vary slightly by build; these are commonly supported by the
        # whisper.cpp CLI.
        whisper_cmd = [
            WHISPER_BIN,
            "-m", MODEL_PATH,
            "-f", wav_path,
            "-otxt",
            "-of", out_base,
            "-nt",
        ]
        if FORCE_LANG:
            whisper_cmd += ["-l", FORCE_LANG]

        w_code, w_out, w_err = _run(whisper_cmd, timeout=300)

        # A missing transcript file is treated like an empty transcript.
        # errors="replace" keeps a stray undecodable byte from turning a
        # finished transcription into a 500.
        try:
            with open(out_base + ".txt", "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()
        except FileNotFoundError:
            text = ""

        # If empty, return diagnostics (deliberately not hidden behind a 500)
        if not text:
            return jsonify({
                "text": "",
                "note": "empty transcript; returning diagnostics",
                "returncode": w_code,
                "stdout": w_out[-4000:],
                "stderr": w_err[-4000:],
                "cmd": whisper_cmd,
            }), 200

        return jsonify({"text": text}), 200

    except subprocess.TimeoutExpired:
        return jsonify({"error": "timeout"}), 504
    except Exception as e:
        return jsonify({"error": f"server exception: {e}"}), 500
    finally:
        # Best-effort cleanup: a failure to delete must not mask the response.
        if workdir is not None:
            shutil.rmtree(workdir, ignore_errors=True)


if __name__ == "__main__":
    # Script entry point: bind to all interfaces so other machines on the
    # LAN can reach the service on port 5005.
    app.run(port=5005, host="0.0.0.0")