server.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. #!/usr/bin/env python3
  2. """
  3. Minimal whisper.cpp HTTP server (one-shot transcription)
  4. POST /transcribe
  5. multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
  6. Returns:
  7. { "text": "..." }
  8. If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.
  9. Notes:
  10. - This version forces English (-l en). Change to "de" if needed.
  11. - It converts any input to 16kHz mono WAV via ffmpeg for robustness.
  12. """
import os
import shutil
import subprocess
import tempfile

from flask import Flask, request, jsonify
  17. app = Flask(__name__)
  18. # Adjust these paths for your machine:
  19. WHISPER_BIN = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli") # e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli
  20. MODEL_PATH = os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
  21. # Language to force; set to None to let whisper auto-detect (I recommend forcing for reliability)
  22. FORCE_LANG = "en" # "en" or "de" etc.
  23. # ffmpeg settings for stable whisper input
  24. WAV_AR = "16000"
  25. WAV_AC = "1"
  26. def _run(cmd, timeout=300):
  27. """Run subprocess and return (returncode, stdout_str, stderr_str)."""
  28. r = subprocess.run(
  29. cmd,
  30. stdout=subprocess.PIPE,
  31. stderr=subprocess.PIPE,
  32. timeout=timeout,
  33. )
  34. return (
  35. r.returncode,
  36. r.stdout.decode("utf-8", errors="ignore"),
  37. r.stderr.decode("utf-8", errors="ignore"),
  38. )
  39. @app.get("/health")
  40. def health():
  41. ok = True
  42. problems = []
  43. if not os.path.exists(WHISPER_BIN):
  44. ok = False
  45. problems.append(f"WHISPER_BIN not found: {WHISPER_BIN}")
  46. if not os.path.exists(MODEL_PATH):
  47. ok = False
  48. problems.append(f"MODEL_PATH not found: {MODEL_PATH}")
  49. # Check ffmpeg availability
  50. try:
  51. code, out, err = _run(["ffmpeg", "-version"], timeout=10)
  52. if code != 0:
  53. ok = False
  54. problems.append("ffmpeg not working")
  55. except Exception as e:
  56. ok = False
  57. problems.append(f"ffmpeg check failed: {e}")
  58. return jsonify({"ok": ok, "problems": problems})
  59. @app.post("/transcribe")
  60. def transcribe():
  61. if "file" not in request.files:
  62. return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400
  63. up = request.files["file"]
  64. if not up.filename:
  65. return jsonify({"error": "empty filename"}), 400
  66. # Store uploaded audio to temp
  67. in_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(up.filename)[1] or ".bin")
  68. up.save(in_tmp.name)
  69. in_tmp.close()
  70. # Convert to WAV 16kHz mono (whisper-friendly)
  71. wav_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
  72. wav_tmp.close()
  73. # Where whisper.cpp will write output .txt (we set -of explicitly)
  74. out_base = tempfile.NamedTemporaryFile(delete=False)
  75. out_base.close()
  76. os.unlink(out_base.name) # we only want the path, whisper will create files
  77. try:
  78. # 1) Convert input -> wav
  79. ffmpeg_cmd = [
  80. "ffmpeg",
  81. "-hide_banner",
  82. "-loglevel", "error",
  83. "-y",
  84. "-i", in_tmp.name,
  85. "-ar", WAV_AR,
  86. "-ac", WAV_AC,
  87. "-c:a", "pcm_s16le",
  88. wav_tmp.name,
  89. ]
  90. ff_code, ff_out, ff_err = _run(ffmpeg_cmd, timeout=120)
  91. if ff_code != 0:
  92. return jsonify({
  93. "error": "ffmpeg conversion failed",
  94. "returncode": ff_code,
  95. "stderr": ff_err[-4000:],
  96. "cmd": ffmpeg_cmd,
  97. }), 500
  98. # 2) Run whisper.cpp
  99. # whisper.cpp flags vary slightly by build; these are commonly supported in whisper.cpp "main"
  100. whisper_cmd = [
  101. WHISPER_BIN,
  102. "-m", MODEL_PATH,
  103. "-f", wav_tmp.name,
  104. "-otxt",
  105. "-of", out_base.name,
  106. "-nt"
  107. ]
  108. if FORCE_LANG:
  109. whisper_cmd += ["-l", FORCE_LANG]
  110. w_code, w_out, w_err = _run(whisper_cmd, timeout=300)
  111. txt_path = out_base.name + ".txt"
  112. text = ""
  113. if os.path.exists(txt_path):
  114. with open(txt_path, "r", encoding="utf-8") as fh:
  115. text = fh.read().strip()
  116. # If empty, return diagnostics (don’t hide it behind a 500)
  117. if not text:
  118. return jsonify({
  119. "text": "",
  120. "note": "empty transcript; returning diagnostics",
  121. "returncode": w_code,
  122. "stdout": w_out[-4000:],
  123. "stderr": w_err[-4000:],
  124. "cmd": whisper_cmd,
  125. }), 200
  126. return jsonify({"text": text}), 200
  127. except subprocess.TimeoutExpired:
  128. return jsonify({"error": "timeout"}), 504
  129. except Exception as e:
  130. return jsonify({"error": f"server exception: {e}"}), 500
  131. finally:
  132. # cleanup
  133. for p in [in_tmp.name, wav_tmp.name, out_base.name + ".txt", out_base.name + ".json", out_base.name + ".srt", out_base.name + ".vtt"]:
  134. try:
  135. if os.path.exists(p):
  136. os.unlink(p)
  137. except Exception:
  138. pass
  139. if __name__ == "__main__":
  140. # Listen on LAN
  141. app.run(host="0.0.0.0", port=5005)