server.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. #!/usr/bin/env python3
  2. """
  3. Minimal whisper.cpp HTTP server (one-shot transcription)
  4. POST /transcribe
  5. multipart/form-data with field: file=@audio.(ogg|mp3|wav|m4a|...)
  6. Returns:
  7. { "text": "..." }
  8. If transcription is empty, it returns diagnostics (stdout/stderr/cmd) to help debug.
  9. Notes:
  10. - This version forces English (-l en). Change to "de" if needed.
  11. - It converts any input to 16kHz mono WAV via ffmpeg for robustness.
  12. """
  13. from flask import Flask, request, jsonify
  14. import subprocess
  15. import os
  16. import tempfile
  app = Flask(__name__)

  # Locations of the whisper.cpp CLI binary and the ggml model file.
  # Override with the WHISPER_BIN / MODEL_PATH environment variables; the
  # defaults assume a checkout in ~/whisper.cpp
  # (e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli).
  WHISPER_BIN = os.environ.get(
      "WHISPER_BIN",
      os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli"),
  )
  MODEL_PATH = os.environ.get(
      "MODEL_PATH",
      os.path.expanduser("~/whisper.cpp/models/ggml-small.bin"),
  )

  # Language passed to whisper via -l ("en", "de", ...). Forcing a language is
  # recommended for reliability. Defaults to "en"; override with the FORCE_LANG
  # environment variable, or set it to an empty string to get None, in which
  # case no -l flag is passed and whisper picks the language itself.
  FORCE_LANG = os.environ.get("FORCE_LANG", "en") or None

  # ffmpeg settings for stable whisper input
  WAV_AR = "16000"  # value for ffmpeg -ar (sample rate in Hz)
  WAV_AC = "1"  # value for ffmpeg -ac (number of audio channels)
  def _run(cmd, timeout=300):
      """Execute *cmd* and return ``(returncode, stdout, stderr)``.

      Both output streams are captured and decoded as UTF-8; bytes that are
      not valid UTF-8 are dropped. ``subprocess.TimeoutExpired`` propagates
      to the caller when the command runs longer than *timeout* seconds.
      """

      def _as_text(raw):
          # Diagnostics only need to be readable, so undecodable bytes are skipped.
          return raw.decode("utf-8", errors="ignore")

      finished = subprocess.run(
          cmd,
          stdout=subprocess.PIPE,
          stderr=subprocess.PIPE,
          timeout=timeout,
      )
      return finished.returncode, _as_text(finished.stdout), _as_text(finished.stderr)
  @app.get("/health")
  def health():
      """Report whether the server's external dependencies look usable.

      Checks that the WHISPER_BIN and MODEL_PATH paths exist and that
      ``ffmpeg -version`` runs and exits with code 0. Responds with JSON
      ``{"ok": bool, "problems": [str, ...]}``; ``ok`` is true only when no
      problem was recorded.
      """
      problems = []

      if not os.path.exists(WHISPER_BIN):
          problems.append(f"WHISPER_BIN not found: {WHISPER_BIN}")
      if not os.path.exists(MODEL_PATH):
          problems.append(f"MODEL_PATH not found: {MODEL_PATH}")

      # Check ffmpeg availability
      try:
          exit_code, _stdout, _stderr = _run(["ffmpeg", "-version"], timeout=10)
      except Exception as e:
          # Covers a missing executable as well as a timeout.
          problems.append(f"ffmpeg check failed: {e}")
      else:
          if exit_code != 0:
              problems.append("ffmpeg not working")

      return jsonify({"ok": not problems, "problems": problems})
  @app.post("/transcribe")
  def transcribe():
      """Transcribe one uploaded audio file with whisper.cpp.

      Expects a multipart/form-data POST whose ``file`` field holds the audio.
      The upload is converted by ffmpeg to a PCM WAV (sample rate WAV_AR,
      WAV_AC channels) and then passed to WHISPER_BIN with MODEL_PATH.

      Returns a ``(json, status)`` pair:
        * 200 ``{"text": "..."}`` on success;
        * 200 with ``"text": ""`` plus whisper's return code, the tails of its
          stdout/stderr and the command line when the transcript is empty;
        * 400 when the ``file`` field is missing or has no filename;
        * 500 when ffmpeg fails or an unexpected exception occurs;
        * 504 when ffmpeg or whisper exceeds its timeout.
      """
      # Local import: shutil is not among the module-level imports.
      import shutil

      if "file" not in request.files:
          return jsonify({"error": "no file field; expected multipart/form-data with field 'file'"}), 400
      up = request.files["file"]
      if not up.filename:
          return jsonify({"error": "empty filename"}), 400

      workdir = None
      try:
          # All scratch files live in one private per-request directory, so a
          # single rmtree removes everything (including any extra outputs
          # whisper writes), nothing leaks if saving the upload fails, and no
          # file name has to be reserved by creating and unlinking a temp file.
          workdir = tempfile.mkdtemp(prefix="whisper-")

          # Keep the upload's extension as a format hint for ffmpeg. splitext
          # never returns a path separator in the extension, so the file
          # stays inside workdir.
          in_path = os.path.join(
              workdir, "input" + (os.path.splitext(up.filename)[1] or ".bin")
          )
          wav_path = os.path.join(workdir, "audio.wav")
          # Base name handed to whisper via -of; whisper appends ".txt".
          out_base = os.path.join(workdir, "transcript")

          up.save(in_path)

          # 1) Convert input -> wav
          ffmpeg_cmd = [
              "ffmpeg",
              "-hide_banner",
              "-loglevel", "error",
              "-y",
              "-i", in_path,
              "-ar", WAV_AR,
              "-ac", WAV_AC,
              "-c:a", "pcm_s16le",
              wav_path,
          ]
          ff_code, _ff_out, ff_err = _run(ffmpeg_cmd, timeout=120)
          if ff_code != 0:
              return jsonify({
                  "error": "ffmpeg conversion failed",
                  "returncode": ff_code,
                  "stderr": ff_err[-4000:],
                  "cmd": ffmpeg_cmd,
              }), 500

          # 2) Run whisper.cpp
          # whisper.cpp flags vary slightly by build; these are commonly
          # supported in whisper.cpp "main"
          whisper_cmd = [
              WHISPER_BIN,
              "-m", MODEL_PATH,
              "-f", wav_path,
              "-otxt",
              "-of", out_base,
              "-nt",
          ]
          if FORCE_LANG:
              whisper_cmd += ["-l", FORCE_LANG]
          w_code, w_out, w_err = _run(whisper_cmd, timeout=300)

          txt_path = out_base + ".txt"
          text = ""
          if os.path.exists(txt_path):
              # errors="replace": bytes in the output file that are not valid
              # UTF-8 must not turn a finished transcription into a 500.
              with open(txt_path, "r", encoding="utf-8", errors="replace") as fh:
                  text = fh.read().strip()

          # If empty, return diagnostics (don't hide it behind a 500)
          if not text:
              return jsonify({
                  "text": "",
                  "note": "empty transcript; returning diagnostics",
                  "returncode": w_code,
                  "stdout": w_out[-4000:],
                  "stderr": w_err[-4000:],
                  "cmd": whisper_cmd,
              }), 200

          return jsonify({"text": text}), 200
      except subprocess.TimeoutExpired:
          return jsonify({"error": "timeout"}), 504
      except Exception as e:
          return jsonify({"error": f"server exception: {e}"}), 500
      finally:
          # Best-effort cleanup: a failure to delete scratch files must not
          # replace the response that was already chosen above.
          if workdir is not None:
              shutil.rmtree(workdir, ignore_errors=True)
  if __name__ == "__main__":
      # Bind to every interface so other machines on the LAN can reach the server.
      app.run(port=5005, host="0.0.0.0")