hace 2 meses · 9a4c7191ca
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,42 @@
 
				+FROM python:3.11-slim
				+
				+# Install system dependencies (build toolchain for whisper.cpp, ffmpeg for
				+# audio conversion, git/curl for fetching sources and the model).
				+# --no-install-recommends keeps optional packages out of the image.
				+# ca-certificates is listed explicitly so the HTTPS downloads (git clone,
				+# model download) do not depend on it being pulled in as a recommended package.
				+# The apt lists are removed in the same layer so they never reach the image.
				+RUN apt-get update && apt-get install -y --no-install-recommends \
				+    build-essential \
				+    ca-certificates \
				+    cmake \
				+    curl \
				+    ffmpeg \
				+    git \
				+    && rm -rf /var/lib/apt/lists/*
			
 
				+
			
 
				+# All following instructions run relative to /app (created if missing)
				+WORKDIR /app
				+
				+# Bring in only the dependency manifest, so the install layer below is
				+# reused until requirements.txt itself changes
				+COPY requirements.txt ./
				+
				+# Install the Python packages without keeping pip's download cache
				+RUN python -m pip install --no-cache-dir --requirement requirements.txt
			
 
				+
			
 
				+# Clone whisper.cpp.
				+# --depth 1 fetches only the checked-out tree, which is all the build needs.
				+# WHISPER_CPP_REF defaults to the upstream development branch; pass a release
				+# tag (docker build --build-arg WHISPER_CPP_REF=<tag>) for reproducible builds.
				+# It must be a branch or tag name: git clone --branch does not accept a commit SHA.
				+ARG WHISPER_CPP_REF=master
				+RUN git clone --depth 1 --branch "${WHISPER_CPP_REF}" \
				+    https://github.com/ggerganov/whisper.cpp.git
				+
				+# Build whisper.cpp.
				+# CMAKE_BUILD_TYPE is set at configure time because single-config generators
				+# (Makefiles) ignore the --config flag; -j uses all available cores.
				+RUN cmake -S whisper.cpp -B whisper.cpp/build -DCMAKE_BUILD_TYPE=Release \
				+    && cmake --build whisper.cpp/build --config Release -j "$(nproc)"
				+
				+# Download model
				+RUN bash whisper.cpp/models/download-ggml-model.sh small
				+
				+# Copy server.
				+# Done after the clone/build/model steps: server.py changes most often, and
				+# copying it earlier would invalidate the cache for those expensive layers.
				+COPY server.py .
			
 
				+
			
 
				+# Locations of the whisper.cpp binary and model, read by server.py
				+ENV WHISPER_BIN=/app/whisper.cpp/build/bin/whisper-cli \
				+    MODEL_PATH=/app/whisper.cpp/models/ggml-small.bin
				+
				+# Port the API listens on (documentation only; publish it with -p at run time)
				+EXPOSE 5005/tcp
			
 
				+
			
 
				+# Drop root before starting the service: it accepts arbitrary uploads and
				+# runs ffmpeg on them, so it should not hold root inside the container.
				+# A fixed numeric UID/GID lets orchestrators verify the non-root user.
				+# Only the /app directory itself (not its contents) is handed to that user,
				+# so the process can still create scratch files in its working directory.
				+RUN groupadd --system --gid 10001 whisper \
				+    && useradd --system --uid 10001 --gid whisper --no-create-home whisper \
				+    && chown whisper:whisper /app
				+USER whisper
				+
				+# Run with gunicorn.
				+# --timeout is raised from gunicorn's 30 s default because a sync worker is
				+# killed when a single request (one full transcription) runs longer than that.
				+CMD ["gunicorn", "-w", "2", "-b", "0.0.0.0:5005", "--timeout", "300", "server:app"]
			
--- a/README.md
+++ b/README.md
@@ -1,3 +1,319 @@
 
				-# Whisper STT Server
			
 
				+# Whisper Transcription HTTP Server
			
 
				 
			
 
				-this first flask version works for me.
			
 
				+A minimal HTTP API for **speech-to-text transcription** built with:
			
 
				+
			
 
				+* **Flask**
			
 
				+* **whisper.cpp**
			
 
				+* **ffmpeg**
			
 
				+
			
 
				+The service accepts audio uploads and returns a **single transcription result** using the Whisper model.
			
 
				+
			
 
				+It is designed to be:
			
 
				+
			
 
				+* simple
			
 
				+* reliable
			
 
				+* easy to containerize
			
 
				+* suitable for internal automation pipelines
			
 
				+
			
 
				+Typical use cases include:
			
 
				+
			
 
				+* voice message transcription
			
 
				+* voice assistant pipelines
			
 
				+* transcription preprocessing
			
 
				+* automation workflows (Node-RED, etc.)
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Architecture
			
 
				+
			
 
				+The server works in four stages:
			
 
				+
			
 
				+1. **Upload audio**
			
 
				+
			
 
				+   A client sends a `multipart/form-data` POST request containing an audio file.
			
 
				+
			
 
				+2. **Convert audio**
			
 
				+
			
 
				+   The server converts the audio into **16 kHz mono WAV** using `ffmpeg`.
			
 
				+   This ensures compatibility and stable input for Whisper.
			
 
				+
			
 
				+3. **Transcribe**
			
 
				+
			
 
				+   The server calls the **whisper.cpp CLI** (`whisper-cli`) with a specified model.
			
 
				+
			
 
				+4. **Return text**
			
 
				+
			
 
				+   The transcription result is returned as JSON.
			
 
				+
			
 
				+If the transcription result is empty, the server returns **diagnostic information** to help with debugging.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# API
			
 
				+
			
 
				+## Health Check
			
 
				+
			
 
				+```
			
 
				+GET /health
			
 
				+```
			
 
				+
			
 
				+Returns server status and verifies:
			
 
				+
			
 
				+* whisper binary exists
			
 
				+* model file exists
			
 
				+* ffmpeg is available
			
 
				+
			
 
				+Example response:
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "ok": true,
			
 
				+  "problems": []
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Transcribe Audio
			
 
				+
			
 
				+```
			
 
				+POST /transcribe
			
 
				+```
			
 
				+
			
 
				+### Request
			
 
				+
			
 
				+`multipart/form-data`
			
 
				+
			
 
				+Field name:
			
 
				+
			
 
				+```
			
 
				+file
			
 
				+```
			
 
				+
			
 
				+Example:
			
 
				+
			
 
				+```
			
 
				+file=@audio.wav
			
 
				+```
			
 
				+
			
 
				+Supported formats (handled by ffmpeg):
			
 
				+
			
 
				+* wav
			
 
				+* mp3
			
 
				+* ogg
			
 
				+* m4a
			
 
				+* most other common audio formats
			
 
				+
			
 
				+---
			
 
				+
			
 
				+### Response
			
 
				+
			
 
				+Success:
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "text": "Hello this is a transcription."
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+If the transcription is empty, the server returns diagnostics:
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "text": "",
			
 
				+  "note": "empty transcript; returning diagnostics",
			
 
				+  "stdout": "...",
			
 
				+  "stderr": "...",
			
 
				+  "cmd": [...]
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Project Structure
			
 
				+
			
 
				+```
			
 
				+.
			
 
				+├── server.py
			
 
				+├── requirements.txt
			
 
				+├── Dockerfile
			
 
				+└── README.md
			
 
				+```
			
 
				+
			
 
				+At runtime the container will also contain:
			
 
				+
			
 
				+```
			
 
				+/app/whisper.cpp
			
 
				+/app/whisper.cpp/build/bin/whisper-cli
			
 
				+/app/whisper.cpp/models/ggml-small.bin
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Dependencies
			
 
				+
			
 
				+Runtime components:
			
 
				+
			
 
				+* Python 3.11
			
 
				+* Flask
			
 
				+* Gunicorn
			
 
				+* ffmpeg
			
 
				+* whisper.cpp
			
 
				+* Whisper model (`ggml-small.bin`)
			
 
				+
			
 
				+The Docker image builds whisper.cpp automatically.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Configuration
			
 
				+
			
 
				+The server reads these environment variables:
			
 
				+
			
 
				+```
			
 
				+WHISPER_BIN
			
 
				+MODEL_PATH
			
 
				+```
			
 
				+
			
 
				+Defaults inside the container:
			
 
				+
			
 
				+```
			
 
				+WHISPER_BIN=/app/whisper.cpp/build/bin/whisper-cli
			
 
				+MODEL_PATH=/app/whisper.cpp/models/ggml-small.bin
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Build Docker Image
			
 
				+
			
 
				+From the project directory:
			
 
				+
			
 
				+```bash
			
 
				+docker build -t whisper-api .
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Run Container
			
 
				+
			
 
				+```
			
 
				+docker run -d -it -p 5005:5005 --name whisper-server whisper-api
			
 
				+```
			
 
				+
			
 
				+The API will be available at:
			
 
				+
			
 
				+```
			
 
				+http://localhost:5005
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Test the Server
			
 
				+
			
 
				+### Health Check
			
 
				+
			
 
				+```
			
 
				+curl http://localhost:5005/health
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+### Transcribe Audio
			
 
				+
			
 
				+```
			
 
				+curl -X POST \
			
 
				+  -F "file=@test.wav" \
			
 
				+  http://localhost:5005/transcribe
			
 
				+```
			
 
				+
			
 
				+Example response:
			
 
				+
			
 
				+```
			
 
				+{"text":"hello this is a test"}
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Development (Without Docker)
			
 
				+
			
 
				+Create a Python virtual environment:
			
 
				+
			
 
				+```
			
 
				+python -m venv venv
			
 
				+source venv/bin/activate
			
 
				+```
			
 
				+
			
 
				+Install dependencies:
			
 
				+
			
 
				+```
			
 
				+pip install -r requirements.txt
			
 
				+```
			
 
				+
			
 
				+Run server:
			
 
				+
			
 
				+```
			
 
				+python server.py
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Model Choice
			
 
				+
			
 
				+The default Docker build downloads:
			
 
				+
			
 
				+```
			
 
				+ggml-small.bin
			
 
				+```
			
 
				+
			
 
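				+You can switch models by modifying the Dockerfile: change the model name passed to `download-ggml-model.sh` and update `MODEL_PATH` to the matching `ggml-<model>.bin` file.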
			
 
				+
			
 
				+| Model  | Speed     | Accuracy |
			
 
				+| ------ | --------- | -------- |
			
 
				+| tiny   | very fast | low      |
			
 
				+| base   | fast      | moderate |
			
 
				+| small  | balanced  | good     |
			
 
				+| medium | slow      | high     |
			
 
				+| large  | very slow | best     |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Performance Notes
			
 
				+
			
 
				+* whisper.cpp runs **fully on CPU**
			
 
				+* transcription speed depends on:
			
 
				+
			
 
				+  * CPU cores
			
 
				+  * CPU vector extensions (AVX/AVX2)
			
 
				+  * model size
			
 
				+
			
 
				+Typical small-model performance on modern CPUs:
			
 
				+
			
 
				+```
			
 
				+~0.5x – 2x realtime
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# Security Notes
			
 
				+
			
 
				+This server:
			
 
				+
			
 
				+* accepts arbitrary audio uploads
			
 
				+* runs ffmpeg on them
			
 
				+
			
 
				+For production deployments consider:
			
 
				+
			
 
				+* reverse proxy (nginx / traefik)
			
 
				+* request size limits
			
 
				+* authentication
			
 
				+* rate limiting
			
 
				+
			
 
				+---
			
 
				+
			
 
				+# License
			
 
				+
			
 
				+This project uses:
			
 
				+
			
 
				+* **whisper.cpp** — MIT License
			
 
				+* **Flask** — BSD License
			
 
				+
			
 
				+See their respective repositories for details.
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 
				+flask
			
 
				+gunicorn
			
--- a/server.py
+++ b/server.py
@@ -22,8 +22,18 @@ import tempfile
 
				 app = Flask(__name__)
			
 
				 
			
 
				 # Adjust these paths for your machine:
			
 
				-WHISPER_BIN = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")  # e.g. ~/whisper.cpp/main or ~/whisper.cpp/build/bin/whisper-cli
			
 
				-MODEL_PATH = os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
			
 
				+# Both paths can be overridden through the WHISPER_BIN / MODEL_PATH environment variables (the Docker image sets both).
			
 
				+# The fallbacks below assume whisper.cpp lives in the home directory; the binary may also be ~/whisper.cpp/main.
			
 
				+
			
 
				+WHISPER_BIN = os.environ.get(
			
 
				+    "WHISPER_BIN",
			
 
				+    os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
			
 
				+)
			
 
				+
			
 
				+MODEL_PATH = os.environ.get(
			
 
				+    "MODEL_PATH",
			
 
				+    os.path.expanduser("~/whisper.cpp/models/ggml-small.bin")
			
 
				+)
			
 
				 
			
 
				 # Language to force; set to None to let whisper auto-detect (I recommend forcing for reliability)
			
 
				 FORCE_LANG = "en"   # "en" or "de" etc.