dockerized

Lukas Goldschmidt, 3 days ago
Parent commit 8954fd7bed
2 changed files with 83 additions and 85 deletions
  1. 38 0
      Dockerfile
  2. 45 85
      README.md

+ 38 - 0
Dockerfile

@@ -0,0 +1,38 @@
+# ── Base image ────────────────────────────────────────────────────────────────
+# Use CUDA-enabled image so the container can use GPU if available.
+# Falls back to CPU automatically (same as bare-metal behaviour).
+FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime
+
+# ── System deps ───────────────────────────────────────────────────────────────
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        git \
+        curl \
+        build-essential \
+        cmake \
+    && rm -rf /var/lib/apt/lists/*
+
+# ── Working directory ─────────────────────────────────────────────────────────
+WORKDIR /app
+
+# ── Python dependencies ───────────────────────────────────────────────────────
+COPY requirements.txt .
+RUN pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+
+# ── Application code ──────────────────────────────────────────────────────────
+COPY reranker_server.py .
+
+# ── HuggingFace cache ─────────────────────────────────────────────────────────
+# Point HF libraries at a path inside /app/hf_cache so it can be bind-mounted
+# from the host directory, making model downloads persist across container runs.
+ENV HF_HOME=/app/hf_cache
+ENV SENTENCE_TRANSFORMERS_HOME=/app/hf_cache
+
+# Create the directory in the image as a fallback (overridden by the mount).
+RUN mkdir -p /app/hf_cache
+
+# ── Expose port ───────────────────────────────────────────────────────────────
+EXPOSE 5200
+
+# ── Entrypoint ────────────────────────────────────────────────────────────────
+CMD ["uvicorn", "reranker_server:app", "--host", "0.0.0.0", "--port", "5200"]

+ 45 - 85
README.md

@@ -1,14 +1,9 @@
 # Local Two-Stage Reranker Server
-
 A lightweight **self-hosted reranking service** optimized for small GPUs and CPU fallback.
-
 The server exposes a simple REST API that reranks documents for a given query.
 It is designed to integrate easily with **RAG pipelines**, **mem0**, **LangChain**, or custom retrieval systems.
-
 ---
-
 # Features
-
 * Two-stage reranking pipeline
 * Fast **CPU FlashRank filtering**
 * Accurate **MiniLM GPU cross-encoder**
@@ -19,13 +14,10 @@ It is designed to integrate easily with **RAG pipelines**, **mem0**, **LangChain
 * Works fully **CPU-only**
 * Simple **FastAPI REST interface**
 * Built-in **health endpoint**
-
+* **Docker support** with persistent model cache
 ---
-
 # Architecture
-
 The server uses a **two-stage reranking pipeline** to reduce GPU load while maintaining high ranking quality.
-
 ```
 incoming documents
@@ -41,108 +33,117 @@ MiniLM cross-encoder (GPU if available)
 final ranking
 ```
-
 Advantages:
-
 * ~70-90% less GPU usage
 * very low latency
 * stable operation on small GPUs
 * safe fallback when GPU memory is unavailable
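The two-stage flow above can be pictured in a few lines of Python. This is a sketch, not the server's actual code: the trivial `overlap` scorer stands in for both FlashRank (CPU) and the MiniLM cross-encoder (GPU), and all names are illustrative.

```python
# Sketch of a two-stage pipeline: a cheap scorer prunes the candidate set
# before an expensive scorer produces the final ranking.

def two_stage_rerank(query, documents, cheap_score, expensive_score,
                     stage1_keep=10, top_k=5):
    # Stage 1: score everything with the cheap model, keep the best few.
    stage1 = sorted(documents, key=lambda d: cheap_score(query, d), reverse=True)
    survivors = stage1[:stage1_keep]
    # Stage 2: rescore only the survivors with the expensive model.
    stage2 = sorted(survivors, key=lambda d: expensive_score(query, d), reverse=True)
    return stage2[:top_k]

# Toy scorer: word overlap between query and document.
def overlap(q, d):
    return len(set(q.split()) & set(d.split()))

docs = ["a reranker orders documents", "bananas are yellow",
        "rerank documents by query"]
print(two_stage_rerank("rerank documents", docs, overlap, overlap,
                       stage1_keep=2, top_k=1))
```

The point of the split is that `expensive_score` only ever sees `stage1_keep` documents, which is where the GPU savings come from.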
-
 ---
-
 # Project Structure
-
 ```
 reranker-server/
 ├── reranker_server.py
+├── Dockerfile
 ├── requirements.txt
 └── README.md
 ```
-
 ---
-
 # Installation
-
 ## 1. Clone the repository
-
 ```
 git clone <repo-url>
 cd reranker-server
 ```
-
 ---
-
 ## 2. Create a Python environment
-
 ```
 python3 -m venv venv
 ```
-
 Activate the environment.
-
 Linux / macOS:
-
 ```
 source venv/bin/activate
 ```
-
 Windows:
-
 ```
 venv\Scripts\activate
 ```
-
 ---
-
 ## 3. Install dependencies
-
 ```
 pip install --upgrade pip
 pip install -r requirements.txt
 ```
-
 Required packages:
-
 * fastapi
 * uvicorn
 * sentence-transformers
 * flashrank
 * torch
-
 ---
-
 # Running the Server
-
 Start the server with:
-
 ```
 uvicorn reranker_server:app --host 0.0.0.0 --port 5200
 ```
-
 Server will run at:
-
 ```
 http://localhost:5200
 ```
-
 Interactive API docs:
-
 ```
 http://localhost:5200/docs
 ```
+---
+# Docker
+## Build the image
+From the repository root:
+```
+docker build -t reranker-server .
+```
+---
+## Run the container (CPU only)
+```
+docker run -d \
+  --name reranker-server \
+  -p 5200:5200 \
+  -v "$(pwd)/hf_cache:/app/hf_cache" \
+  reranker-server
+```
+
+The `hf_cache` bind-mount maps the local `./hf_cache` directory into the container.
+Downloaded models are written there and reused on every subsequent start — no re-downloading required.
 
 ---
+## Run the container (GPU)
+Requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) to be installed on the host.
+```
+docker run -d \
+  --name reranker-server \
+  --gpus all \
+  -p 5200:5200 \
+  -v "$(pwd)/hf_cache:/app/hf_cache" \
+  reranker-server
+```
+Pass `--gpus '"device=0"'` instead of `--gpus all` to pin a specific GPU.
 
+---
+## Stop / remove the container
+```
+docker stop reranker-server
+docker rm reranker-server
+```
+---
+## Rebuild after code changes
+```
+docker build --no-cache -t reranker-server .
+```
+---
 # API
-
 ## POST `/rerank`
-
 Rerank a list of documents for a query.
-
 ### Request
-
 ```
 {
   "query": "What is a reranker?",
@@ -154,9 +155,7 @@ Rerank a list of documents for a query.
   "top_k": 2
 }
 ```
-
 ### Response
-
 ```
 {
   "results": [
@@ -172,22 +171,15 @@ Rerank a list of documents for a query.
   "backend": "gpu"
 }
 ```
-
 Field `backend` indicates which system produced the final ranking:
-
 * `gpu` → cross-encoder used
 * `cpu` → FlashRank only
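A minimal Python client for the endpoint above can be sketched with the standard library alone. The payload shape follows the request example; the helper name is illustrative, not part of the server.

```python
import json
import urllib.request

def build_rerank_request(query, documents, top_k):
    # Assemble the JSON body in the shape of the request example above.
    body = json.dumps({"query": query, "documents": documents, "top_k": top_k})
    return urllib.request.Request(
        "http://localhost:5200/rerank",
        data=body.encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

req = build_rerank_request("What is a reranker?", ["doc one", "doc two"], 2)
# With the server from the Running section up, send it like this:
#   with urllib.request.urlopen(req) as resp:
#       print(json.load(resp)["results"])
```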
-
 ---
-
 # Health Check
-
 ```
 GET /health
 ```
-
 Example response:
-
 ```
 {
   "status": "ok",
@@ -195,41 +187,26 @@ Example response:
   "gpu_model_loaded": true
 }
 ```
-
 ---
-
 # GPU Usage
-
 The server automatically detects CUDA.
-
 GPU reranking is used only when:
-
 * CUDA is available
 * sufficient VRAM is free
 * no CUDA errors occur
-
 If GPU fails, the system **falls back to CPU automatically**.
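The fallback behaviour amounts to a try/except around the GPU path. A sketch under that assumption (function names are illustrative, not the server's actual code):

```python
def rerank_with_fallback(gpu_rerank, cpu_rerank, query, documents):
    # Try the GPU cross-encoder first; any CUDA/VRAM error drops us to CPU.
    try:
        return gpu_rerank(query, documents), "gpu"
    except Exception:
        return cpu_rerank(query, documents), "cpu"

# With a failing GPU path, the CPU result and backend tag come back instead.
def broken_gpu(q, docs):
    raise RuntimeError("CUDA out of memory")

results, backend = rerank_with_fallback(broken_gpu, lambda q, d: d,
                                        "q", ["a", "b"])
```

The second element of the return value is what surfaces as the `backend` field in the API response.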
-
 ---
-
 # Recommended RAG Settings
-
 Typical pipeline:
-
 ```
 vector search → top 20 documents
 reranker → top 5 documents
 LLM context
 ```
-
 This keeps latency low while improving retrieval quality.
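Wired into a retrieval pipeline, those numbers translate to something like the following (stub retrieval and reranking functions, purely illustrative):

```python
def rag_context(query, vector_search, rerank, retrieve_n=20, keep_n=5):
    # Over-retrieve, then let the reranker pick the best few for the LLM.
    candidates = vector_search(query, limit=retrieve_n)
    return rerank(query, candidates)[:keep_n]

# Stub stages: retrieval returns numbered docs, reranking reverses them.
search = lambda q, limit: [f"doc-{i}" for i in range(limit)]
rerank = lambda q, docs: list(reversed(docs))
print(rag_context("example query", search, rerank, retrieve_n=6, keep_n=2))
```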
-
 ---
-
 # Testing the Server
-
 Example curl request:
-
 ```
 curl http://localhost:5200/rerank \
   -X POST \
@@ -243,46 +220,29 @@ curl http://localhost:5200/rerank \
     "top_k": 1
   }'
 ```
-
 ---
-
 # Model Information
-
 ### CPU Stage
-
 FlashRank model:
-
 ```
 ms-marco-MiniLM-L-12-v2
 ```
-
 Fast ONNX reranker optimized for CPU.
-
 ### GPU Stage
-
 Cross-encoder model:
-
 ```
 cross-encoder/ms-marco-MiniLM-L-6-v2
 ```
-
 Widely used semantic reranker with good performance.
-
 ---
-
 # Future Improvements
-
 Possible enhancements:
-
 * result caching
-* Docker container
+* Docker Compose support
 * batch reranking API
 * integration with mem0
 * Prometheus metrics
 * request logging
-
 ---
-
 # License
-
 MIT License