"""
Local Reranker Server
=====================

Two-stage reranking architecture optimized for small GPUs.

Pipeline:

1) FlashRank CPU reranker (fast filtering)
2) GPU cross-encoder reranker (accurate final ranking)

Features
--------

✓ CPU-first architecture (safe for weak GPUs)
✓ GPU reranking when available
✓ automatic CUDA fallback
✓ VRAM-aware routing
✓ batching for low VRAM GPUs
✓ safe FlashRank integration
✓ FastAPI REST interface
✓ health endpoint

Default API
-----------

POST /rerank

{
    "query": "...",
    "documents": ["doc1", "doc2"],
    "top_k": 5
}

Server
------

http://localhost:5200
"""
from typing import List

import torch
from fastapi import FastAPI
from flashrank import Ranker, RerankRequest
from pydantic import BaseModel
from sentence_transformers import CrossEncoder
# --------------------------------------------------
# Configuration
# --------------------------------------------------

# TCP port the server is documented to listen on.
PORT = 5200

# Number of stage-1 (FlashRank) candidates forwarded to stage 2.
FIRST_STAGE_TOP_K = 10

# Cross-encoder batch size; kept small for low-VRAM GPUs.
BATCH_SIZE = 8

# Token truncation length for the GPU cross-encoder.
MAX_LENGTH = 256

# Minimum free VRAM (GiB) required before attempting GPU inference.
MIN_GPU_MEMORY_GB = 1.5
# --------------------------------------------------
# Model initialization
# --------------------------------------------------

print("Loading FlashRank CPU model...")
cpu_ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")

print("Loading GPU cross-encoder model...")

gpu_available = torch.cuda.is_available()
gpu_model = None

if not gpu_available:
    print("CUDA not available, running CPU-only")
else:
    # Loading the cross-encoder can fail on broken CUDA setups; fall back
    # to CPU-only operation instead of crashing at startup.
    try:
        gpu_model = CrossEncoder(
            "cross-encoder/ms-marco-MiniLM-L-6-v2",
            device="cuda",
            max_length=MAX_LENGTH,
        )
        print("GPU reranker loaded successfully")
    except Exception as e:
        print("GPU initialization failed, running CPU-only:", e)
        gpu_model = None
        gpu_available = False
# --------------------------------------------------
# API schema
# --------------------------------------------------

class RerankRequestModel(BaseModel):
    """Request body for POST /rerank."""

    # Search query the documents are scored against.
    query: str
    # Candidate documents to rerank.
    documents: List[str]
    # Number of top results to return (capped at len(documents) by the endpoint).
    top_k: int = 5
class RerankResult(BaseModel):
    """A single reranked document with its relevance score."""

    # Document text.
    text: str
    # Relevance score; higher means more relevant.
    score: float
class RerankResponse(BaseModel):
    """Response body for POST /rerank."""

    # Reranked documents, best first.
    results: List[RerankResult]
    # Backend that produced the final ordering: "cpu" or "gpu".
    backend: str
# --------------------------------------------------
# Utility functions
# --------------------------------------------------

def gpu_memory_available():
    """Return True when enough free VRAM exists to attempt GPU inference.

    Reads free memory on the current CUDA device and compares it against
    MIN_GPU_MEMORY_GB, preventing CUDA OOM on small GPUs.

    Returns False when CUDA was never available, and also when the memory
    query itself fails (e.g. the CUDA context has become unusable), so a
    broken GPU degrades to CPU-only instead of crashing the request path.
    """
    if not gpu_available:
        return False

    try:
        # mem_get_info() returns (free_bytes, total_bytes); it raises
        # RuntimeError if the CUDA context cannot be queried.
        free, _total = torch.cuda.mem_get_info()
    except RuntimeError:
        return False

    free_gb = free / (1024 ** 3)

    return free_gb > MIN_GPU_MEMORY_GB
# --------------------------------------------------
# Stage 1: FlashRank CPU
# --------------------------------------------------

def rerank_cpu_stage(query: str, docs: List[str]):
    """Score *docs* against *query* with FlashRank on CPU.

    FlashRank is extremely fast and filters the candidate documents
    before the (expensive) GPU reranking stage.

    Returns a list of (text, score) tuples in FlashRank's ranked order.
    """
    flashrank_request = RerankRequest(
        query=query,
        passages=[{"text": doc} for doc in docs],
    )

    hits = cpu_ranker.rerank(flashrank_request)

    return [
        (hit.get("text", ""), float(hit.get("score", 0)))
        for hit in hits
    ]
# --------------------------------------------------
# Stage 2: GPU cross-encoder
# --------------------------------------------------

def rerank_gpu_stage(query: str, docs: List[str]):
    """Re-score *docs* against *query* with the GPU cross-encoder.

    Runs in BATCH_SIZE chunks to stay within low-VRAM limits and returns
    (doc, score) tuples sorted best-first.
    """
    pending = [(query, doc) for doc in docs]
    scores = []

    # Consume the query/document pairs one batch at a time.
    while pending:
        batch, pending = pending[:BATCH_SIZE], pending[BATCH_SIZE:]
        for score in gpu_model.predict(batch).tolist():
            scores.append(score)

    return sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)
# --------------------------------------------------
# FastAPI app
# --------------------------------------------------

# Application object served by the ASGI server (see PORT above).
app = FastAPI(
    title="Local Two-Stage Reranker",
    description="FlashRank CPU + MiniLM GPU reranking",
    version="2.0"
)
@app.post("/rerank", response_model=RerankResponse)
def rerank(request: RerankRequestModel):
    """Two-stage rerank endpoint.

    Stage 1 (always): FlashRank on CPU filters the documents down to at
    most FIRST_STAGE_TOP_K candidates.
    Stage 2 (optional): the GPU cross-encoder re-scores those candidates
    when a CUDA model is loaded and enough VRAM is free; any CUDA failure
    falls back to the stage-1 ordering.

    Returns the top_k results plus the backend ("cpu" or "gpu") that
    produced the final ordering.
    """
    query = request.query
    docs = request.documents

    # Clamp top_k into [0, len(docs)]: without the max(), a negative
    # top_k would reach second_stage[:top_k] and slice from the wrong end.
    top_k = max(0, min(request.top_k, len(docs)))

    # Nothing to rank — skip both stages entirely.
    if not docs:
        return RerankResponse(results=[], backend="cpu")

    # --------------------------------------------------
    # Stage 1: FlashRank CPU filtering
    # --------------------------------------------------

    first_stage = rerank_cpu_stage(query, docs)

    # Keep only the best candidates for the expensive GPU stage.
    candidates = first_stage[:FIRST_STAGE_TOP_K]
    candidate_docs = [d for d, s in candidates]

    backend = "cpu"

    # --------------------------------------------------
    # Stage 2: GPU reranking (optional)
    # --------------------------------------------------

    if gpu_model and gpu_memory_available():
        try:
            second_stage = rerank_gpu_stage(query, candidate_docs)
            backend = "gpu"
        except (torch.cuda.OutOfMemoryError, RuntimeError):
            # CUDA failed mid-inference: free cached blocks and fall back
            # to the CPU-stage ordering rather than failing the request.
            print("CUDA failure -> using CPU stage results")
            torch.cuda.empty_cache()
            second_stage = candidates
            backend = "cpu"
    else:
        second_stage = candidates

    # --------------------------------------------------
    # Final result selection
    # --------------------------------------------------

    final = second_stage[:top_k]

    results = [
        RerankResult(text=d, score=float(s))
        for d, s in final
    ]

    return RerankResponse(
        results=results,
        backend=backend
    )
# --------------------------------------------------
# Health endpoint
# --------------------------------------------------

@app.get("/health")
def health():
    """Liveness probe: reports CUDA availability and GPU model state."""
    payload = {"status": "ok"}
    payload["cuda_available"] = torch.cuda.is_available()
    payload["gpu_model_loaded"] = gpu_model is not None
    return payload