
Initial commit - working

Lukas Goldschmidt, 4 days ago
commit 38a118086b
4 changed files with 659 additions and 0 deletions

+ 83 - 0
.gitignore

@@ -0,0 +1,83 @@
+# Python bytecode
+
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Virtual environments
+
+venv/
+.env/
+.envrc
+.venv/
+
+# Python build artifacts
+
+build/
+dist/
+*.egg-info/
+.eggs/
+
+# Installer logs
+
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Test / coverage
+
+.coverage
+.coverage.*
+.cache
+.pytest_cache/
+
+# Logs
+
+*.log
+logs/
+
+# Jupyter
+
+.ipynb_checkpoints
+
+# IDE files
+
+.vscode/
+.idea/
+
+# OS files
+
+.DS_Store
+Thumbs.db
+
+# HuggingFace / ML model caches
+
+hf_cache/
+.hf_cache/
+.cache/huggingface/
+.cache/torch/
+.cache/
+
+# Torch model checkpoints
+
+*.pt
+*.pth
+*.bin
+
+# ONNX models (FlashRank)
+
+*.onnx
+
+# Local configuration
+
+.env.local
+.env.production
+.env.development
+
+# Docker
+
+docker-compose.override.yml
+
+# Runtime files
+
+*.pid
+*.sock

+ 288 - 0
README.md

@@ -0,0 +1,288 @@
+# Local Two-Stage Reranker Server
+
+A lightweight **self-hosted reranking service** optimized for small GPUs and CPU fallback.
+
+The server exposes a simple REST API that reranks documents for a given query.
+It is designed to integrate easily with **RAG pipelines**, **mem0**, **LangChain**, or custom retrieval systems.
+
+---
+
+# Features
+
+* Two-stage reranking pipeline
+* Fast **CPU FlashRank filtering**
+* Accurate **MiniLM GPU cross-encoder**
+* **Automatic GPU detection**
+* **VRAM safety check**
+* **CUDA OOM fallback**
+* Optimized for **small GPUs (e.g. GTX 1650 4GB)**
+* Works fully **CPU-only**
+* Simple **FastAPI REST interface**
+* Built-in **health endpoint**
+
+---
+
+# Architecture
+
+The server uses a **two-stage reranking pipeline** to reduce GPU load while maintaining high ranking quality.
+
+```
+incoming documents
+        │
+        ▼
+FlashRank CPU reranker
+        │
+        ▼
+top 10 candidates
+        │
+        ▼
+MiniLM cross-encoder (GPU if available)
+        │
+        ▼
+final ranking
+```
+
+Advantages:
+
+* ~70-90% less GPU work, since only the top candidates reach the cross-encoder
+* very low latency
+* stable operation on small GPUs
+* safe fallback when GPU memory is unavailable
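
The two-stage flow above can be sketched with placeholder scorers. Note that `fast_score` and `accurate_score` below are illustrative stand-ins for FlashRank and the MiniLM cross-encoder, not the real models:

```python
def fast_score(query: str, doc: str) -> float:
    # cheap stage-1 stand-in: simple term overlap
    q = set(query.lower().split())
    d = set(doc.lower().split())
    return len(q & d) / max(len(q), 1)

def accurate_score(query: str, doc: str) -> float:
    # stage-2 stand-in; in the real server this is the GPU cross-encoder
    return fast_score(query, doc)

def two_stage_rerank(query, docs, first_stage_top_k=10, top_k=5):
    # stage 1: cheap filter down to a small candidate pool
    stage1 = sorted(docs, key=lambda d: fast_score(query, d), reverse=True)
    candidates = stage1[:first_stage_top_k]
    # stage 2: expensive rerank over the small pool only
    stage2 = sorted(candidates, key=lambda d: accurate_score(query, d), reverse=True)
    return stage2[:top_k]

docs = ["rerankers sort documents", "cats are mammals", "reranking helps retrieval"]
print(two_stage_rerank("reranking documents", docs, first_stage_top_k=2, top_k=1))
```

The point of the split is that the expensive scorer only ever sees `first_stage_top_k` documents, regardless of how many come in.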
+
+---
+
+# Project Structure
+
+```
+reranker-server/
+│
+├── reranker_server.py
+├── requirements.txt
+└── README.md
+```
+
+---
+
+# Installation
+
+## 1. Clone the repository
+
+```
+git clone <repo-url>
+cd reranker-server
+```
+
+---
+
+## 2. Create a Python environment
+
+```
+python3 -m venv venv
+```
+
+Activate the environment.
+
+Linux / macOS:
+
+```
+source venv/bin/activate
+```
+
+Windows:
+
+```
+venv\Scripts\activate
+```
+
+---
+
+## 3. Install dependencies
+
+```
+pip install --upgrade pip
+pip install -r requirements.txt
+```
+
+Required packages:
+
+* fastapi
+* uvicorn
+* sentence-transformers
+* flashrank
+* torch
+
+---
+
+# Running the Server
+
+Start the server with:
+
+```
+uvicorn reranker_server:app --host 0.0.0.0 --port 5200
+```
+
+The server will run at:
+
+```
+http://localhost:5200
+```
+
+Interactive API docs:
+
+```
+http://localhost:5200/docs
+```
+
+---
+
+# API
+
+## POST `/rerank`
+
+Rerank a list of documents for a query.
+
+### Request
+
+```
+{
+  "query": "What is a reranker?",
+  "documents": [
+    "A reranker sorts retrieved documents by relevance.",
+    "Cats are mammals.",
+    "Rerankers improve search pipelines."
+  ],
+  "top_k": 2
+}
+```
+
+### Response
+
+```
+{
+  "results": [
+    {
+      "text": "A reranker sorts retrieved documents by relevance.",
+      "score": 0.84
+    },
+    {
+      "text": "Rerankers improve search pipelines.",
+      "score": 0.79
+    }
+  ],
+  "backend": "gpu"
+}
+```
+
+The `backend` field indicates which stage produced the final ranking:
+
+* `gpu` → cross-encoder used
+* `cpu` → FlashRank only
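
A minimal Python client for this endpoint could look like the sketch below, using only the standard library. The helper names and the default URL are illustrative; adjust them to your deployment:

```python
import json
from urllib import request as urlrequest

RERANK_URL = "http://localhost:5200/rerank"  # default server address

def build_rerank_payload(query, documents, top_k=5):
    # mirrors the request schema shown above
    return {"query": query, "documents": documents, "top_k": top_k}

def rerank(query, documents, top_k=5):
    # POST the payload to a running server and return the parsed JSON response
    data = json.dumps(build_rerank_payload(query, documents, top_k)).encode()
    req = urlrequest.Request(
        RERANK_URL,
        data=data,
        headers={"Content-Type": "application/json"},
    )
    with urlrequest.urlopen(req) as resp:
        return json.load(resp)

payload = build_rerank_payload(
    "What is a reranker?",
    ["A reranker sorts retrieved documents by relevance.", "Cats are mammals."],
    top_k=1,
)
print(json.dumps(payload))
```

Calling `rerank(...)` with the server running returns the `results`/`backend` structure shown in the response example.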
+
+---
+
+# Health Check
+
+```
+GET /health
+```
+
+Example response:
+
+```
+{
+  "status": "ok",
+  "cuda_available": true,
+  "gpu_model_loaded": true
+}
+```
+
+---
+
+# GPU Usage
+
+The server automatically detects CUDA.
+
+GPU reranking is used only when:
+
+* CUDA is available
+* sufficient VRAM is free
+* no CUDA errors occur
+
+If GPU fails, the system **falls back to CPU automatically**.
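
The fallback follows the usual try-accurate-then-cheap pattern. A stripped-down sketch, with generic callables standing in for the actual torch/CUDA calls:

```python
def rerank_with_fallback(query, docs, gpu_fn, cpu_fn):
    # Try the accurate (GPU) path first; on a runtime failure,
    # fall back to the cheap (CPU) results and report which backend ran.
    try:
        return gpu_fn(query, docs), "gpu"
    except (MemoryError, RuntimeError):
        return cpu_fn(query, docs), "cpu"

# stubs simulating a CUDA OOM on the GPU path
def failing_gpu(query, docs):
    raise RuntimeError("simulated CUDA OOM")

def cpu_ok(query, docs):
    return list(docs)

results, backend = rerank_with_fallback("q", ["a", "b"], failing_gpu, cpu_ok)
print(backend)  # → cpu
```

In the real server the `except` clause additionally calls `torch.cuda.empty_cache()` so a transient OOM does not poison later requests.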
+
+---
+
+# Recommended RAG Settings
+
+Typical pipeline:
+
+```
+vector search → top 20 documents
+reranker → top 5 documents
+LLM context
+```
+
+This keeps latency low while improving retrieval quality.
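
Under these settings the hand-off between vector search, reranker, and LLM context reduces to two constants. The retriever and reranker below are placeholder callables, not any particular library's API:

```python
VECTOR_TOP_K = 20   # candidates pulled from the vector store
RERANK_TOP_K = 5    # documents that actually reach the LLM context

def build_context(query, retriever, reranker):
    # retrieve broadly, rerank narrowly, then join the winners
    candidates = retriever(query, VECTOR_TOP_K)
    best = reranker(query, candidates, RERANK_TOP_K)
    return "\n\n".join(best)

# stub retriever/reranker for demonstration
docs = [f"doc {i}" for i in range(30)]
retriever = lambda q, k: docs[:k]
reranker = lambda q, cands, k: cands[:k]
print(build_context("q", retriever, reranker))
```

Raising `VECTOR_TOP_K` improves recall at a small CPU cost; raising `RERANK_TOP_K` grows the LLM prompt, which is usually the more expensive knob.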
+
+---
+
+# Testing the Server
+
+Example curl request:
+
+```
+curl http://localhost:5200/rerank \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{
+    "query": "reranker",
+    "documents": [
+      "cats",
+      "reranking documents improves retrieval"
+    ],
+    "top_k": 1
+  }'
+```
+
+---
+
+# Model Information
+
+### CPU Stage
+
+FlashRank model:
+
+```
+ms-marco-MiniLM-L-12-v2
+```
+
+Fast ONNX reranker optimized for CPU.
+
+### GPU Stage
+
+Cross-encoder model:
+
+```
+cross-encoder/ms-marco-MiniLM-L-6-v2
+```
+
+A widely used semantic reranker with a strong accuracy/latency trade-off.
+
+---
+
+# Future Improvements
+
+Possible enhancements:
+
+* result caching
+* Docker container
+* batch reranking API
+* integration with mem0
+* Prometheus metrics
+* request logging
+
+---
+
+# License
+
+MIT License

+ 5 - 0
requirements.txt

@@ -0,0 +1,5 @@
+fastapi==0.110.0
+uvicorn[standard]==0.29.0
+sentence-transformers==2.7.0
+flashrank==0.2.5
+torch

+ 283 - 0
reranker_server.py

@@ -0,0 +1,283 @@
+"""
+Local Reranker Server
+=====================
+
+Two-stage reranking architecture optimized for small GPUs.
+
+Pipeline:
+
+1) FlashRank CPU reranker (fast filtering)
+2) GPU cross-encoder reranker (accurate final ranking)
+
+Features
+--------
+
+✓ CPU-first architecture (safe for weak GPUs)
+✓ GPU reranking when available
+✓ automatic CUDA fallback
+✓ VRAM-aware routing
+✓ batching for low VRAM GPUs
+✓ safe FlashRank integration
+✓ FastAPI REST interface
+✓ health endpoint
+
+Default API
+-----------
+
+POST /rerank
+
+{
+  "query": "...",
+  "documents": ["doc1", "doc2"],
+  "top_k": 5
+}
+
+Server
+------
+
+http://localhost:5200
+"""
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+from typing import List
+
+import torch
+
+from sentence_transformers import CrossEncoder
+from flashrank import Ranker, RerankRequest
+
+
+# --------------------------------------------------
+# Configuration
+# --------------------------------------------------
+
+PORT = 5200
+
+# FlashRank stage candidate count
+FIRST_STAGE_TOP_K = 10
+
+# GPU batch size
+BATCH_SIZE = 8
+
+# max token length
+MAX_LENGTH = 256
+
+# minimal VRAM required to attempt GPU
+MIN_GPU_MEMORY_GB = 1.5
+
+
+# --------------------------------------------------
+# Model initialization
+# --------------------------------------------------
+
+print("Loading FlashRank CPU model...")
+cpu_ranker = Ranker(model_name="ms-marco-MiniLM-L-12-v2")
+
+gpu_available = torch.cuda.is_available()
+
+gpu_model = None
+
+if gpu_available:
+    print("Loading GPU cross-encoder model...")
+    try:
+        gpu_model = CrossEncoder(
+            "cross-encoder/ms-marco-MiniLM-L-6-v2",
+            device="cuda",
+            max_length=MAX_LENGTH
+        )
+        print("GPU reranker loaded successfully")
+
+    except Exception as e:
+        print("GPU initialization failed, running CPU-only:", e)
+        gpu_model = None
+        gpu_available = False
+
+else:
+    print("CUDA not available, running CPU-only")
+
+
+# --------------------------------------------------
+# API schema
+# --------------------------------------------------
+
+class RerankRequestModel(BaseModel):
+    query: str
+    documents: List[str]
+    top_k: int = 5
+
+
+class RerankResult(BaseModel):
+    text: str
+    score: float
+
+
+class RerankResponse(BaseModel):
+    results: List[RerankResult]
+    backend: str
+
+
+# --------------------------------------------------
+# Utility functions
+# --------------------------------------------------
+
+def gpu_memory_available():
+    """
+    Check available VRAM before attempting GPU inference.
+    Prevents CUDA OOM on small GPUs.
+    """
+
+    if not gpu_available:
+        return False
+
+    free, total = torch.cuda.mem_get_info()
+
+    free_gb = free / (1024 ** 3)
+
+    return free_gb > MIN_GPU_MEMORY_GB
+
+
+# --------------------------------------------------
+# Stage 1: FlashRank CPU
+# --------------------------------------------------
+
+def rerank_cpu_stage(query: str, docs: List[str]):
+    """
+    First-stage reranking using FlashRank.
+
+    FlashRank is extremely fast and filters
+    the candidate documents before GPU reranking.
+    """
+
+    passages = [{"text": d} for d in docs]
+
+    request = RerankRequest(
+        query=query,
+        passages=passages
+    )
+
+    result = cpu_ranker.rerank(request)
+
+    ranked = [
+        (r.get("text", ""), float(r.get("score", 0)))
+        for r in result
+    ]
+
+    return ranked
+
+
+# --------------------------------------------------
+# Stage 2: GPU cross-encoder
+# --------------------------------------------------
+
+def rerank_gpu_stage(query: str, docs: List[str]):
+    """
+    Second-stage reranking using GPU cross-encoder.
+    """
+
+    pairs = [(query, d) for d in docs]
+
+    scores = []
+
+    for i in range(0, len(pairs), BATCH_SIZE):
+
+        batch = pairs[i:i + BATCH_SIZE]
+
+        batch_scores = gpu_model.predict(batch)
+
+        scores.extend(batch_scores.tolist())
+
+    ranked = list(zip(docs, scores))
+
+    ranked.sort(key=lambda x: x[1], reverse=True)
+
+    return ranked
+
+
+# --------------------------------------------------
+# FastAPI app
+# --------------------------------------------------
+
+app = FastAPI(
+    title="Local Two-Stage Reranker",
+    description="FlashRank CPU + MiniLM GPU reranking",
+    version="2.0"
+)
+
+
+@app.post("/rerank", response_model=RerankResponse)
+def rerank(request: RerankRequestModel):
+
+    query = request.query
+    docs = request.documents
+    top_k = min(request.top_k, len(docs))
+
+    # --------------------------------------------------
+    # Stage 1: FlashRank CPU filtering
+    # --------------------------------------------------
+
+    first_stage = rerank_cpu_stage(query, docs)
+
+    # select best candidates
+    candidates = first_stage[:FIRST_STAGE_TOP_K]
+
+    candidate_docs = [d for d, s in candidates]
+
+    backend = "cpu"
+
+    # --------------------------------------------------
+    # Stage 2: GPU reranking (optional)
+    # --------------------------------------------------
+
+    if gpu_model and gpu_memory_available():
+        try:
+            second_stage = rerank_gpu_stage(query, candidate_docs)
+            backend = "gpu"
+        except (torch.cuda.OutOfMemoryError, RuntimeError):
+            print("CUDA failure -> using CPU stage results")
+            torch.cuda.empty_cache()
+            second_stage = candidates
+            backend = "cpu"
+    else:
+        second_stage = candidates
+
+    # --------------------------------------------------
+    # Final result selection
+    # --------------------------------------------------
+
+    final = second_stage[:top_k]
+
+    results = [
+        RerankResult(text=d, score=float(s))
+        for d, s in final
+    ]
+
+    return RerankResponse(
+        results=results,
+        backend=backend
+    )
+
+
+# --------------------------------------------------
+# Health endpoint
+# --------------------------------------------------
+
+@app.get("/health")
+def health():
+
+    return {
+        "status": "ok",
+        "cuda_available": torch.cuda.is_available(),
+        "gpu_model_loaded": gpu_model is not None
+    }
+
+
+# Allow running the file directly (uses the PORT constant defined above)
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=PORT)