Explorar el Código

request throttle

Lukas Goldschmidt hace 2 días
padre
commit
8220079693
Se han modificado 3 ficheros con 19 adiciones y 3 borrados
  1. 5 0
      .env.example
  2. 5 1
      book_ingestor/config.py
  3. 9 2
      book_ingestor/mem0_writer.py

+ 5 - 0
.env.example

@@ -15,5 +15,10 @@ BOOKS_MANIFESTS=./books/manifests
 # Chunking
 CHUNK_SIZE_TOKENS=350
 
+INGEST_DELAY=0.5   # default — at most 2 chunks/sec (delay adds to request time), gentle on the GPU
+#INGEST_DELAY=1.0   # at most 1 chunk/sec — if the GPU still runs hot
+#INGEST_DELAY=0.2   # faster — if the GPU handles the load fine
+#INGEST_DELAY=0.0   # no throttle — full speed, GPU fends for itself
+
 # Logging: DEBUG | INFO | WARNING
 LOG_LEVEL=INFO

+ 5 - 1
book_ingestor/config.py

@@ -29,6 +29,9 @@ class Config:
     # Chunking
     chunk_size_tokens: int
 
+    # Throttling — delay between POSTs to spare the embedder/GPU
+    ingest_delay: float
+
     # Logging
     log_level: str
 
@@ -50,9 +53,10 @@ def load_config() -> Config:
         books_done=os.getenv("BOOKS_DONE", "./books/done"),
         books_manifests=os.getenv("BOOKS_MANIFESTS", "./books/manifests"),
         chunk_size_tokens=int(os.getenv("CHUNK_SIZE_TOKENS", "350")),
+        ingest_delay=float(os.getenv("INGEST_DELAY", "0.5")),
         log_level=os.getenv("LOG_LEVEL", "INFO"),
     )
 
 
 # Singleton — import this everywhere
-cfg = load_config()
+cfg = load_config()

+ 9 - 2
book_ingestor/mem0_writer.py

@@ -10,6 +10,7 @@ Server expects: { text, user_id, metadata, infer }
 from __future__ import annotations
 
 import logging
+import time
 from datetime import datetime, timezone
 
 import requests
@@ -80,12 +81,18 @@ def write_content_chunk(chunk: Chunk, doc_title: str) -> str | None:
 
 
 def write_content_chunks_batch(chunks: list[Chunk], doc_title: str) -> list[str]:
-    """POST multiple content chunks. Returns list of successful memory IDs."""
+    """
+    POST multiple content chunks. Returns list of successful memory IDs.
+    Throttled by INGEST_DELAY to avoid hammering the Ollama embedder.
+    """
     ids = []
-    for chunk in chunks:
+    for i, chunk in enumerate(chunks):
         mem_id = write_content_chunk(chunk, doc_title)
         if mem_id:
             ids.append(mem_id)
+        # Throttle after every chunk except the last — give the GPU breathing room
+        if cfg.ingest_delay > 0 and i < len(chunks) - 1:
+            time.sleep(cfg.ingest_delay)
     return ids