---
# Docker Compose stack for the Kumbh RAG service:
#   - kumbh-api: FastAPI-style server built from the local Dockerfile,
#     serving a fine-tuned GGUF model with a Chroma vector store.
#   - ollama: optional alternative LLM backend, gated behind a profile.
# NOTE(review): `version` is obsolete in Compose v2 (ignored with a warning);
# kept quoted as a string to preserve the original file's content.
version: "3.9"

services:
  # ---- Main API Server ----
  kumbh-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: kumbh-api
    ports:
      - "8000:8000"  # quoted — avoids YAML 1.1 sexagesimal parsing of host:container pairs
    volumes:
      - ./models:/app/models  # Fine-tuned GGUF model
      - ./vectordb/chroma_db:/app/vectordb/chroma_db
      - ./data:/app/data
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - MODEL_PATH=/app/models/kumbh_model_q4_k_m.gguf
      - CHROMA_DB_PATH=/app/vectordb/chroma_db
      - LOG_LEVEL=INFO
    # Reserve one NVIDIA GPU for inference (requires nvidia-container-toolkit).
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ---- Ollama (Alternative LLM backend) ----
  ollama:
    image: ollama/ollama:latest
    container_name: kumbh-ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama  # named volume — model blobs persist across restarts
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    profiles: ["ollama"]  # Only start when --profile ollama is specified

volumes:
  # Bare key = default local-driver named volume.
  ollama_data: