# docker-compose.yml — Kumbh API deployment (API server + optional Ollama backend)
version: "3.9"

services:
  # ---- Main API Server ----
  kumbh-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: kumbh-api
    ports:
      - "8000:8000"
    volumes:
      - ./models:/app/models  # Fine-tuned GGUF model
      - ./vectordb/chroma_db:/app/vectordb/chroma_db
      - ./data:/app/data
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - MODEL_PATH=/app/models/kumbh_model_q4_k_m.gguf
      - CHROMA_DB_PATH=/app/vectordb/chroma_db
      - LOG_LEVEL=INFO
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for the LLM inference container
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ---- Ollama (Alternative LLM backend) ----
  ollama:
    image: ollama/ollama:latest
    container_name: kumbh-ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    profiles: ["ollama"]  # Only start when --profile ollama is specified

volumes:
  ollama_data: