---
# Docker Compose stack for the Kumbh RAG service:
#   - kumbh-api: FastAPI-style server built from the local Dockerfile,
#     serving a fine-tuned GGUF model with a Chroma vector store.
#   - ollama: optional alternative LLM backend, gated behind a profile.
# NOTE(review): `version` is obsolete in Compose v2 (ignored with a warning);
# kept quoted as a string to preserve the original file's content.
version: "3.9"

services:
  # ---- Main API Server ----
  kumbh-api:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: kumbh-api
    ports:
      - "8000:8000"  # quoted — avoids YAML 1.1 sexagesimal parsing of host:container pairs
    volumes:
      - ./models:/app/models  # Fine-tuned GGUF model
      - ./vectordb/chroma_db:/app/vectordb/chroma_db
      - ./data:/app/data
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - MODEL_PATH=/app/models/kumbh_model_q4_k_m.gguf
      - CHROMA_DB_PATH=/app/vectordb/chroma_db
      - LOG_LEVEL=INFO
    # Reserve one NVIDIA GPU for inference (requires nvidia-container-toolkit).
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ---- Ollama (Alternative LLM backend) ----
  ollama:
    image: ollama/ollama:latest
    container_name: kumbh-ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama  # named volume — model blobs persist across restarts
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    restart: unless-stopped
    profiles: ["ollama"]  # Only start when --profile ollama is specified

volumes:
  # Bare key = default local-driver named volume.
  ollama_data: