-
Notifications
You must be signed in to change notification settings - Fork 134
Expand file tree
/
Copy pathdocker-compose-gpu.yml
More file actions
68 lines (65 loc) · 1.62 KB
/
docker-compose-gpu.yml
File metadata and controls
68 lines (65 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# GPU deployment: FastAPI front-end + llama.cpp inference server.
# Requires the NVIDIA container toolkit on the host for the `deploy.resources`
# device reservations below.
services:
  # Web/API front-end; talks to llama-cpp-server over the compose network.
  orpheus-fastapi:
    container_name: orpheus-fastapi
    build:
      context: .
      dockerfile: Dockerfile.gpu
    ports:
      - "5005:5005"
    env_file:
      - .env
    environment:
      # Internal URL — uses the service name, not localhost.
      - ORPHEUS_API_URL=http://llama-cpp-server:5006/v1/completions
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    depends_on:
      llama-cpp-server:
        condition: service_started

  # llama.cpp inference server (CUDA build); serves the GGUF model on 5006.
  llama-cpp-server:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    ports:
      - "5006:5006"
    volumes:
      - ./models:/models
    env_file:
      - .env
    depends_on:
      # Wait until the one-shot downloader has finished (exit 0).
      model-init:
        condition: service_completed_successfully
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    # Args appended to the image's server entrypoint. ORPHEUS_MODEL_NAME and
    # ORPHEUS_MAX_TOKENS are interpolated by compose from .env.
    command: >
      -m /models/${ORPHEUS_MODEL_NAME}
      --port 5006
      --host 0.0.0.0
      --n-gpu-layers 29
      --ctx-size ${ORPHEUS_MAX_TOKENS}
      --n-predict ${ORPHEUS_MAX_TOKENS}
      --rope-scaling linear

  # One-shot init container: downloads the model into ./models if missing.
  # NOTE: the curlimages/curl image ships curl only — the original `wget`
  # invocation could never work here. `-L` follows the HF /resolve/ redirect
  # to the CDN; `-f` makes an HTTP error fail the service instead of saving
  # an error page as the model file.
  model-init:
    image: curlimages/curl:latest
    # Run as the host user so the downloaded file isn't root-owned.
    user: ${UID}:${GID}
    volumes:
      - ./models:/app/models
    working_dir: /app
    command: >
      sh -c '
      if [ ! -f /app/models/${ORPHEUS_MODEL_NAME} ]; then
        echo "Downloading model file..."
        curl -fL -o /app/models/${ORPHEUS_MODEL_NAME} "https://huggingface.co/lex-au/${ORPHEUS_MODEL_NAME}/resolve/main/${ORPHEUS_MODEL_NAME}"
      else
        echo "Model file already exists"
      fi'
    # Quoted so YAML doesn't read `no` as the boolean false.
    restart: "no"