# llama.cpp/.github/workflows/server-self-hosted.yml

name: Server (self-hosted)

# Triggers: pushes to master that touch build/source files, plus manual runs.
on:
  push:
    branches:
      - master
    # Only pushes that modify at least one matching path start a run.
    paths:
      - '.github/workflows/server-self-hosted.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'tools/server/**.*'
  # Manual triggering from the Actions UI / API.
  workflow_dispatch:
    inputs:
      # Optional commit to check out instead of the triggering ref.
      sha:
        type: string
        required: false
        description: 'Commit SHA1 to build'
      # Boolean toggle shown in the manual-dispatch form.
      slow_tests:
        type: boolean
        required: true
        description: 'Run slow tests'

# Workflow-level environment: exported to every step of every job below.
# NOTE(review): the LLAMA_ARG_LOG_* names suggest logging options picked up by
# llama-server (colors, prefix, timestamps, verbosity) - confirm.
# Values are quoted because environment variables are always strings.
env:
  LLAMA_ARG_LOG_COLORS: "1"
  LLAMA_ARG_LOG_PREFIX: "1"
  LLAMA_ARG_LOG_TIMESTAMPS: "1"
  LLAMA_ARG_LOG_VERBOSITY: "10"

# Runs that resolve to the same group cancel the one already in progress.
# The last component of the group is github.head_ref when it is non-empty and
# the run id otherwise; with the run id, the group is unique to that run.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}

jobs:
  # Builds llama-server on the self-hosted macOS/ARM64 runner and runs the
  # server integration tests in four configurations (1 or 2 devices, with and
  # without backend sampling), plus an opt-in slow-test pass.
  server-metal:
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    steps:
      # Full-history checkout. The ref is the first non-empty candidate,
      # starting with the manually supplied `sha` input.
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      # Only the llama-server target is built, using every logical CPU.
      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server

      # Test dependencies go into a virtualenv inside the tests directory;
      # each test step below re-activates it because steps do not share a shell.
      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

      # NOTE(review): GGML_METAL_DEVICES=2 presumably makes the backend expose
      # two devices on this machine - confirm.
      - name: Tests (GPUx2)
        id: server_integration_tests_gpu2
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2
          pytest -v -x -m "not slow"

      - name: Tests (GPUx2, backend-sampling)
        id: server_integration_tests_gpu2_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

      # Opt-in pass driven by the `slow_tests` workflow_dispatch input.
      # github.event.inputs values are strings, hence the comparison with
      # 'true'; on push events the input is absent and this step is skipped.
      # No `-m "not slow"` filter here, so slow-marked tests are collected.
      # NOTE(review): SLOW_TESTS=1 is the switch the server test suite is
      # expected to honour for slow tests - confirm against tools/server/tests.
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ github.event.inputs.slow_tests == 'true' }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          SLOW_TESTS=1 pytest -v -x

  # Builds llama-server with CUDA enabled on the self-hosted Linux/NVIDIA
  # runner and runs the server integration tests with and without backend
  # sampling, plus an opt-in slow-test pass.
  server-cuda:
    runs-on: [self-hosted, llama-server, Linux, NVIDIA]

    steps:
      # Full-history checkout. The ref is the first non-empty candidate,
      # starting with the manually supplied `sha` input.
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      # Only the llama-server target is built, one job per available core.
      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      # Test dependencies go into a virtualenv inside the tests directory;
      # each test step below re-activates it because steps do not share a shell.
      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

      # Opt-in pass driven by the `slow_tests` workflow_dispatch input.
      # github.event.inputs values are strings, hence the comparison with
      # 'true'; on push events the input is absent and this step is skipped.
      # No `-m "not slow"` filter here, so slow-marked tests are collected.
      # NOTE(review): SLOW_TESTS=1 is the switch the server test suite is
      # expected to honour for slow tests - confirm against tools/server/tests.
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ github.event.inputs.slow_tests == 'true' }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          SLOW_TESTS=1 pytest -v -x

  # Builds llama-server with the KleidiAI CPU option on the
  # `ah-ubuntu_22_04-c8g_8x` runner and runs the server integration tests,
  # plus an opt-in slow-test pass.
  server-kleidiai:
    runs-on: ah-ubuntu_22_04-c8g_8x

    steps:
      # Full-history checkout. The ref is the first non-empty candidate,
      # starting with the manually supplied `sha` input.
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      # Installs the toolchain from apt, then adds the Kitware apt repository
      # (pinned to the `jammy` suite) so that a newer cmake can be installed.
      - name: Dependencies
        id: depends
        run: |
          set -euxo pipefail
          sudo apt-get update
          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
          apt-get install -y \
           build-essential \
           libssl-dev \
           python3-venv \
           gpg \
           wget \
           time \
           git-lfs

          git lfs install

          # install the latest cmake
          sudo install -d /usr/share/keyrings
          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
           | gpg --dearmor \
           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
           | sudo tee /etc/apt/sources.list.d/kitware.list
          sudo apt-get update
          sudo apt-get install -y cmake

      # Only the llama-server target is built, one job per available core.
      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      # Test dependencies go into a virtualenv inside the tests directory;
      # each test step below re-activates it because steps do not share a shell.
      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      # Opt-in pass driven by the `slow_tests` workflow_dispatch input.
      # github.event.inputs values are strings, hence the comparison with
      # 'true'; on push events the input is absent and this step is skipped.
      # No `-m "not slow"` filter here, so slow-marked tests are collected.
      # NOTE(review): SLOW_TESTS=1 is the switch the server test suite is
      # expected to honour for slow tests - confirm against tools/server/tests.
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ github.event.inputs.slow_tests == 'true' }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          SLOW_TESTS=1 pytest -v -x