mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-26 06:10:19 +00:00
a468b89018
Reduce the number of parallel jobs in server-self-hosted.yml by stacking test configurations as sequential steps within a single job, following the pattern from #23927.

- server-metal: 4 matrix jobs -> 1 job with 4 sequential test steps
- server-cuda: 2 matrix jobs -> 1 job with 2 sequential test steps
- server-kleidiai: removed unnecessary single-entry matrix
- removed unused Setup Node.js step from server-metal

Total: 7 parallel jobs -> 3 parallel jobs

Assisted-by: llama.cpp:local pi
203 lines
5.8 KiB
YAML
203 lines
5.8 KiB
YAML
# Builds llama-server and runs the server integration tests on the runners
# selected by each job.
name: Server (self-hosted)

on:
  workflow_dispatch:  # allows manual triggering
    inputs:
      # Optional commit to check out instead of the triggering ref.
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      # NOTE(review): no step visible in this workflow reads this input;
      # every test step runs with -m "not slow" — confirm it is still needed.
      slow_tests:
        description: 'Run slow tests'
        required: true
        type: boolean
  push:
    branches:
      - master
    # Only pushes that touch this workflow, build files, native sources or
    # the server tree start a run.
    paths:
      - '.github/workflows/server-self-hosted.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'tools/server/**.*'

# Workflow-level environment: exported to every step of every job.
# These look like llama.cpp logging options (colors, prefix, timestamps,
# verbosity) — presumably picked up by llama-server; verify against the binary.
env:
  LLAMA_ARG_LOG_COLORS: 1
  LLAMA_ARG_LOG_PREFIX: 1
  LLAMA_ARG_LOG_TIMESTAMPS: 1
  LLAMA_ARG_LOG_VERBOSITY: 10

# The group key is workflow name + ref + (head_ref if set, otherwise the
# run id). A newer run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Metal build; the four test configurations run one after another as
  # steps of this single job.
  server-metal:
    runs-on: [self-hosted, llama-server, macOS, ARM64]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # First non-empty value wins: manual input, PR head, pushed commit, branch name.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) --target llama-server

      # The virtualenv created here is re-activated by every test step below.
      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

      - name: Tests (GPUx2)
        id: server_integration_tests_gpu2
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2
          pytest -v -x -m "not slow"

      - name: Tests (GPUx2, backend-sampling)
        id: server_integration_tests_gpu2_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export GGML_METAL_DEVICES=2 LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

  # CUDA build; the two test configurations run as sequential steps.
  server-cuda:
    runs-on: [self-hosted, llama-server, Linux, NVIDIA]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # First non-empty value wins: manual input, PR head, pushed commit, branch name.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      # The virtualenv created here is re-activated by every test step below.
      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests (GPUx1)
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"

      - name: Tests (GPUx1, backend-sampling)
        id: server_integration_tests_backend_sampling
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          export LLAMA_ARG_BACKEND_SAMPLING=1
          pytest -v -x -m "not slow"

  # CPU build with KleidiAI enabled; installs its own toolchain first.
  server-kleidiai:
    runs-on: ah-ubuntu_22_04-c8g_8x

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
          # First non-empty value wins: manual input, PR head, pushed commit, branch name.
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Dependencies
        id: depends
        run: |
          set -euxo pipefail
          sudo apt-get update
          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
            apt-get install -y \
              build-essential \
              libssl-dev \
              python3-venv \
              gpg \
              wget \
              time \
              git-lfs

          git lfs install

          # install the latest cmake
          sudo install -d /usr/share/keyrings
          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
            | gpg --dearmor \
            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
            | sudo tee /etc/apt/sources.list.d/kitware.list
          sudo apt-get update
          sudo apt-get install -y cmake

      - name: Build
        id: cmake_build
        run: |
          cmake -B build -DGGML_SCHED_NO_REALLOC=ON -DGGML_CPU_KLEIDIAI=ON
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Python setup
        id: setup_python
        run: |
          cd tools/server/tests
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Tests
        id: server_integration_tests
        if: ${{ !github.event.pull_request }}
        run: |
          cd tools/server/tests
          source venv/bin/activate
          pytest -v -x -m "not slow"