Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-11 05:23:38 +08:00)

Merge branch 'main' into fix_spec_gate

commit d714196ea1

@@ -0,0 +1 @@
attn_backend: triton
@@ -65,7 +65,7 @@ models:
- name: bigcode/starcoder2-7b
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: bigcode/starcoder2-15b-instruct-v0.1
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'attn_backend_triton.yaml']
- name: deepseek-ai/DeepSeek-Prover-V1.5-SFT
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: deepseek-ai/DeepSeek-Prover-V2-7B
@@ -118,8 +118,6 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-27b-it
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml', 'multimodal.yaml']
- name: google/gemma-3-2b-it
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
- name: deepseek-ai/DeepSeek-V2.5
yaml_extra: ['dashboard_default.yaml', 'world_size_2.yaml']
# DISABLED: Network timeout downloading from Hugging Face
@@ -145,8 +143,6 @@ models:
# DISABLED: Graph transformation error in auto-deploy
# - name: neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8
# yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: TheBloke/falcon-40b-instruct-GPTQ
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: google/gemma-2-27b-it
@@ -159,7 +155,7 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/QwQ-32B-Preview
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml', 'compile_backend_torch_cudagraph.yaml']
- name: Qwen/Qwen3-Coder-32B-Instruct
- name: Qwen/Qwen3-Coder-30B-A3B-Instruct
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
- name: Qwen/Qwen3-235B-A22B-Instruct-2507
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml']
@@ -222,3 +218,5 @@ models:
yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_scout.yaml']
- name: meta-llama/Llama-4-Maverick-17B-128E-Instruct
yaml_extra: ['dashboard_default.yaml', 'world_size_8.yaml', 'multimodal.yaml', 'llama4_maverick_lite.yaml']
- name: nvidia/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-010726
yaml_extra: ['dashboard_default.yaml', 'world_size_4.yaml','super_v3.yaml']
@@ -2396,80 +2396,80 @@ test = ["Cython", "array-api-strict (>=2.0,<2.1.1)", "asv", "gmpy2", "hypothesis

[[package]]
name = "scipy"
version = "1.16.3"
version = "1.17.0"
description = "Fundamental algorithms for scientific computing in Python"
optional = false
python-versions = ">=3.11"
files = [
{file = "scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97"},
{file = "scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511"},
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005"},
{file = "scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb"},
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876"},
{file = "scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2"},
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e"},
{file = "scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733"},
{file = "scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78"},
{file = "scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184"},
{file = "scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6"},
{file = "scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07"},
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9"},
{file = "scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686"},
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203"},
{file = "scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1"},
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe"},
{file = "scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70"},
{file = "scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc"},
{file = "scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2"},
{file = "scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c"},
{file = "scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d"},
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9"},
{file = "scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4"},
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959"},
{file = "scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88"},
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234"},
{file = "scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d"},
{file = "scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304"},
{file = "scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a"},
{file = "scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119"},
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c"},
{file = "scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e"},
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135"},
{file = "scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6"},
{file = "scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc"},
{file = "scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a"},
{file = "scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6"},
{file = "scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657"},
{file = "scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26"},
{file = "scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc"},
{file = "scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22"},
{file = "scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc"},
{file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0"},
{file = "scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800"},
{file = "scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d"},
{file = "scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d"},
{file = "scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa"},
{file = "scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8"},
{file = "scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353"},
{file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146"},
{file = "scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d"},
{file = "scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7"},
{file = "scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562"},
{file = "scipy-1.16.3.tar.gz", hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb"},
{file = "scipy-1.17.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd"},
{file = "scipy-1.17.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558"},
{file = "scipy-1.17.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:272a9f16d6bb4667e8b50d25d71eddcc2158a214df1b566319298de0939d2ab7"},
{file = "scipy-1.17.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7204fddcbec2fe6598f1c5fdf027e9f259106d05202a959a9f1aecf036adc9f6"},
{file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fc02c37a5639ee67d8fb646ffded6d793c06c5622d36b35cfa8fe5ececb8f042"},
{file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dac97a27520d66c12a34fd90a4fe65f43766c18c0d6e1c0a80f114d2260080e4"},
{file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb7446a39b3ae0fe8f416a9a3fdc6fba3f11c634f680f16a239c5187bc487c0"},
{file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:474da16199f6af66601a01546144922ce402cb17362e07d82f5a6cf8f963e449"},
{file = "scipy-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:255c0da161bd7b32a6c898e7891509e8a9289f0b1c6c7d96142ee0d2b114c2ea"},
{file = "scipy-1.17.0-cp311-cp311-win_arm64.whl", hash = "sha256:85b0ac3ad17fa3be50abd7e69d583d98792d7edc08367e01445a1e2076005379"},
{file = "scipy-1.17.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57"},
{file = "scipy-1.17.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e"},
{file = "scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8"},
{file = "scipy-1.17.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306"},
{file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742"},
{file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b"},
{file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d"},
{file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e"},
{file = "scipy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8"},
{file = "scipy-1.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b"},
{file = "scipy-1.17.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6"},
{file = "scipy-1.17.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269"},
{file = "scipy-1.17.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72"},
{file = "scipy-1.17.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61"},
{file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6"},
{file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752"},
{file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d"},
{file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea"},
{file = "scipy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812"},
{file = "scipy-1.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e"},
{file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07"},
{file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00"},
{file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45"},
{file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209"},
{file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04"},
{file = "scipy-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0"},
{file = "scipy-1.17.0-cp313-cp313t-win_arm64.whl", hash = "sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67"},
{file = "scipy-1.17.0-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a"},
{file = "scipy-1.17.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2"},
{file = "scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467"},
{file = "scipy-1.17.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e"},
{file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67"},
{file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73"},
{file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b"},
{file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b"},
{file = "scipy-1.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061"},
{file = "scipy-1.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232"},
{file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d"},
{file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba"},
{file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db"},
{file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf"},
{file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f"},
{file = "scipy-1.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088"},
{file = "scipy-1.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff"},
{file = "scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e"},
]

[package.dependencies]
numpy = ">=1.25.2,<2.6"
numpy = ">=1.26.4,<2.7"

[package.extras]
dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy (==1.10.0)", "pycodestyle", "pydevtool", "rich-click", "ruff (>=0.0.292)", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)"]
dev = ["click (<8.3.0)", "cython-lint (>=0.12.2)", "mypy (==1.10.0)", "pycodestyle", "ruff (>=0.12.0)", "spin", "types-psutil", "typing_extensions"]
doc = ["intersphinx_registry", "jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.19.1)", "jupytext", "linkify-it-py", "matplotlib (>=3.5)", "myst-nb (>=1.2.0)", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0,<8.2.0)", "sphinx-copybutton", "sphinx-design (>=0.4.0)", "tabulate"]
test = ["Cython", "array-api-strict (>=2.3.1)", "asv", "gmpy2", "hypothesis (>=6.30)", "meson", "mpmath", "ninja", "pooch", "pytest (>=8.0.0)", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]

[[package]]
@@ -1,4 +1,4 @@
{
"commit_hash": "ff7eb93f310d36f62b79ff5e229935bf50b934e7",
"timestamp": "2026-01-10T02:39:45Z"
"commit_hash": "3c65ec3c556d610a017d11fb968c6576c5b3b493",
"timestamp": "2026-01-11T02:39:21Z"
}
24  security_scanning/poetry.lock  (generated)
@@ -3213,28 +3213,6 @@ files = [
{file = "nvidia_cusparselt_cu12-0.7.1-py3-none-win_amd64.whl", hash = "sha256:f67fbb5831940ec829c9117b7f33807db9f9678dc2a617fbe781cac17b4e1075"},
]

[[package]]
name = "nvidia-cutlass-dsl"
version = "4.3.4"
description = "NVIDIA CUTLASS Python DSL"
optional = false
python-versions = ">=3.10"
files = [
{file = "nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:118508bc84f2a55ec7af3affd379bb713edf837d593218329909db67b518e700"},
{file = "nvidia_cutlass_dsl-4.3.4-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:3fdf0603ab7ec1bf6a499fbf72cff65e73b597d6e1359286808317c69aeb7c3d"},
{file = "nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c5bd21ed877da171f115123a12aae4a920035fc47eb57c807f9fba9f3df97cf4"},
{file = "nvidia_cutlass_dsl-4.3.4-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:671936f1df909e7de377d0cc00cb4287a3458c013d34947600423e9deb827e41"},
{file = "nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:57693d87677919572ab9eefa386b3f39e8e888bc4a9db7ab8730a97e8dbe06b4"},
{file = "nvidia_cutlass_dsl-4.3.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a48fbff859e44dd548f8f26819d97d0595acea70e3b057c91dfdb47929015c72"},
{file = "nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:36bde25160f461f393beba81868ef9e54d5ba2e0e7666ed3e44b6dbf788af493"},
{file = "nvidia_cutlass_dsl-4.3.4-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:be127f0f087028fa498f50a994c49f95b2c6a518e11e2567bc3d71528bf0a504"},
]

[package.dependencies]
cuda-python = ">=12.8"
numpy = "*"
typing-extensions = "*"

[[package]]
name = "nvidia-ml-py"
version = "13.590.44"
@@ -6339,4 +6317,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "f17eedd404a2af6728d14710809ea47ad34bc6672c035073bad9e6c709131a08"
content-hash = "7c2c53dee07ff38461334f55fc46cbcb05770c5c4c6f95f797eb9f9800e76f60"
@@ -72,7 +72,6 @@ triton = "3.5.0"
tiktoken = "^0.12.0"
blobfile = "^3.1.0"
openai-harmony = "0.0.4"
nvidia-cutlass-dsl = "4.3.4"
plotly = "^6.5.1"
numexpr = "<2.14.0"
partial-json-parser = "^0.2.1.1.post7"
108  tests/integration/defs/perf/disagg/cleanup_jobs.sh  (Normal file)
@@ -0,0 +1,108 @@
#!/bin/bash
# cleanup_jobs.sh - Cancel all SLURM jobs tracked in jobs.txt
#
# This script is designed to run in GitLab CI after_script to ensure
# all SLURM jobs are cancelled when the pipeline is interrupted, cancelled,
# or times out.
#
# Usage:
# bash cleanup_jobs.sh
#
# Environment variables:
# OUTPUT_PATH: Directory containing jobs.txt and pytest.pid

set -e

OUTPUT_PATH="${OUTPUT_PATH:-/tmp}"
JOBS_FILE="${OUTPUT_PATH}/jobs.txt"
PID_FILE="${OUTPUT_PATH}/pytest.pid"

echo "=========================================="
echo "SLURM Job Cleanup Script"
echo "=========================================="
echo "Output path: $OUTPUT_PATH"
echo ""

# Show pytest PID if available (for debugging)
if [ -f "$PID_FILE" ]; then
PYTEST_PID=$(cat "$PID_FILE" | tr -d '\n')
echo "Pytest PID: $PYTEST_PID"

# Check if pytest is still running
if kill -0 "$PYTEST_PID" 2>/dev/null; then
echo "Status: Still running"
else
echo "Status: Already terminated"
fi
echo ""
else
echo "No pytest.pid found (test may not have started)"
echo ""
fi

# Check if jobs.txt exists
if [ ! -f "$JOBS_FILE" ]; then
echo "[WARN] No jobs.txt found"
echo " Nothing to cancel"
echo "=========================================="
exit 0
fi

echo "[INFO] Reading jobs from: $JOBS_FILE"

# Read, deduplicate, and filter empty lines
JOBS=$(sort -u "$JOBS_FILE" | grep -v '^$' || true)

if [ -z "$JOBS" ]; then
echo "[WARN] jobs.txt is empty"
echo " Nothing to cancel"
echo "=========================================="
exit 0
fi

JOB_COUNT=$(echo "$JOBS" | wc -l)
echo "Found $JOB_COUNT job(s) to cancel"
echo ""

# Cancel each job
CANCELLED=0
ALREADY_DONE=0
FAILED=0

echo "Cancelling jobs..."
while IFS= read -r job_id; do
if [ -n "$job_id" ]; then
printf " %-12s ... " "$job_id"

# Try to cancel the job
if scancel "$job_id" 2>/dev/null; then
echo "[OK] Cancelled"
CANCELLED=$((CANCELLED + 1))
else
# Check if job exists in squeue
if squeue -j "$job_id" -h 2>/dev/null | grep -q "$job_id"; then
echo "[FAIL] Failed to cancel"
FAILED=$((FAILED + 1))
else
echo "[SKIP] Already finished"
ALREADY_DONE=$((ALREADY_DONE + 1))
fi
fi
fi
done <<< "$JOBS"

echo ""
echo "=========================================="
echo "[DONE] Cleanup completed"
echo " Total: $JOB_COUNT"
echo " Cancelled: $CANCELLED"
echo " Already done: $ALREADY_DONE"
echo " Failed: $FAILED"
echo "=========================================="

# Exit with error if any cancellation actually failed
if [ $FAILED -gt 0 ]; then
exit 1
fi

exit 0
@@ -151,6 +151,7 @@ class BatchManager:

self.submitted_batches = set() # Track which batch numbers have been submitted
self.job_mapping = {} # Map test_id -> SLURM job_id
self.submit_errors = {} # Map test_id -> error message (validation/submission failures)
self.all_configs = [] # Ordered list of all test configs

logger.info(f"\n{'=' * 70}")
@@ -214,6 +215,8 @@ class BatchManager:
batch_num: Batch number to submit (0-indexed)
"""
from execution.executor import JobManager
from utils.config_validator import ConfigValidator
from utils.job_tracker import JobTracker

# Calculate batch range
if self.batch_size:
@@ -230,33 +233,56 @@ class BatchManager:
logger.info(f"Range: [{start_idx}:{end_idx}] ({len(batch_configs)} jobs)")
logger.info(f"{'=' * 70}\n")

# Submit all jobs in this batch
# Pre-validate all configs before submission
logger.info("Pre-validating configurations...")
valid_configs = []
for config in batch_configs:
try:
ConfigValidator.validate_test_config(config)
valid_configs.append(config)
except Exception as e:
# Validation failed - mark as None and record error
self.job_mapping[config.test_id] = None
self.submit_errors[config.test_id] = f"Validation failed: {str(e)}"
logger.error(f" [FAILED] Validation failed: {config.test_id}")
logger.error(f" Error: {str(e)[:100]}")

logger.info(
f"Validation complete: {len(valid_configs)}/{len(batch_configs)} configs valid\n"
)

# Submit only valid configs
success_count = 0
for i, config in enumerate(batch_configs, 1):
for i, config in enumerate(valid_configs, 1):
try:
success, job_id = JobManager.submit_test_job(config)
if success and job_id:
self.job_mapping[config.test_id] = job_id
JobTracker.record_job(job_id) # Record job ID for cleanup
success_count += 1
# Truncate test_id for display
display_id = (
config.test_id[:60] + "..." if len(config.test_id) > 60 else config.test_id
logger.success(
f" [{i:3d}/{len(valid_configs)}] Job {job_id} <- {config.test_id}"
)
logger.success(f" [{i:3d}/{len(batch_configs)}] Job {job_id} <- {display_id}")
else:
# Submission failed - mark as None and record error
self.job_mapping[config.test_id] = None
logger.error(f" [{i:3d}/{len(batch_configs)}] Failed: {config.test_id[:50]}")
self.submit_errors[config.test_id] = f"Job submission failed: {job_id}"
logger.error(f" [{i:3d}/{len(valid_configs)}] Failed: {config.test_id}")
except Exception as e:
# Submission exception - mark as None and record error
self.job_mapping[config.test_id] = None
logger.error(f" [{i:3d}/{len(batch_configs)}] Error: {e}")
self.submit_errors[config.test_id] = f"Submission exception: {str(e)}"
logger.error(f" [{i:3d}/{len(valid_configs)}] Error: {e}")

# Mark batch as submitted
self.submitted_batches.add(batch_num)

logger.info(f"\n{'=' * 70}")
logger.success(
f"Batch {batch_num} Complete: {success_count}/{len(batch_configs)} succeeded"
f"Batch {batch_num} Complete: {success_count}/{len(valid_configs)} submitted successfully"
)
if len(valid_configs) < len(batch_configs):
logger.warning(f"Skipped {len(batch_configs) - len(valid_configs)} invalid config(s)")
logger.info(f"{'=' * 70}\n")

@@ -271,7 +271,7 @@ class JobManager:

@staticmethod
def backup_logs(
job_id: str,
job_id: Optional[str],
test_config,
result_dir: str,
is_passed: bool,
@@ -279,13 +279,18 @@
"""Backup logs and config files to test_id directory.

Args:
job_id: SLURM job ID
job_id: SLURM job ID (None if submission failed)
test_config: TestConfig object
result_dir: Result directory path (already named as test_id)
is_passed: Whether the job passed
Returns:
Final directory path if successful, None otherwise
"""
if job_id is None:
logger.warning(f"Job submission failed for {test_config.test_id}")
else:
logger.info(f"Backing up logs for job {job_id} ({test_config.test_id})")

if not os.path.exists(result_dir):
logger.warning(f"Result directory does not exist yet: {result_dir}")
return None

@@ -92,6 +92,13 @@ class HypothesisTestingParams:
# Dataset default parameters for hypothesis testing
# Extracted from accuracy_core.py AccuracyTask subclasses
DATASET_DEFAULTS = {
"aime25": {
"alpha": 0.05,
"beta": 0.2,
"sigma": 50,
"num_samples": 30, # AIME 2025 full sample size
"higher_is_better": True,
},
"gsm8k": {
"alpha": 0.05,
"beta": 0.2,
@@ -127,6 +134,14 @@ DATASET_DEFAULTS = {
"num_samples": 198,
"higher_is_better": True,
},
# Alias for gpqa_diamond (same task, different naming convention)
"gpqa_diamond_cot_zeroshot": {
"alpha": 0.05,
"beta": 0.2,
"sigma": 50,
"num_samples": 198,
"higher_is_better": True,
},
"json_mode_eval": {
"alpha": 0.05,
"beta": 0.2,

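The defaults above feed the suite's hypothesis testing. As a rough illustration of what alpha, beta, sigma, and num_samples imply, the Python sketch below computes the minimum detectable score drop for a one-sided z-test with known sigma. This is an illustrative assumption only: the helper name is hypothetical and the real decision logic lives in accuracy_core.py.

# Illustrative sketch (not part of the diff): minimum detectable effect for a
# one-sided z-test given the dataset defaults shown above.
from scipy.stats import norm

def min_detectable_effect(alpha: float, beta: float, sigma: float, num_samples: int) -> float:
    """Smallest score drop detectable at significance alpha with power 1 - beta."""
    z_alpha = norm.ppf(1.0 - alpha)  # one-sided significance threshold
    z_beta = norm.ppf(1.0 - beta)    # power requirement
    return (z_alpha + z_beta) * sigma / (num_samples ** 0.5)

# With the gpqa_diamond_cot_zeroshot defaults (alpha=0.05, beta=0.2, sigma=50,
# num_samples=198) this works out to roughly 8.8 score points.
print(min_detectable_effect(0.05, 0.2, 50, 198))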
@@ -22,44 +22,18 @@ cd "$WORK_DIR"
python3 "$WORK_DIR/simple_collect.py" "$OUTPUT_PATH" 2>&1
echo "System information collection completed"

# Step 2: Handle different installation modes
echo ""
echo "Step 2: Installing TensorRT-LLM..."
# Step 2: Collect TensorRT-LLM version information (only for none mode)
if [ "$INSTALL_MODE" = "none" ]; then
echo "Using built-in TensorRT-LLM, skipping installation"

elif [ "$INSTALL_MODE" = "wheel" ]; then
echo "Installing TensorRT-LLM wheel..."
echo "Wheel path pattern: $WHEEL_PATH"

# Expand wildcard and install
for wheel_file in $WHEEL_PATH; do
if [ -f "$wheel_file" ]; then
echo "Found wheel: $wheel_file"
pip3 install "$wheel_file" 2>&1 || echo "Wheel install failed, continuing..."
break
fi
done
echo "Wheel installation completed"

elif [ "$INSTALL_MODE" = "source" ]; then
echo "Installing TensorRT-LLM from source..."
cd "$REPO_DIR"
pip3 install -e . 2>&1 || echo "Source install failed, continuing..."
echo "Source installation completed"

echo ""
echo "Step 2: Collecting TensorRT-LLM version information..."
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
echo "TensorRT-LLM version written to: $VERSION_FILE"
else
echo "ERROR: Invalid install mode: $INSTALL_MODE"
exit 1
echo ""
echo "Step 2: Skipping TensorRT-LLM version collection (install_mode=$INSTALL_MODE)"
fi

# Step 3: Collect TensorRT-LLM version information
echo ""
echo "Step 3: Collecting TensorRT-LLM version information..."
VERSION_FILE="$OUTPUT_PATH/trtllm_version.txt"
python3 -c "import tensorrt_llm; print(f'[TensorRT-LLM] TensorRT-LLM version: {tensorrt_llm.__version__}')" > "$VERSION_FILE" 2>&1 || echo "[TensorRT-LLM] TensorRT-LLM version: unknown" > "$VERSION_FILE"
echo "TensorRT-LLM version written to: $VERSION_FILE"

echo ""
echo "=========================================="
echo "Session Collect Job Completed"

@@ -77,12 +77,12 @@ worker_config:
stream_interval: 20
num_postprocess_workers: 4
ctx:
max_batch_size: 8
max_batch_size: 1
max_num_tokens: 131104
max_seq_len: 131104
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: true
enable_attention_dp: false
pipeline_parallel_size: 8
print_iter_log: true
cuda_graph_config: null

@@ -47,6 +47,11 @@ else:
@pytest.fixture(scope="session", autouse=True)
def session_lifecycle():
"""Session lifecycle management."""
from utils.job_tracker import JobTracker

# Record pytest main process PID for GitLab CI cleanup
JobTracker.record_pid()

session_tracker.start()
try:
yield
@@ -66,11 +71,8 @@ class TestDisaggBenchmark:
"""Performance benchmark test for YAML configurations."""
full_test_name = request.node.name

# Validate configuration first (before any other operations)
try:
ConfigValidator.validate_test_config(test_config)
except Exception as e:
pytest.fail(f"Configuration validation failed: {e}")
# Note: Configuration validation is done during batch submission (in conftest.py)
# If validation failed, job_id will be None and the assert below will fail

# Create test case tracker
test_tracker = TestCaseTracker()
@@ -104,8 +106,11 @@ class TestDisaggBenchmark:
# Get job_id from batch manager (auto-submits batch if needed)
job_id = batch_manager.get_job_id(test_config)

# Validate submission result
assert job_id, f"Failed to get job_id for {test_config.test_id}"
# Validate submission result (will be None if validation/submission failed)
error_msg = batch_manager.submit_errors.get(
test_config.test_id, "Check batch submission logs for details"
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"

# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@@ -125,13 +130,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")

@pytest.mark.accuracy
@pytest.mark.parametrize("test_config", ACCURACY_TEST_CASES)
@@ -204,13 +208,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")

@pytest.mark.stress
@pytest.mark.parametrize("test_config", STRESS_TEST_CASES)
@@ -222,11 +225,8 @@ class TestDisaggBenchmark:
"""
full_test_name = request.node.name

# Validate configuration first (before any other operations)
try:
ConfigValidator.validate_test_config(test_config)
except Exception as e:
pytest.fail(f"Configuration validation failed: {e}")
# Note: Configuration validation is done during batch submission (in conftest.py)
# If validation failed, job_id will be None and the assert below will fail

# Create test case tracker
test_tracker = TestCaseTracker()
@@ -266,8 +266,11 @@ class TestDisaggBenchmark:
# Get job_id from batch manager (auto-submits batch if needed)
job_id = batch_manager.get_job_id(test_config)

# Validate submission result
assert job_id, f"Failed to get job_id for {test_config.test_id}"
# Validate submission result (will be None if validation/submission failed)
error_msg = batch_manager.submit_errors.get(
test_config.test_id, "Check batch submission logs for details"
)
assert job_id, f"Failed to submit job for {test_config.test_id}\n{error_msg}"

# Wait for completion (timeout: 10 hours = 36000 seconds)
JobManager.wait_for_completion(job_id, 36000, test_config, check_early_failure=True)
@@ -287,13 +290,12 @@ class TestDisaggBenchmark:
raise e
finally:
# Always backup logs, regardless of success or failure
if job_id:
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")
result_dir = JobManager.get_result_dir(test_config)
is_passed = result.get("success", False) if result else False
try:
JobManager.backup_logs(job_id, test_config, result_dir, is_passed)
except Exception as backup_error:
logger.error(f"Failed to backup logs: {backup_error}")


if __name__ == "__main__":

61  tests/integration/defs/perf/disagg/utils/job_tracker.py  (Normal file)
@@ -0,0 +1,61 @@
"""Simple job and process tracker for GitLab CI cleanup."""

import os

from utils.common import EnvManager
from utils.logger import logger


class JobTracker:
"""Track SLURM job IDs and pytest PID for GitLab CI cleanup."""

@staticmethod
def get_jobs_file() -> str:
"""Get jobs.txt file path in output_path."""
output_path = EnvManager.get_output_path()
return os.path.join(output_path, "jobs.txt")

@staticmethod
def get_pid_file() -> str:
"""Get pytest.pid file path in output_path."""
output_path = EnvManager.get_output_path()
return os.path.join(output_path, "pytest.pid")

@staticmethod
def record_pid():
"""Record pytest main process PID to pytest.pid file."""
pid = os.getpid()
pid_file = JobTracker.get_pid_file()
try:
# Ensure output directory exists
os.makedirs(os.path.dirname(pid_file), exist_ok=True)

# Write PID
with open(pid_file, "w") as f:
f.write(f"{pid}\n")
f.flush()

logger.info(f"Recorded pytest PID: {pid} -> {pid_file}")
except Exception as e:
logger.warning(f"Failed to record PID: {e}")

@staticmethod
def record_job(job_id: str):
"""Append SLURM job ID to jobs.txt file.

Args:
job_id: SLURM job ID to record
"""
jobs_file = JobTracker.get_jobs_file()
try:
# Ensure output directory exists
os.makedirs(os.path.dirname(jobs_file), exist_ok=True)

# Append job ID
with open(jobs_file, "a") as f:
f.write(f"{job_id}\n")
f.flush()

logger.debug(f"Recorded SLURM job: {job_id}")
except Exception as e:
logger.warning(f"Failed to record job ID {job_id}: {e}")
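JobTracker's only contract with cleanup_jobs.sh is the jobs.txt file: one SLURM job ID appended per line at submission time, deduplicated (`sort -u`) and blank-filtered at cleanup time. The self-contained Python sketch below illustrates that contract; the demo directory and job IDs are hypothetical, and in the suite the directory comes from EnvManager.get_output_path().

# Illustrative sketch (not part of the diff): the jobs.txt append/read contract.
import os

def record_job(output_path: str, job_id: str) -> None:
    """Append one SLURM job ID per line, mirroring JobTracker.record_job."""
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "jobs.txt"), "a") as f:
        f.write(f"{job_id}\n")

def read_jobs(output_path: str) -> list[str]:
    """Deduplicate and drop blanks, the same filtering cleanup_jobs.sh does with sort -u."""
    path = os.path.join(output_path, "jobs.txt")
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return sorted({line.strip() for line in f if line.strip()})

if __name__ == "__main__":
    out = "/tmp/disagg_demo"  # hypothetical OUTPUT_PATH
    for jid in ("123456", "123457", "123456"):  # duplicate on purpose
        record_job(out, jid)
    print(read_jobs(out))  # ['123456', '123457']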
@@ -79,6 +79,8 @@ class SessionTracker:
Uses the new sbatch-based approach for non-blocking execution.
Submits the job and waits for completion using JobManager.
"""
from utils.job_tracker import JobTracker

self.end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"Session ended: {self.end_time}")

@@ -89,6 +91,9 @@ class SessionTracker:
logger.error(f"Failed to submit session collect job: {job_id}")
return False

# Record session collect job ID for cleanup
JobTracker.record_job(job_id)

# Wait for job completion (reuses wait_for_completion method)
logger.info(f"Waiting for session collect job {job_id} to complete...")
JobManager.wait_for_completion(

@@ -514,3 +514,4 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (http
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=True-torch_compile=False] SKIP (https://nvbugs/5787892)
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False-torch_compile=False] SKIP (https://nvbugs/5787892)
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_chunked_prefill[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5791839)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True-moe_backend=TRTLLM] SKIP (https://nvbugs/5740377)