mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-27 14:13:34 +08:00
* Add optimizations for deepseek min latency Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> * Fix compile error Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> * Update internal cutlass kernel libs Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> * Format code Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> * Resolve conflicts Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> --------- Signed-off-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com> |
||
|---|---|---|
| .. | ||
| auto_deploy | ||
| compilation | ||
| modeling | ||
| multi_gpu | ||
| multi_gpu_modeling | ||
| speculative | ||
| thop | ||
| helpers.py | ||
| pattern_watcher.py | ||
| test_attention.py | ||
| test_autotuner.py | ||
| test_flashinfer_attention.py | ||
| test_flashinfer_star_attn.py | ||
| test_fp4_bmm_quantize.py | ||
| test_fp4_gemm_quantize.py | ||
| test_fp4_linear.py | ||
| test_fp8_block_scale_gemm.py | ||
| test_fp8_linear.py | ||
| test_fp8_quantize.py | ||
| test_fused_moe.py | ||
| test_moe_routing.py | ||
| test_moe.py | ||
| test_overlap_scheduler_input.json | ||
| test_overlap_scheduler.py | ||
| test_pytorch_model_engine.py | ||
| test_vanilla_attention.py | ||