Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
Commit a1385243e1 (parent 9f044b9dd9)
Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
@@ -207,7 +207,7 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
         kwargs = self.get_default_kwargs()
         # TODO: multi-stream MOE seems to increase the memory usage
         kwargs["max_batch_size"] = 32
-        kwargs["free_mem_ratio"] = 0.5
+        kwargs["free_mem_ratio"] = 0.4
         sampling_params = self.get_default_sampling_params()
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
@@ -226,9 +226,9 @@ class TestNemotronMOE(LlmapiAccuracyTestHarness):
             # Manually set quant_config for FP8 model to get the accuracy threshold
             llm.args.quant_config.quant_algo = QuantAlgo.FP8
             llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8

-            # task = MMLU(self.MODEL_NAME)
-            # task.evaluate(llm, sampling_params=sampling_params)
+            sampling_params = self.get_default_sampling_params()
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
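Read together, the two hunks lower free_mem_ratio from 0.5 to 0.4 and enable the previously commented-out MMLU evaluation alongside GSM8K. A minimal sketch of the resulting test flow follows; the statements themselves come from the diff above, while the method name and the way kwargs are passed into AutoDeployLLM are assumptions made only for illustration, not taken from this commit:

# Sketch of the post-commit test body; names come from the diff, scaffolding is assumed.
def test_nemotron_moe_accuracy(self):  # hypothetical method name
    kwargs = self.get_default_kwargs()
    # TODO: multi-stream MOE seems to increase the memory usage
    kwargs["max_batch_size"] = 32
    kwargs["free_mem_ratio"] = 0.4  # lowered from 0.5 by this commit
    sampling_params = self.get_default_sampling_params()
    with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                       tokenizer=self.MODEL_PATH_BF16,
                       **kwargs) as llm:  # kwarg forwarding assumed
        # Manually set quant_config for FP8 model to get the accuracy threshold
        llm.args.quant_config.quant_algo = QuantAlgo.FP8
        llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8

        # MMLU is now evaluated (it was commented out before this commit)
        sampling_params = self.get_default_sampling_params()
        task = MMLU(self.MODEL_NAME)
        task.evaluate(llm, sampling_params=sampling_params)

        task = GSM8K(self.MODEL_NAME)
        task.evaluate(llm)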