diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 4313ba15f3..d5836220d0 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -358,3 +358,8 @@ MiniMaxAI/MiniMax-M2:
   - accuracy: 85
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 85
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 82.363
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.121
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 51198b62fe..6698920605 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -392,3 +392,8 @@ nvidia/Nemotron-3-Nano:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 74.35
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 86.88
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 86.12
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 9e0a4f23f8..3ee8eb2d62 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -319,10 +319,10 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
     Runs the model via AutoDeploy and verifies benchmark performance on MMLU and GSM8K
     """
 
-    MODEL_NAME = "nvidia/Nemotron-Super-V3"
-    MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
-    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
-    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
+    MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-012726"
+    MODEL_PATH_BF16 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-012726"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-FP8-FP8KV-012726"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-NVFP4-FP8KV-012726"
 
     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
@@ -371,7 +371,6 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
         print_memory_usage("After evaluation")
 
-    @pytest.mark.skip("Skipping FP8 test until it is supported")
     @pytest.mark.skip_less_device_memory(180000)
     @pytest.mark.parametrize("world_size", [1, 4, 8])
     def test_fp8(self, world_size):
@@ -394,7 +393,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip("Skipping FP4 test until it is supported")
     @pytest.mark.skip_less_device_memory(180000)
-    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    @pytest.mark.parametrize("world_size", [4, 8])
     def test_fp4(self, world_size):
         if get_device_count() < world_size:
             pytest.skip("Not enough devices for world size, skipping test")