Commit cc50469

fix: compile flags for trtllm fmha_v2 (#2175)
## Summary by CodeRabbit

* **Chores**
  * Removed noisy runtime console prints during build/generation.
  * Updated CUDA compiler requirements to target CUDA 12 and added a new compiler flag for compatibility.
* **Bug Fixes**
  * Added an early check that raises a clear error on unsupported GPU devices (SM120a), preventing misruns.
* **Tests**
  * Test now skips automatically when the required SM120a GPU support is not present.
1 parent eec483b commit cc50469

File tree

* .pre-commit-config.yaml
* flashinfer/jit/attention/fmha_v2/generator_utils.py
* flashinfer/jit/attention/modules.py
* flashinfer/prefill.py
* tests/attention/test_fmha_v2_prefill_deepseek.py

5 files changed: +9 −3 lines changed

.pre-commit-config.yaml

File mode changed from 100644 to 100755.

flashinfer/jit/attention/fmha_v2/generator_utils.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -3711,10 +3711,10 @@ def generate_files(specs_names):
     ]
     if "CUDA_PATH" in os.environ:
         cmd[0] = os.environ["CUDA_PATH"] + "/bin/" + cmd[0]
-    print('Running command "{}" to build "bin/print_traits.exe":'.format(" ".join(cmd)))
+    # print('Running command "{}" to build "bin/print_traits.exe":'.format(" ".join(cmd)))
     process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
     output, error = process.communicate()
-    print('Running "bin/print_traits.exe":')
+    # print('Running "bin/print_traits.exe":')
     process = subprocess.Popen(
         "bin/print_traits.exe", stdin=subprocess.PIPE, stdout=subprocess.PIPE
     )
```

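The commit silences these two build-time prints by commenting them out. A lighter-touch alternative, sketched below, routes the same messages through Python's `logging` module so they stay recoverable at debug verbosity. This is an illustration only, not part of the commit, and `run_print_traits` is a hypothetical helper name.

```python
# Hypothetical sketch (not part of this commit): keep the build messages
# available at debug verbosity instead of commenting them out.
import logging
import subprocess

logger = logging.getLogger(__name__)


def run_print_traits(cmd: list[str]) -> bytes:
    # Log instead of print, so normal builds stay quiet.
    logger.debug('Running command "%s" to build "bin/print_traits.exe"', " ".join(cmd))
    process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    output, _error = process.communicate()

    logger.debug('Running "bin/print_traits.exe"')
    process = subprocess.Popen(
        "bin/print_traits.exe", stdin=subprocess.PIPE, stdout=subprocess.PIPE
    )
    output, _error = process.communicate()
    return output
```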
flashinfer/jit/attention/modules.py

File mode changed from 100644 to 100755.
Lines changed: 2 additions & 1 deletion
```diff
@@ -1901,9 +1901,10 @@ def gen_trtllm_fmha_v2_module() -> JitSpec:
     source_paths = kernel_paths + [binding_source_path]

     nvcc_flags = current_compilation_context.get_nvcc_flags_list(
-        supported_major_versions=[10, 11, 12]
+        supported_major_versions=[12]
     )
     nvcc_flags.append(f"-I{jit_env.FLASHINFER_CSRC_DIR / 'fmha_v2'}")
+    nvcc_flags.append("-Wno-deprecated-gpu-targets")

     return gen_jit_spec(
         uri,
```

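Narrowing `supported_major_versions` to `[12]` restricts which targets the helper emits `-gencode` flags for, and `-Wno-deprecated-gpu-targets` silences NVCC's warning when a targeted GPU architecture is marked deprecated. Below is a minimal sketch of what such a helper might produce; the function body and the major-version-to-architecture mapping are assumptions for illustration, not FlashInfer's actual implementation.

```python
# Hypothetical sketch of a get_nvcc_flags_list-style helper (not FlashInfer's
# actual implementation; the version-to-architecture mapping is an assumption).
def get_nvcc_flags_list(supported_major_versions: list[int]) -> list[str]:
    # Assumed mapping from major version to an SM target name, for illustration.
    arch_names = {10: "100a", 12: "120a"}
    flags = ["-Wno-deprecated-gpu-targets"]
    for major in supported_major_versions:
        arch = arch_names.get(major)
        if arch is not None:
            flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
    return flags


print(get_nvcc_flags_list([12]))
# ['-Wno-deprecated-gpu-targets', '-gencode=arch=compute_120a,code=sm_120a']
```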
flashinfer/prefill.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -3603,6 +3603,8 @@ def fmha_v2_prefill_deepseek(
         If return_lse is True, the output will be a tuple of two tensors, the first is the output tensor, the second is the lse tensor.
         If return_lse is False, the output will be a single tensor.
     """
+    if not is_sm120a_supported(query.device):
+        raise ValueError("fmha_v2_prefill_deepseek is only supported on SM120 GPUs.")
     assert query.shape[3] == 192 and key.shape[3] == 192 and value.shape[3] == 128, (
         "currently only support deepseek r1 192 query and 128 value"
     )
```

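The new guard relies on `is_sm120a_supported` from `flashinfer.utils`, whose implementation is not shown in this diff. A minimal sketch of how such a capability check can be written with PyTorch follows; the function name and the assumption that SM120a corresponds to compute capability 12.0 are illustrative, not FlashInfer's actual code.

```python
# Minimal sketch of an SM120a-style capability check. FlashInfer ships its own
# is_sm120a_supported; this only illustrates the general pattern, and the
# (12, 0) compute-capability mapping is an assumption.
import torch


def is_sm120a_supported_sketch(device: torch.device) -> bool:
    if device.type != "cuda" or not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability(device)
    return (major, minor) == (12, 0)
```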
tests/attention/test_fmha_v2_prefill_deepseek.py

File mode changed from 100644 to 100755.
Lines changed: 3 additions & 0 deletions
```diff
@@ -5,6 +5,7 @@

 from flashinfer.prefill import fmha_v2_prefill_deepseek
 from tests.utils_fp8 import to_float8
+from flashinfer.utils import is_sm120a_supported


 def attention_ref(
@@ -56,6 +57,8 @@ def attention_ref(
 def test_fmha_v2_prefill_deepseek(
     batch_size, num_heads, head_dim_qk, head_dim_v, seq_len, qkv_dtype, o_dtype
 ):
+    if not is_sm120a_supported(torch.device("cuda")):
+        pytest.skip("fmha_v2_prefill_deepseek is only supported on SM120 GPUs.")
     torch.manual_seed(42)

     def initialize_tensors(batch_size, num_heads, head_dim_qk, head_dim_v, seq_len):
```

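The test gates itself with an in-body `pytest.skip`. An equivalent decorator-based form, sketched below, would report the skip reason at collection time instead; this is an alternative illustration, not what the commit does, and the smoke-test name is hypothetical.

```python
# Hypothetical alternative (not in this commit): the same gate expressed as a
# reusable skipif marker.
import pytest
import torch

from flashinfer.utils import is_sm120a_supported

requires_sm120a = pytest.mark.skipif(
    not (torch.cuda.is_available() and is_sm120a_supported(torch.device("cuda"))),
    reason="fmha_v2_prefill_deepseek is only supported on SM120 GPUs.",
)


@requires_sm120a
def test_fmha_v2_prefill_deepseek_smoke():
    ...
```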