-
Notifications
You must be signed in to change notification settings - Fork 120
Closed
Description
Problem Description
Hi, I encountered the following error on the gfx950 target. I am trying to integrate, this operator into sgl-project/sglang#11257
python3 python/sglang/repro_aiter_build_error.py
[aiter] compile_template_op func_name = 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83' with {'src_template': <Template memory:7f32600daa10>, 'md_name': 'top_k_top_p_sampling_from_probs', 'includes': ['/sgl-workspace/aiter/csrc/cpp_itfs/utils.h', '/sgl-workspace/aiter/csrc/cpp_itfs/sampling/sampling.cuh', '/sgl-workspace/aiter/csrc/cpp_itfs/sampling/vec_dtypes.cuh'], 'sources': [], 'cxxflags': [], 'func_name': 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83', 'folder': 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83', 'kwargs': OrderedDict([('d', 128256), ('deterministic', False)]), 'src_file': '/*\n * Copyright (C) 2024-2025 by FlashInfer team.\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n\n#include "sampling.cuh"\n\n\n#define FUNCTION_DEFINE \\\n void top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83(void* probs_ptr, \\\n void* output_ptr, \\\n void* indices_ptr, \\\n void* top_k_arr_ptr, \\\n void* top_p_arr_ptr, \\\n int batch_size, \\\n int top_k_val, \\\n float top_p_val, \\\n int philox_seed, \\\n int philox_offset, \\\n void* stream)\n\nextern "C" {\nFUNCTION_DEFINE;\n}\n \nFUNCTION_DEFINE\n{\n constexpr uint32_t vec_size = std::gcd(16 / sizeof(float), 128256);\n \n const uint32_t smem_size = sizeof(aiter::sampling::SamplingTempStorage<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO>);\n dim3 nblks(batch_size);\n dim3 nthrs(aiter::sampling::BLOCK_THREADS);\n auto kernel = aiter::sampling::TopKTopPSamplingFromProbKernel<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO,\n vec_size, false, float, int>;\n hipFuncSetAttribute(reinterpret_cast<const void*>(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, smem_size);\n kernel<<<nblks, nthrs, smem_size, reinterpret_cast<hipStream_t>(stream)>>>(reinterpret_cast<float*>(probs_ptr), reinterpret_cast<int*>(top_k_arr_ptr), reinterpret_cast<float*>(top_p_arr_ptr), reinterpret_cast<int*>(output_ptr), reinterpret_cast<int*>(indices_ptr), top_k_val, top_p_val, 128256, static_cast<uint64_t>(philox_seed), static_cast<uint64_t>(philox_offset));\n}'}...
[aiter] start build /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83
hipcc -fPIC -mllvm -enable-post-misched=0 -Wno-switch-bool -mllvm -amdgpu-early-inline-all=true -std=c++20 -Wno-vla-cxx-extension -fgpu-flush-denormals-to-zero -DUSE_ROCM -U__HIP_NO_HALF_OPERATORS__ -DENABLE_FP8 -Wno-unused-result -D__HIP_PLATFORM_AMD__=1 -fno-offload-uniform-block -O3 -DUSE_PROF_API=1 -mllvm --lsr-drop-solution=1 -D__HIP_PLATFORM_HCC__=1 -DLEGACY_HIPBLAS_DIRECT -mllvm --amdgpu-kernarg-preload-count=16 -mllvm -amdgpu-coerce-illegal-types=1 --offload-arch=gfx950 -Wno-undefined-func-template -U__HIP_NO_HALF_CONVERSIONS__ -mllvm -amdgpu-function-calls=false -I/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include -c top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp -o top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o
In file included from top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp:18:
/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:299:18: error: reference to non-static member function must be called
298 | BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
299 | .InclusiveSum<VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
| ~^~~~~~~~~~~~
/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:543:13: note: in instantiation of function template specialization 'aiter::sampling::DeviceSamplingFromProb<4U, 1024U, hipcub::BLOCK_SCAN_WARP_SCANS, hipcub::BLOCK_REDUCE_WARP_REDUCTIONS, false, (lambda at /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:548:23)>' requested here
543 | DeviceSamplingFromProb<VEC_SIZE,
| ^
top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp:45:36: note: in instantiation of function template specialization 'aiter::sampling::TopKTopPSamplingFromProbKernel<1024U, hipcub::BLOCK_SCAN_WARP_SCANS, hipcub::BLOCK_REDUCE_WARP_REDUCTIONS, 4U, false, float, int>' requested here
45 | auto kernel = aiter::sampling::TopKTopPSamplingFromProbKernel<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO,
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:112:10: note: possible target for call
112 | void InclusiveSum(T input, T& output)
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:118:10: note: possible target for call
118 | void InclusiveSum(T input, T& output, T& block_aggregate)
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:125:10: note: possible target for call
125 | void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:134:10: note: possible target for call
134 | void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:141:10: note: possible target for call
141 | void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
| ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:149:10: note: possible target for call
149 | void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
| ^
1 error generated when compiling for gfx950.
failed to execute:/opt/rocm/lib/llvm/bin/clang++ --offload-arch=gfx950 -fPIC -mllvm -enable-post-misched=0 -Wno-switch-bool -mllvm -amdgpu-early-inline-all=true -std=c++20 -Wno-vla-cxx-extension -fgpu-flush-denormals-to-zero -DUSE_ROCM -U__HIP_NO_HALF_OPERATORS__ -DENABLE_FP8 -Wno-unused-result -D__HIP_PLATFORM_AMD__=1 -fno-offload-uniform-block -O3 -DUSE_PROF_API=1 -mllvm --lsr-drop-solution=1 -D__HIP_PLATFORM_HCC__=1 -DLEGACY_HIPBLAS_DIRECT -mllvm --amdgpu-kernarg-preload-count=16 -mllvm -amdgpu-coerce-illegal-types=1 -Wno-undefined-func-template -U__HIP_NO_HALF_CONVERSIONS__ -mllvm -amdgpu-function-calls=false -I/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include -c -x hip top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp -o "top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o"
make: *** [Makefile:12: top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o] Error 1
[aiter] finish build /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83, cost 10.23218401s
AITER op failed:
Traceback (most recent call last):
File "/sgl-workspace/sglang/python/sglang/repro_aiter_build_error.py", line 25, in main
samples = torch.ops.aiter.top_k_top_p_sampling_from_probs(
File "/opt/venv/lib/python3.10/site-packages/torch/_ops.py", line 1254, in __call__
return self._op(*args, **kwargs)
File "/opt/venv/lib/python3.10/site-packages/torch/utils/_device.py", line 103, in __torch_function__
return func(*args, **kwargs)
File "/opt/venv/lib/python3.10/site-packages/torch/_ops.py", line 1254, in __call__
return self._op(*args, **kwargs)
File "/sgl-workspace/aiter/csrc/cpp_itfs/torch_utils.py", line 112, in _op_func
return op_func(*args, **kwargs)
File "/sgl-workspace/aiter/aiter/ops/sampling.py", line 70, in top_k_top_p_sampling_from_probs
return top_k_top_p_sampling_from_probs_core(
File "/sgl-workspace/aiter/csrc/cpp_itfs/sampling/top_k_top_p_sampling_from_probs.py", line 65, in top_k_top_p_sampling_from_probs
func = compile(vocab_size, deterministic)
File "/sgl-workspace/aiter/csrc/cpp_itfs/sampling/top_k_top_p_sampling_from_probs.py", line 23, in compile
return compile_template_op(
File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 281, in compile_template_op
compile_lib(src_file, folder, includes, sources, cxxflags)
File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 231, in compile_lib
mp_lock(lock_path=lock_path, main_func=main_func, final_func=final_func)
File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 83, in mp_lock
ret = main_func()
File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 218, in main_func
subprocess.run(
File "/usr/lib/python3.10/subprocess.py", line 526, in run
raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command 'cd /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83 && make build -j1' returned non-zero exit status 2.
Operating System
NAME="Ubuntu" VERSION="22.04.5 LTS (Jammy Jellyfish)"
CPU
AMD EPYC 9575F 64-Core Processor
GPU
8 X MI355X
ROCm Version
ROCm 7
ROCm Component
No response
Steps to Reproduce
Sample reproduction example.
# repro_aiter_build_error.py
import torch
# Must import to register torch.ops.aiter.* kernels
import aiter as _aiter # noqa: F401
import aiter.ops.sampling # noqa: F401
torch.set_default_device("cuda")
def _to_tensor_scalar_tuple(x):
if isinstance(x, torch.Tensor):
return (x, 0)
return (None, x)
def main():
batch_size = 40
vocab_size = 128256
top_k = 20
top_p = 0.6
pre_norm = torch.rand(batch_size, vocab_size, dtype=torch.float32)
probs = pre_norm / pre_norm.sum(dim=-1, keepdim=True)
try:
samples = torch.ops.aiter.top_k_top_p_sampling_from_probs(
probs,
None,
*_to_tensor_scalar_tuple(top_k),
*_to_tensor_scalar_tuple(top_p),
deterministic=False, # set True if you want that specialization
)
print("Success. Samples shape:", samples.shape, "dtype:", samples.dtype)
except Exception as e:
import traceback
print("AITER op failed:")
traceback.print_exc()
if __name__ == "__main__":
main()
(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support
No response
Additional Information
SGLang
Metadata
Metadata
Assignees
Labels
No labels