Skip to content

[Issue]: top_k_top_p_sampling_from_probs error with gfx950 #1137

@b8zhong

Description

@b8zhong

Problem Description

Hi, I encountered the following error on the gfx950 target. I am trying to integrate, this operator into sgl-project/sglang#11257

python3 python/sglang/repro_aiter_build_error.py
[aiter] compile_template_op func_name = 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83' with {'src_template': <Template memory:7f32600daa10>, 'md_name': 'top_k_top_p_sampling_from_probs', 'includes': ['/sgl-workspace/aiter/csrc/cpp_itfs/utils.h', '/sgl-workspace/aiter/csrc/cpp_itfs/sampling/sampling.cuh', '/sgl-workspace/aiter/csrc/cpp_itfs/sampling/vec_dtypes.cuh'], 'sources': [], 'cxxflags': [], 'func_name': 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83', 'folder': 'top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83', 'kwargs': OrderedDict([('d', 128256), ('deterministic', False)]), 'src_file': '/*\n * Copyright (C) 2024-2025 by FlashInfer team.\n *\n * Licensed under the Apache License, Version 2.0 (the "License");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n *   http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an "AS IS" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n\n#include "sampling.cuh"\n\n\n#define FUNCTION_DEFINE                                      \\\n    void top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83(void* probs_ptr,                      \\\n                      void* output_ptr,                      \\\n                      void* indices_ptr,                     \\\n                      void* top_k_arr_ptr,                   \\\n                      void* top_p_arr_ptr,                   \\\n                      int batch_size,                        \\\n                      int top_k_val,                         \\\n                      float top_p_val,                       \\\n                      int philox_seed,                       \\\n                      int philox_offset,                     \\\n                      void* stream)\n\nextern "C" {\nFUNCTION_DEFINE;\n}\n                        \nFUNCTION_DEFINE\n{\n    constexpr uint32_t vec_size = std::gcd(16 / sizeof(float), 128256);\n \n    const uint32_t smem_size = sizeof(aiter::sampling::SamplingTempStorage<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO>);\n    dim3 nblks(batch_size);\n    dim3 nthrs(aiter::sampling::BLOCK_THREADS);\n    auto kernel = aiter::sampling::TopKTopPSamplingFromProbKernel<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO,\n                                                vec_size, false, float, int>;\n    hipFuncSetAttribute(reinterpret_cast<const void*>(kernel), hipFuncAttributeMaxDynamicSharedMemorySize, smem_size);\n    kernel<<<nblks, nthrs, smem_size, reinterpret_cast<hipStream_t>(stream)>>>(reinterpret_cast<float*>(probs_ptr), reinterpret_cast<int*>(top_k_arr_ptr), reinterpret_cast<float*>(top_p_arr_ptr), reinterpret_cast<int*>(output_ptr), reinterpret_cast<int*>(indices_ptr), top_k_val, top_p_val, 128256, static_cast<uint64_t>(philox_seed), static_cast<uint64_t>(philox_offset));\n}'}...
[aiter] start build /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83
hipcc -fPIC -mllvm -enable-post-misched=0 -Wno-switch-bool -mllvm -amdgpu-early-inline-all=true -std=c++20 -Wno-vla-cxx-extension -fgpu-flush-denormals-to-zero -DUSE_ROCM -U__HIP_NO_HALF_OPERATORS__ -DENABLE_FP8 -Wno-unused-result -D__HIP_PLATFORM_AMD__=1 -fno-offload-uniform-block -O3 -DUSE_PROF_API=1 -mllvm --lsr-drop-solution=1 -D__HIP_PLATFORM_HCC__=1 -DLEGACY_HIPBLAS_DIRECT -mllvm --amdgpu-kernarg-preload-count=16 -mllvm -amdgpu-coerce-illegal-types=1 --offload-arch=gfx950 -Wno-undefined-func-template -U__HIP_NO_HALF_CONVERSIONS__ -mllvm -amdgpu-function-calls=false -I/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include -c top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp -o top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o
In file included from top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp:18:
/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:299:18: error: reference to non-static member function must be called
  298 |             BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
      |             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  299 |                 .InclusiveSum<VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
      |                 ~^~~~~~~~~~~~
/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:543:13: note: in instantiation of function template specialization 'aiter::sampling::DeviceSamplingFromProb<4U, 1024U, hipcub::BLOCK_SCAN_WARP_SCANS, hipcub::BLOCK_REDUCE_WARP_REDUCTIONS, false, (lambda at /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include/sampling.cuh:548:23)>' requested here
  543 |             DeviceSamplingFromProb<VEC_SIZE,
      |             ^
top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp:45:36: note: in instantiation of function template specialization 'aiter::sampling::TopKTopPSamplingFromProbKernel<1024U, hipcub::BLOCK_SCAN_WARP_SCANS, hipcub::BLOCK_REDUCE_WARP_REDUCTIONS, 4U, false, float, int>' requested here
   45 |     auto kernel = aiter::sampling::TopKTopPSamplingFromProbKernel<aiter::sampling::BLOCK_THREADS, aiter::sampling::SCAN_ALGO, aiter::sampling::REDUCE_ALGO,
      |                                    ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:112:10: note: possible target for call
  112 |     void InclusiveSum(T input, T& output)
      |          ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:118:10: note: possible target for call
  118 |     void InclusiveSum(T input, T& output, T& block_aggregate)
      |          ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:125:10: note: possible target for call
  125 |     void InclusiveSum(T input, T& output, BlockPrefixCallbackOp& block_prefix_callback_op)
      |          ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:134:10: note: possible target for call
  134 |     void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD])
      |          ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:141:10: note: possible target for call
  141 |     void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
      |          ^
/opt/rocm-7.0.0/lib/llvm/bin/../../../include/hipcub/block/../backend/rocprim/block/block_scan.hpp:149:10: note: possible target for call
  149 |     void InclusiveSum(T(&input)[ITEMS_PER_THREAD], T(&output)[ITEMS_PER_THREAD],
      |          ^
1 error generated when compiling for gfx950.
failed to execute:/opt/rocm/lib/llvm/bin/clang++  --offload-arch=gfx950  -fPIC -mllvm -enable-post-misched=0 -Wno-switch-bool -mllvm -amdgpu-early-inline-all=true -std=c++20 -Wno-vla-cxx-extension -fgpu-flush-denormals-to-zero -DUSE_ROCM -U__HIP_NO_HALF_OPERATORS__ -DENABLE_FP8 -Wno-unused-result -D__HIP_PLATFORM_AMD__=1 -fno-offload-uniform-block -O3 -DUSE_PROF_API=1 -mllvm --lsr-drop-solution=1 -D__HIP_PLATFORM_HCC__=1 -DLEGACY_HIPBLAS_DIRECT -mllvm --amdgpu-kernarg-preload-count=16 -mllvm -amdgpu-coerce-illegal-types=1 -Wno-undefined-func-template -U__HIP_NO_HALF_CONVERSIONS__ -mllvm -amdgpu-function-calls=false -I/root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83/include -c -x hip top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.cpp -o "top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o"
make: *** [Makefile:12: top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83.o] Error 1
[aiter] finish build /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83, cost 10.23218401s
AITER op failed:
Traceback (most recent call last):
  File "/sgl-workspace/sglang/python/sglang/repro_aiter_build_error.py", line 25, in main
    samples = torch.ops.aiter.top_k_top_p_sampling_from_probs(
  File "/opt/venv/lib/python3.10/site-packages/torch/_ops.py", line 1254, in __call__
    return self._op(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/utils/_device.py", line 103, in __torch_function__
    return func(*args, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/torch/_ops.py", line 1254, in __call__
    return self._op(*args, **kwargs)
  File "/sgl-workspace/aiter/csrc/cpp_itfs/torch_utils.py", line 112, in _op_func
    return op_func(*args, **kwargs)
  File "/sgl-workspace/aiter/aiter/ops/sampling.py", line 70, in top_k_top_p_sampling_from_probs
    return top_k_top_p_sampling_from_probs_core(
  File "/sgl-workspace/aiter/csrc/cpp_itfs/sampling/top_k_top_p_sampling_from_probs.py", line 65, in top_k_top_p_sampling_from_probs
    func = compile(vocab_size, deterministic)
  File "/sgl-workspace/aiter/csrc/cpp_itfs/sampling/top_k_top_p_sampling_from_probs.py", line 23, in compile
    return compile_template_op(
  File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 281, in compile_template_op
    compile_lib(src_file, folder, includes, sources, cxxflags)
  File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 231, in compile_lib
    mp_lock(lock_path=lock_path, main_func=main_func, final_func=final_func)
  File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 83, in mp_lock
    ret = main_func()
  File "/sgl-workspace/aiter/csrc/cpp_itfs/utils.py", line 218, in main_func
    subprocess.run(
  File "/usr/lib/python3.10/subprocess.py", line 526, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command 'cd /root/.aiter/build/top_k_top_p_sampling_from_probs_8cf4b5128e35da75bd73cf78ae7dab83 && make build -j1' returned non-zero exit status 2.

Operating System

NAME="Ubuntu" VERSION="22.04.5 LTS (Jammy Jellyfish)"

CPU

AMD EPYC 9575F 64-Core Processor

GPU

8 X MI355X

ROCm Version

ROCm 7

ROCm Component

No response

Steps to Reproduce

Sample reproduction example.

# repro_aiter_build_error.py
import torch

# Must import to register torch.ops.aiter.* kernels
import aiter as _aiter  # noqa: F401
import aiter.ops.sampling  # noqa: F401

torch.set_default_device("cuda")

def _to_tensor_scalar_tuple(x):
    if isinstance(x, torch.Tensor):
        return (x, 0)
    return (None, x)

def main():
    batch_size = 40
    vocab_size = 128256
    top_k = 20
    top_p = 0.6

    pre_norm = torch.rand(batch_size, vocab_size, dtype=torch.float32)
    probs = pre_norm / pre_norm.sum(dim=-1, keepdim=True)

    try:
        samples = torch.ops.aiter.top_k_top_p_sampling_from_probs(
            probs,
            None,
            *_to_tensor_scalar_tuple(top_k),
            *_to_tensor_scalar_tuple(top_p),
            deterministic=False,  # set True if you want that specialization
        )
        print("Success. Samples shape:", samples.shape, "dtype:", samples.dtype)
    except Exception as e:
        import traceback
        print("AITER op failed:")
        traceback.print_exc()

if __name__ == "__main__":
    main()

(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support

No response

Additional Information

SGLang

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions