
Commit c589e33
Merge branch 'main' into patch-3
2 parents: bcab0c5 + 6130529


146 files changed: 6,141 additions, 3,787 deletions


.github/CODEOWNERS

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 /python/sglang/srt/eplb @fzyzcjy
 /python/sglang/srt/function_call @CatherineSue @JustinTong0323
 /python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock @HaiShaw @ch-wan @BBuf @kushanam @Edwardf0t1
-/python/sglang/srt/layers/attention @ping1jing2
+/python/sglang/srt/layers/attention/ascend_backend.py @ping1jing2
 /python/sglang/srt/lora @Ying1123 @Fridge003 @lifuhuang
 /python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
 /python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann

.github/workflows/ci-monitor.yml

Lines changed: 6 additions & 3 deletions
@@ -2,8 +2,7 @@ name: CI Monitor

 on:
   schedule:
-    # Run every 6 hours at 00:00, 06:00, 12:00, 18:00 UTC
-    - cron: '0 */6 * * *'
+    - cron: '0 */12 * * *'
   workflow_dispatch:
     inputs:
       limit:
@@ -16,6 +15,10 @@ concurrency:
   group: ci-monitor-${{ github.ref }}
   cancel-in-progress: true

+permissions:
+  contents: write
+  actions: read
+
 jobs:
   ci-monitor:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -50,7 +53,7 @@ jobs:
           PYTHONIOENCODING: utf-8
         run: |
           cd scripts/ci_monitor
-          python ci_analyzer_perf.py --token $GITHUB_TOKEN --limit 500 --output-dir performance_tables_$(date +%Y%m%d_%H%M%S)
+          python ci_analyzer_perf.py --token $GITHUB_TOKEN --limit ${{ github.event.inputs.limit || '1000' }} --output-dir performance_tables_$(date +%Y%m%d_%H%M%S) --upload-to-github

       - name: Upload Analysis Results
         uses: actions/upload-artifact@v4
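For reference, a minimal sketch of how the monitor could still be run on demand between the new 12-hour scheduled runs, assuming the GitHub CLI is installed and authorized for the repository (the workflow file name and the `limit` input come from the diff above; everything else is illustrative):

  # Trigger the CI monitor manually with a custom analysis window.
  # Requires an authenticated gh CLI with permission to dispatch workflows.
  gh workflow run ci-monitor.yml \
    --repo sgl-project/sglang \
    -f limit=500   # when omitted, the run falls back to 1000 per the expression above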

.github/workflows/pr-test.yml

Lines changed: 45 additions & 27 deletions
@@ -99,8 +99,6 @@ jobs:
     needs: [check-changes, sgl-kernel-build-wheels]
     if: needs.check-changes.outputs.sgl_kernel == 'true'
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - uses: actions/checkout@v4

@@ -155,6 +153,50 @@
           cd test/srt
           python3 test_mla_deepseek_v3.py

+  sgl-kernel-benchmark-test:
+    needs: [check-changes, sgl-kernel-build-wheels]
+    if: always() && !failure() && !cancelled()
+    runs-on: 1-gpu-runner
+    env:
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      CI: true
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Cleanup
+        run: |
+          ls -alh sgl-kernel/dist || true
+          rm -rf sgl-kernel/dist/* || true
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: sgl-kernel/dist/
+          merge-multiple: true
+          pattern: wheel-python3.10-cuda12.9
+
+      - name: Install dependencies
+        run: |
+          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh
+
+      - name: Run benchmark tests
+        timeout-minutes: 45
+        run: |
+          cd sgl-kernel/benchmark
+          echo "Running sgl-kernel benchmark tests in CI mode..."
+
+          echo "CI environment variable: $CI"
+          echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"
+
+          for bench_file in bench_*.py; do
+            echo "Testing $bench_file..."
+            timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
+            echo "Completed $bench_file"
+            echo "---"
+          done
+
+          echo "All benchmark tests completed!"
+
 # =============================================== primary ====================================================

   unit-test-frontend:
@@ -189,8 +231,6 @@ jobs:
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -222,8 +262,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -255,8 +293,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -288,8 +324,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
       matrix:
@@ -321,8 +355,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -382,8 +414,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -435,8 +465,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -494,8 +522,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -526,8 +552,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -558,8 +582,6 @@
     if: always() && !failure() && !cancelled() &&
       ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -587,8 +609,6 @@
     if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -616,8 +636,6 @@
     if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
     runs-on: 4-b200-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
     strategy:
       fail-fast: false
     steps:
@@ -647,7 +665,7 @@
       check-changes,

       sgl-kernel-build-wheels,
-      sgl-kernel-unit-test, sgl-kernel-mla-test,
+      sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test,

       unit-test-frontend, unit-test-backend-1-gpu,
       unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-backend-8-gpu,
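The new `sgl-kernel-benchmark-test` job boils down to a tolerant sweep over the benchmark scripts. A rough local reproduction, under the assumption that you have a GPU machine with the repository checked out and the sgl-kernel wheel already installed, could look like this:

  # Approximate what the CI job does: run every benchmark script briefly,
  # tolerating individual failures so one bad script does not stop the sweep.
  cd sgl-kernel/benchmark
  export CI=true   # the benchmark scripts are expected to shorten their runs in CI mode
  for bench_file in bench_*.py; do
    echo "Testing $bench_file..."
    timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
  done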

docker/Dockerfile.rocm

Lines changed: 90 additions & 3 deletions
@@ -69,6 +69,13 @@ ARG LLVM_COMMIT="6520ace8227ffe2728148d5f3b9872a870b0a560"
 ARG MOONCAKE_REPO="https://github.com/kvcache-ai/Mooncake.git"
 ARG MOONCAKE_COMMIT="dcdf1c784b40aa6975a8ed89fe26321b028e40e8"

+ARG TILELANG_REPO="https://github.com/HaiShaw/tilelang.git"
+ARG TILELANG_BRANCH="dsv32-mi35x"
+ARG TILELANG_COMMIT="ae938cf885743f165a19656d1122ad42bb0e30b8"
+
+ARG FHT_REPO="https://github.com/jeffdaily/fast-hadamard-transform.git"
+ARG FHT_BRANCH="rocm"
+ARG FHT_COMMIT="46efb7d776d38638fc39f3c803eaee3dd7016bd1"
 USER root

 # Install some basic utilities
@@ -90,8 +97,6 @@ RUN if [ "$BUILD_LLVM" = "1" ]; then \
     && make -j$(nproc); \
     fi

-# -----------------------
-
 # -----------------------
 # AITER
 RUN pip uninstall -y aiter
@@ -155,7 +160,6 @@ RUN if [ "$BUILD_MOONCAKE" = "1" ]; then \
     make -j "$(nproc)" && make install; \
     fi

-
 # -----------------------
 # Build SGLang
 ARG BUILD_TYPE=all
@@ -207,6 +211,89 @@ RUN python3 -m pip install --no-cache-dir setuptools-rust \
     && python3 -m pip install --no-cache-dir . \
     && rm -rf /root/.cache

+# -----------------------
+# TileLang
+ENV DEBIAN_FRONTEND=noninteractive
+ENV LIBGL_ALWAYS_INDIRECT=1
+RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment
+
+RUN /bin/bash -lc 'set -euo pipefail; \
+    # Build TileLang only for gfx950
+    if [ "${GPU_ARCH:-}" != "gfx950" ]; then \
+        echo "[TileLang] Skipping (GPU_ARCH=${GPU_ARCH:-unset})"; \
+        exit 0; \
+    fi; \
+    echo "[TileLang] Building TileLang for ${GPU_ARCH}"; \
+    \
+    # System dependencies (NO llvm-dev to avoid llvm-config-16 shadowing)
+    apt-get update && apt-get install -y --no-install-recommends \
+        build-essential git wget curl ca-certificates gnupg \
+        libgtest-dev libgmock-dev \
+        libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \
+        python3 python3-dev python3-setuptools python3-pip \
+        gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \
+        cmake ninja-build pkg-config libstdc++6 \
+        && rm -rf /var/lib/apt/lists/*; \
+    \
+    # Build GoogleTest static libs (Ubuntu package ships sources only)
+    cmake -S /usr/src/googletest -B /tmp/build-gtest -DBUILD_GTEST=ON -DBUILD_GMOCK=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build /tmp/build-gtest -j"$(nproc)" && \
+    cp -v /tmp/build-gtest/lib/*.a /usr/lib/x86_64-linux-gnu/ && \
+    rm -rf /tmp/build-gtest; \
+    \
+    # Keep setuptools < 80 (compat with base image)
+    python3 -m pip install --upgrade "setuptools>=77.0.3,<80" wheel cmake ninja && \
+    python3 -m pip cache purge || true; \
+    \
+    # Locate ROCm llvm-config; fallback to installing LLVM 18 if missing
+    LLVM_CONFIG_PATH=""; \
+    for p in /opt/rocm/llvm/bin/llvm-config /opt/rocm/llvm-*/bin/llvm-config /opt/rocm-*/llvm*/bin/llvm-config; do \
+        if [ -x "$p" ]; then LLVM_CONFIG_PATH="$p"; break; fi; \
+    done; \
+    if [ -z "$LLVM_CONFIG_PATH" ]; then \
+        echo "[TileLang] ROCm llvm-config not found; installing LLVM 18..."; \
+        curl -fsSL https://apt.llvm.org/llvm.sh -o /tmp/llvm.sh; \
+        chmod +x /tmp/llvm.sh; \
+        /tmp/llvm.sh 18; \
+        LLVM_CONFIG_PATH="$(command -v llvm-config-18)"; \
+        if [ -z "$LLVM_CONFIG_PATH" ]; then echo "ERROR: llvm-config-18 not found after install"; exit 1; fi; \
+    fi; \
+    echo "[TileLang] Using LLVM_CONFIG at: $LLVM_CONFIG_PATH"; \
+    export PATH="$(dirname "$LLVM_CONFIG_PATH"):/usr/local/bin:${PATH}"; \
+    export LLVM_CONFIG="$LLVM_CONFIG_PATH"; \
+    \
+    # Optional shim for tools that expect llvm-config-16
+    mkdir -p /usr/local/bin && \
+    printf "#!/usr/bin/env bash\nexec \"%s\" \"\$@\"\n" "$LLVM_CONFIG_PATH" > /usr/local/bin/llvm-config-16 && \
+    chmod +x /usr/local/bin/llvm-config-16; \
+    \
+    # TVM Python bits need Cython
+    python3 -m pip install --no-cache-dir "cython>=0.29.36,<3.0"; \
+    \
+    # Clone + pin TileLang (bundled TVM), then build
+    git clone --recursive --branch "${TILELANG_BRANCH}" "${TILELANG_REPO}" /opt/tilelang && \
+    cd /opt/tilelang && \
+    git fetch --depth=1 origin "${TILELANG_COMMIT}" || true && \
+    git checkout -f "${TILELANG_COMMIT}" && \
+    git submodule update --init --recursive && \
+    export CMAKE_ARGS="-DLLVM_CONFIG=${LLVM_CONFIG} ${CMAKE_ARGS:-}" && \
+    bash ./install_rocm.sh'
+
+# -----------------------
+# Hadamard-transform (HIP build)
+RUN /bin/bash -lc 'set -euo pipefail; \
+    git clone --branch "${FHT_BRANCH}" "${FHT_REPO}" fast-hadamard-transform; \
+    cd fast-hadamard-transform; \
+    git checkout -f "${FHT_COMMIT}"; \
+    python setup.py install'
+
+# -----------------------
+# Python tools
+RUN python3 -m pip install --no-cache-dir \
+    py-spy \
+    pre-commit
+
+# -----------------------
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 ENV HSA_NO_SCRATCH_RECLAIM=1
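Since the TileLang stage above only runs when GPU_ARCH is gfx950, a build targeting MI35x-class GPUs might be invoked roughly as sketched below. The image tag is made up, and it assumes GPU_ARCH is an existing build argument of this Dockerfile (the diff only shows its use inside the new RUN step):

  # Hypothetical build invocation; adjust the tag and build args to your environment.
  docker build \
    -f docker/Dockerfile.rocm \
    --build-arg GPU_ARCH=gfx950 \
    -t sglang-rocm:tilelang-gfx950 \
    .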

docs/advanced_features/hyperparameter_tuning.md

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ The case of a server being too conservative can happen when users send many requ

 On the other hand, if you see `token usage` very high and you frequently see warnings like
 `KV cache pool is full. Retract requests. #retracted_reqs: 1, #new_token_ratio: 0.9998 -> 1.0000`, you can increase `--schedule-conservativeness` to a value like 1.3.
-If you see `KV cache pool is full. Retract requests.` occasionally but not frequently, it is okay.
+If you see `KV cache pool is full. Retract requests.` occasionally but not frequently (~1 time per minute), it is okay.

 ### Tune `--mem-fraction-static` to increase KV cache pool capacity
 SGLang allocates memory as follows:
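As a concrete illustration of the advice above (the model path is a placeholder; the flag name comes from the documentation being edited), raising the scheduler's conservativeness when retractions are frequent might look like:

  # Hypothetical launch command; --schedule-conservativeness is the only flag of interest here.
  python3 -m sglang.launch_server \
    --model-path meta-llama/Llama-3.1-8B-Instruct \
    --schedule-conservativeness 1.3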

docs/advanced_features/server_arguments.md

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
 | `--quantization` | The quantization method. | None |
 | `--quantization-param-path` | Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. | None |
 | `--kv-cache-dtype` | Data type for kv cache storage. 'auto' will use model data type. 'fp8_e5m2' and 'fp8_e4m3' is supported for CUDA 11.8+. | auto |
+| `--enable-fp32-lm-head` | If set, the LM head outputs (logits) are in FP32. | False |

 ## Memory and scheduling
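For example, the newly documented flag would be passed at launch time like any other boolean server argument; a minimal sketch, with the model path as a placeholder and the flag combination purely illustrative:

  # Hypothetical example combining the documented KV-cache and LM-head options.
  python3 -m sglang.launch_server \
    --model-path meta-llama/Llama-3.1-8B-Instruct \
    --kv-cache-dtype fp8_e5m2 \
    --enable-fp32-lm-head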

docs/references/faq.md

Lines changed: 10 additions & 6 deletions
@@ -9,14 +9,20 @@ If you encounter out-of-memory (OOM) errors, you can adjust the following parame

 - If OOM occurs during prefill, try reducing `--chunked-prefill-size` to `4096` or `2048`. This saves memory but slows down the prefill speed for long prompts.
 - If OOM occurs during decoding, try lowering `--max-running-requests`.
-- You can also reduce `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
+- You can also decrease `--mem-fraction-static` to a smaller value, such as 0.8 or 0.7. This decreases the memory usage of the KV cache memory pool and helps prevent OOM errors during both prefill and decoding. However, it limits maximum concurrency and reduces peak throughput.
 - Another common case for OOM is requesting input logprobs for a long prompt as it requires significant memory. To address this, set `logprob_start_len` in your sampling parameters to include only the necessary parts. If you do need input logprobs for a long prompt, try reducing `--mem-fraction-static`.

 ### CUDA Error: Illegal Memory Access Encountered
 This error may result from kernel errors or out-of-memory issues:
 - If it is a kernel error, resolving it may be challenging. Please file an issue on GitHub.
 - If it is an out-of-memory issue, it may sometimes be reported as this error instead of "Out of Memory." Refer to the section above for guidance on avoiding OOM issues.

+### The server hangs
+- If the server hangs during initialization or running, it can be memory issues (out of memory), network issues (nccl errors), or other bugs in sglang.
+- If it is out of memory, you might see that `avail mem` is very low during the initialization or right after initialization. In this case,
+you can try to decrease `--mem-fraction-static`, decrease `--cuda-graph-max-bs`, or decrease `--chunked-prefill-size`.
+- Other bugs, please raise a Github issue to us.
+

 ## Frequently Asked Questions

@@ -28,8 +34,6 @@ From our initial investigation, this indeterminism arises from two factors: dyna

 To achieve more deterministic outputs in the current code, you can add `--disable-radix-cache` and send only one request at a time. The results will be mostly deterministic under this setting.

-We are still investigating the root causes and potential solutions. In the short term, we may introduce a "deterministic mode" that uses more padding to address the variance caused by dynamic batching. This mode will be more deterministic but slower.
-
-We have two issues to track our progress:
-- The deterministic mode is tracked at [https://github.com/sgl-project/sglang/issues/1729](https://github.com/sgl-project/sglang/issues/1729).
-- The per-request random seed is tracked at [https://github.com/sgl-project/sglang/issues/1335](https://github.com/sgl-project/sglang/issues/1335).
+**Note**:
+Recently, we also introduced a deterministic mode, you can enable it with `--enable-deterministic-inference`. It might not work for all cases.
+Please find more details in this blog post: https://lmsys.org/blog/2025-09-22-sglang-deterministic/
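Putting the updated FAQ advice into one command line (the model path and numeric values are illustrative; the flags themselves appear in the text above):

  # Hypothetical launch that applies the OOM and determinism guidance from the FAQ.
  python3 -m sglang.launch_server \
    --model-path meta-llama/Llama-3.1-8B-Instruct \
    --mem-fraction-static 0.7 \
    --chunked-prefill-size 2048 \
    --enable-deterministic-inference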
