
Commit 90ae114

authored by wangli
[CI] Fix nightly CI (#3821)
### What this PR does / why we need it?

This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365).

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: vllm-project/vllm@releases/v0.11.1

Signed-off-by: wangli <[email protected]>
1 parent a7450db commit 90ae114

File tree

6 files changed: +79 −24 lines changed


.github/workflows/_e2e_nightly_multi_node.yaml

Lines changed: 8 additions & 3 deletions
@@ -88,12 +88,17 @@ jobs:
       - name: Install kubectl
         run: |
           # Install kubectl
+          arch=$(uname -m)
+
+          if echo "$arch" | grep -qiE "arm|aarch64"; then
+            echo "Detected ARM architecture: $arch"
+            KUBECTL="$KUBECTL"_arm
+          fi
           install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
-
+
           # Verify kubectl installation
           kubectl version --client=true

-          # TODO: Add A2 tests
       - name: Decode kubeconfig from secrets
         run: |
           # Decode and save kubeconfig
@@ -175,7 +180,7 @@ jobs:

       - name: Determine is success
         run: |
-          TIMEOUT=600
+          TIMEOUT=300
           ELAPSED=0
           while [ ! -f "$RESULT_FILE" ]; do
             sleep 5

tests/e2e/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def __init__(self,
                  model: str,
                  vllm_serve_args: Union[list[str], str],
                  *,
-                 server_host: str = "0.0.0.0",
+                 server_host: str = '0.0.0.0',
                  server_port: int = 8080,
                  env_dict: Optional[dict[str, str]] = None,
                  seed: Optional[int] = None,

tests/e2e/nightly/multi_node/config/multi_node_config.py

Lines changed: 4 additions & 8 deletions
@@ -84,16 +84,17 @@ def _init_dist_env(self):
         self.envs["LOCAL_IP"] = self.cur_ip
         self.envs["NIC_NAME"] = self.nic_name

+        master_ip = self.cluster_ips[0]
         if self.disaggregated_prefill:
             self.envs[
                 "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
                     "ranktable_path")
             if self.cur_index < self.decode_start_index:
-                self.envs["MASTER_IP"] = self.cluster_ips[0]
+                master_ip = self.cluster_ips[0]
             else:
-                self.envs["MASTER_IP"] = self.cluster_ips[
-                    self.decode_start_index]
+                master_ip = self.cluster_ips[self.decode_start_index]

+        self.envs["MASTER_IP"] = master_ip
         ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
         self.envs[
             "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
@@ -288,8 +289,3 @@ def _gen_ranktable(self):
         subprocess.run(cmd, env=env, check=True)
         assert os.path.exists(
             str(ranktable_path)), "failed generate ranktable.json"
-
-
-if __name__ == '__main__':
-    config = MultiNodeConfig.from_yaml()
-    print(config.perf_cmd)
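For clarity, the refactor above computes the master address once and writes MASTER_IP in a single place. A minimal standalone sketch of the same selection rule (the function name is hypothetical; cluster_ips, cur_index, and decode_start_index mirror the config fields used above):

def select_master_ip(cluster_ips: list[str], cur_index: int,
                     decode_start_index: int,
                     disaggregated_prefill: bool) -> str:
    # Default: every node rendezvous on the first cluster IP.
    master_ip = cluster_ips[0]
    if disaggregated_prefill:
        if cur_index < decode_start_index:
            # Prefill nodes keep the first cluster IP as master.
            master_ip = cluster_ips[0]
        else:
            # Decode nodes use the first decode node as master.
            master_ip = cluster_ips[decode_start_index]
    return master_ip


# Example: a 4-node cluster where nodes 2 and 3 serve decode.
ips = ["10.0.0.1", "10.0.0.2", "10.0.0.3", "10.0.0.4"]
assert select_master_ip(ips, cur_index=3, decode_start_index=2,
                        disaggregated_prefill=True) == "10.0.0.3"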

tests/e2e/nightly/multi_node/scripts/run.sh

Lines changed: 3 additions & 3 deletions
@@ -121,7 +121,7 @@ download_go() {
 }

 install_ais_bench() {
-    local AIS_BENCH="$SRC_DIR/benchmark"
+    local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark"
     git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
     cd $AIS_BENCH
     git checkout v3.0-20250930-master
@@ -166,8 +166,8 @@ run_tests() {
     kill_npu_processes
     ret=$?
     if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
-        mkdir -p "$(dirname "$RESULT_PATH")"
-        echo $ret > "$RESULT_PATH"
+        mkdir -p "$(dirname "$RESULT_FILE_PATH")"
+        echo $ret > "$RESULT_FILE_PATH"
     fi
     return $ret
 }

tests/e2e/nightly/multi_node/test_multi_node.py

Lines changed: 4 additions & 6 deletions
@@ -48,7 +48,7 @@ def get_local_model_path_with_retry(
 async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
                           **api_kwargs: Any) -> List[str]:
     """
-    Asynchronously send HTTP requests to a /v1/completions endpoint.
+    Asynchronously send HTTP requests to endpoint.

     Args:
         url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
@@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
 @pytest.mark.asyncio
 async def test_multi_node() -> None:
     config = MultiNodeConfig.from_yaml()
+    # To avoid modelscope 400 HttpError, we should download the model with retry
     local_model_path = get_local_model_path_with_retry(config.model)
+    config.server_cmd = config.server_cmd.replace(config.model,
+                                                  local_model_path)
     assert local_model_path is not None, "can not find any local weight for test"
     env_dict = config.envs
     perf_cmd = config.perf_cmd
@@ -113,11 +116,6 @@ async def test_multi_node() -> None:
     ) as remote_server:
         if config.is_master:
             port = proxy_port if disaggregated_prefill else server_port
-            base_url = f"http://localhost:{port}/v1/completions"
-            _ = await get_completions(url=base_url,
-                                      model=local_model_path,
-                                      prompts=prompts,
-                                      api_kwargs=api_keyword_args)
             # aisbench test
             if acc_cmd:
                 run_aisbench_cases(local_model_path, port, acc_cmd)
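The retry helper get_local_model_path_with_retry is only referenced here, not shown in this diff. A hypothetical sketch of what such a retry wrapper around the ModelScope download could look like (the name, retry count, and backoff are assumptions, not the repo's actual implementation):

import time
from typing import Optional

from modelscope import snapshot_download  # type: ignore


def download_with_retry(model: str,
                        max_attempts: int = 3,
                        backoff_s: float = 5.0) -> Optional[str]:
    # Retry a few times so that transient ModelScope 400 HttpErrors
    # do not fail the whole nightly run.
    for attempt in range(1, max_attempts + 1):
        try:
            return snapshot_download(model)
        except Exception as exc:  # retry on any transient error
            if attempt == max_attempts:
                raise
            print(f"download attempt {attempt} failed: {exc}; retrying")
            time.sleep(backoff_s)
    return None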

tools/aisbench.py

Lines changed: 59 additions & 3 deletions
@@ -14,11 +14,16 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
+import hashlib
 import json
 import os
 import re
 import subprocess
+import tempfile
+from pathlib import Path

+import filelock
+import huggingface_hub
 import pandas as pd
 from modelscope import snapshot_download  # type: ignore

@@ -63,10 +68,12 @@ def __init__(self,
                  port: int,
                  aisbench_config: dict,
                  verify=True):
-        self.dataset_path = snapshot_download(aisbench_config["dataset_path"],
-                                              repo_type='dataset')
         self.model = model
-        self.model_path = snapshot_download(model)
+        self.dataset_path = maybe_download_from_modelscope(
+            aisbench_config["dataset_path"], repo_type="dataset")
+        self.model_path = maybe_download_from_modelscope(model)
+        assert self.dataset_path is not None and self.model_path is not None, \
+            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
         self.port = port
         self.task_type = aisbench_config["case_type"]
         self.request_conf = aisbench_config["request_conf"]
@@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases):
 def get_TTFT(result):
     TTFT = result[0][0].loc["TTFT", "Average"][:-3]
     return float(TTFT)
+
+
+temp_dir = tempfile.gettempdir()
+
+
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
+    lock_dir = cache_dir or temp_dir
+    model_name_or_path = str(model_name_or_path)
+    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+    model_name = model_name_or_path.replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                             mode=0o666)
+    return lock
+
+
+def maybe_download_from_modelscope(
+    model: str,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
+    """
+    Download model/dataset from ModelScope hub.
+    Returns the path to the downloaded model, or None if the model is not
+    downloaded from ModelScope.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model, download_dir):
+        if not os.path.exists(model):
+            model_path = snapshot_download(
+                model_id=model,
+                repo_type=repo_type,
+                cache_dir=download_dir,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                revision=revision,
+                ignore_file_pattern=ignore_patterns,
+                allow_patterns=allow_patterns,
+            )
+        else:
+            model_path = model
+        return model_path
+    return None
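A short usage sketch of the new helper (assuming tools/ is importable as a package; the model and dataset IDs below are placeholders). The file lock in get_lock serializes concurrent downloads of the same artifact across test workers:

from tools.aisbench import maybe_download_from_modelscope

# Resolve local paths; if the argument is already an existing local path,
# it is returned unchanged instead of being downloaded.
model_path = maybe_download_from_modelscope("Qwen/Qwen2.5-7B-Instruct")
dataset_path = maybe_download_from_modelscope("some-org/some-dataset",
                                              repo_type="dataset")
print(model_path, dataset_path)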
