
Commit 90ae114

authored by wangli
[CI] Fix nightly CI (#3821)
### What this PR does / why we need it?

This patch fixes the nightly CI run [failure](https://github.com/vllm-project/vllm-ascend/actions/runs/18848144365).

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: vllm-project/vllm@releases/v0.11.1

Signed-off-by: wangli <[email protected]>
1 parent a7450db commit 90ae114

File tree

6 files changed: +79 −24 lines changed


.github/workflows/_e2e_nightly_multi_node.yaml

Lines changed: 8 additions & 3 deletions
@@ -88,12 +88,17 @@ jobs:
       - name: Install kubectl
         run: |
           # Install kubectl
+          arch=$(uname -m)
+
+          if echo "$arch" | grep -qiE "arm|aarch64"; then
+            echo "Detected ARM architecture: $arch"
+            KUBECTL="$KUBECTL"_arm
+          fi
           install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
-
+
           # Verify kubectl installation
           kubectl version --client=true

-          # TODO: Add A2 tests
       - name: Decode kubeconfig from secrets
         run: |
           # Decode and save kubeconfig
@@ -175,7 +180,7 @@ jobs:

       - name: Determine is success
         run: |
-          TIMEOUT=600
+          TIMEOUT=300
           ELAPSED=0
           while [ ! -f "$RESULT_FILE" ]; do
             sleep 5

tests/e2e/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def __init__(self,
                  model: str,
                  vllm_serve_args: Union[list[str], str],
                  *,
-                 server_host: str = "0.0.0.0",
+                 server_host: str = '0.0.0.0',
                  server_port: int = 8080,
                  env_dict: Optional[dict[str, str]] = None,
                  seed: Optional[int] = None,

tests/e2e/nightly/multi_node/config/multi_node_config.py

Lines changed: 4 additions & 8 deletions
@@ -84,16 +84,17 @@ def _init_dist_env(self):
         self.envs["LOCAL_IP"] = self.cur_ip
         self.envs["NIC_NAME"] = self.nic_name

+        master_ip = self.cluster_ips[0]
         if self.disaggregated_prefill:
             self.envs[
                 "DISAGGREGATED_PREFILL_RANK_TABLE_PATH"] = self.disaggregated_prefill.get(
                     "ranktable_path")
             if self.cur_index < self.decode_start_index:
-                self.envs["MASTER_IP"] = self.cluster_ips[0]
+                master_ip = self.cluster_ips[0]
             else:
-                self.envs["MASTER_IP"] = self.cluster_ips[
-                    self.decode_start_index]
+                master_ip = self.cluster_ips[self.decode_start_index]

+        self.envs["MASTER_IP"] = master_ip
         ascend_path = "/usr/local/Ascend/ascend-toolkit/latest/python/site-packages"
         self.envs[
             "LD_LIBRARY_PATH"] = f"{ascend_path}:{self.envs.get('LD_LIBRARY_PATH', os.environ.get('LD_LIBRARY_PATH', ''))}"
@@ -288,8 +289,3 @@ def _gen_ranktable(self):
         subprocess.run(cmd, env=env, check=True)
         assert os.path.exists(
             str(ranktable_path)), "failed generate ranktable.json"
-
-
-if __name__ == '__main__':
-    config = MultiNodeConfig.from_yaml()
-    print(config.perf_cmd)
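For clarity, the refactor above computes the master address once and writes MASTER_IP in a single place. A minimal standalone sketch of the same selection rule (the function name is hypothetical; cluster_ips, cur_index, and decode_start_index mirror the config fields used above):

def select_master_ip(cluster_ips: list[str], cur_index: int,
                     decode_start_index: int,
                     disaggregated_prefill: bool) -> str:
    # Default: every node rendezvous on the first cluster IP.
    master_ip = cluster_ips[0]
    if disaggregated_prefill:
        if cur_index < decode_start_index:
            # Prefill nodes keep the first cluster IP as master.
            master_ip = cluster_ips[0]
        else:
            # Decode nodes use the first decode node as master.
            master_ip = cluster_ips[decode_start_index]
    return master_ip


# Example: a 4-node cluster where nodes 2 and 3 serve decode.
ips = ["10.0.0.1", "10.0.0.2", "10.0.0.3", "10.0.0.4"]
assert select_master_ip(ips, cur_index=3, decode_start_index=2,
                        disaggregated_prefill=True) == "10.0.0.3"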

tests/e2e/nightly/multi_node/scripts/run.sh

Lines changed: 3 additions & 3 deletions
@@ -121,7 +121,7 @@ download_go() {
 }

 install_ais_bench() {
-    local AIS_BENCH="$SRC_DIR/benchmark"
+    local AIS_BENCH="$SRC_DIR/vllm-ascend/benchmark"
     git clone https://gitee.com/aisbench/benchmark.git $AIS_BENCH
     cd $AIS_BENCH
     git checkout v3.0-20250930-master
@@ -166,8 +166,8 @@ run_tests() {
     kill_npu_processes
     ret=$?
     if [ "$LWS_WORKER_INDEX" -eq 0 ]; then
-        mkdir -p "$(dirname "$RESULT_PATH")"
-        echo $ret > "$RESULT_PATH"
+        mkdir -p "$(dirname "$RESULT_FILE_PATH")"
+        echo $ret > "$RESULT_FILE_PATH"
     fi
     return $ret
 }

tests/e2e/nightly/multi_node/test_multi_node.py

Lines changed: 4 additions & 6 deletions
@@ -48,7 +48,7 @@ def get_local_model_path_with_retry(
 async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
                           **api_kwargs: Any) -> List[str]:
     """
-    Asynchronously send HTTP requests to a /v1/completions endpoint.
+    Asynchronously send HTTP requests to endpoint.

     Args:
         url: Full endpoint URL, e.g. "http://localhost:1025/v1/completions"
@@ -88,7 +88,10 @@ async def get_completions(url: str, model: str, prompts: Union[str, List[str]],
 @pytest.mark.asyncio
 async def test_multi_node() -> None:
     config = MultiNodeConfig.from_yaml()
+    # To avoid modelscope 400 HttpError, we should download the model with retry
     local_model_path = get_local_model_path_with_retry(config.model)
+    config.server_cmd = config.server_cmd.replace(config.model,
+                                                  local_model_path)
     assert local_model_path is not None, "can not find any local weight for test"
     env_dict = config.envs
     perf_cmd = config.perf_cmd
@@ -113,11 +116,6 @@ async def test_multi_node() -> None:
     ) as remote_server:
         if config.is_master:
             port = proxy_port if disaggregated_prefill else server_port
-            base_url = f"http://localhost:{port}/v1/completions"
-            _ = await get_completions(url=base_url,
-                                      model=local_model_path,
-                                      prompts=prompts,
-                                      api_kwargs=api_keyword_args)
             # aisbench test
             if acc_cmd:
                 run_aisbench_cases(local_model_path, port, acc_cmd)
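The retry helper get_local_model_path_with_retry is only referenced here, not shown in this diff. A hypothetical sketch of what such a retry wrapper around the ModelScope download could look like (the name, retry count, and backoff are assumptions, not the repo's actual implementation):

import time
from typing import Optional

from modelscope import snapshot_download  # type: ignore


def download_with_retry(model: str,
                        max_attempts: int = 3,
                        backoff_s: float = 5.0) -> Optional[str]:
    # Retry a few times so that transient ModelScope 400 HttpErrors
    # do not fail the whole nightly run.
    for attempt in range(1, max_attempts + 1):
        try:
            return snapshot_download(model)
        except Exception as exc:  # retry on any transient error
            if attempt == max_attempts:
                raise
            print(f"download attempt {attempt} failed: {exc}; retrying")
            time.sleep(backoff_s)
    return None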

tools/aisbench.py

Lines changed: 59 additions & 3 deletions
@@ -14,11 +14,16 @@
 # limitations under the License.
 # This file is a part of the vllm-ascend project.
 #
+import hashlib
 import json
 import os
 import re
 import subprocess
+import tempfile
+from pathlib import Path

+import filelock
+import huggingface_hub
 import pandas as pd
 from modelscope import snapshot_download  # type: ignore

@@ -63,10 +68,12 @@ def __init__(self,
                  port: int,
                  aisbench_config: dict,
                  verify=True):
-        self.dataset_path = snapshot_download(aisbench_config["dataset_path"],
-                                              repo_type='dataset')
         self.model = model
-        self.model_path = snapshot_download(model)
+        self.dataset_path = maybe_download_from_modelscope(
+            aisbench_config["dataset_path"], repo_type="dataset")
+        self.model_path = maybe_download_from_modelscope(model)
+        assert self.dataset_path is not None and self.model_path is not None, \
+            f"Failed to download dataset or model: dataset={self.dataset_path}, model={self.model_path}"
         self.port = port
         self.task_type = aisbench_config["case_type"]
         self.request_conf = aisbench_config["request_conf"]
@@ -254,3 +261,52 @@ def run_aisbench_cases(model, port, aisbench_cases):
 def get_TTFT(result):
     TTFT = result[0][0].loc["TTFT", "Average"][:-3]
     return float(TTFT)
+
+
+temp_dir = tempfile.gettempdir()
+
+
+def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
+    lock_dir = cache_dir or temp_dir
+    model_name_or_path = str(model_name_or_path)
+    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+    model_name = model_name_or_path.replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name),
+                             mode=0o666)
+    return lock
+
+
+def maybe_download_from_modelscope(
+    model: str,
+    repo_type: str | None = None,
+    revision: str | None = None,
+    download_dir: str | None = None,
+    ignore_patterns: str | list[str] | None = None,
+    allow_patterns: list[str] | str | None = None,
+) -> str | None:
+    """
+    Download model/dataset from ModelScope hub.
+    Returns the path to the downloaded model, or None if the model is not
+    downloaded from ModelScope.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model, download_dir):
+        if not os.path.exists(model):
+            model_path = snapshot_download(
+                model_id=model,
+                repo_type=repo_type,
+                cache_dir=download_dir,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                revision=revision,
+                ignore_file_pattern=ignore_patterns,
+                allow_patterns=allow_patterns,
+            )
+        else:
+            model_path = model
+        return model_path
+    return None
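A short usage sketch of the new helper (assuming tools/ is importable as a package; the model and dataset IDs below are placeholders). The file lock in get_lock serializes concurrent downloads of the same artifact across test workers:

from tools.aisbench import maybe_download_from_modelscope

# Resolve local paths; if the argument is already an existing local path,
# it is returned unchanged instead of being downloaded.
model_path = maybe_download_from_modelscope("Qwen/Qwen2.5-7B-Instruct")
dataset_path = maybe_download_from_modelscope("some-org/some-dataset",
                                              repo_type="dataset")
print(model_path, dataset_path)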
