Skip to content

Commit 5313be9

Browse files
authored
[CI] Remind re-run when auto_parallel CI exit -6 (#69212)
1 parent 36b88c8 commit 5313be9

File tree

1 file changed

+100
-28
lines changed

1 file changed

+100
-28
lines changed

tools/auto_parallel/ci_auto_parallel.sh

Lines changed: 100 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ mkdir -p /workspace/case_logs
2121
export log_path=/workspace/case_logs
2222
export case_list=()
2323

24+
global_total_count=0
25+
global_success_count=0
26+
global_exit_250_arr=()
27+
global_runtime_fail_arr=()
28+
global_verification_fail_arr=()
29+
2430
install_paddle(){
2531
echo -e "\033[31m ---- Install paddlepaddle-gpu \033"
2632
if [ -n "$paddle" ];then
@@ -70,17 +76,68 @@ if [[ $1 -ne 0 ]] && [[ $1 -ne 250 ]];then
7076
EXCODE=2
7177
if [ ! -f ${log_path}/$2 ];then
7278
echo -e "\033[31m run $2 CI FAIL \033"
73-
else
74-
mv ${log_path}/$2 ${log_path}/$2_FAIL.log
75-
echo -e "\033[31m ${log_path}/$2_FAIL \033"
76-
tail -70 ${log_path}/$2_FAIL.log
79+
else
80+
mv ${log_path}/$2 ${log_path}/$2_FAIL.log
81+
echo -e "\033[31m ${log_path}/$2_FAIL \033"
82+
tail -70 ${log_path}/$2_FAIL.log
7783
fi
7884
exit $EXCODE
7985
else
80-
echo -e "\033[32m run $3 CI SUCCESS \033"
86+
echo -e "\033[32m The $3 CI has completed \033"
8187
fi
8288
}
8389

90+
#######################################
# Run every case named in ${log_path}/functions.txt, one per line, by invoking
#   bash <script> exec_case <case_name> $FLAGS_install_deps $FLAGS_download_data
# and classify each case by the exit status of that command:
#   0     -> success
#   2     -> verification failure
#   250   -> retried once; recorded as "exit 250" if the retry also returns 250
#   other -> runtime failure; the case log (if any) is renamed to
#            <case_name>_FAIL.log and its last 15 lines are printed
# Globals:
#   log_path (read), FLAGS_install_deps (read), FLAGS_download_data (read)
#   global_total_count, global_success_count (incremented)
#   global_exit_250_arr, global_runtime_fail_arr,
#   global_verification_fail_arr (appended)
#   total_count, success_count, runtime_fail_count, verification_fail_count,
#   exit_250_count (reset and written; intentionally left global so existing
#   readers of these per-suite counters keep working)
# Arguments:
#   $1 - path of the case script that implements "exec_case"
#   $2 - suite name, used only in the summary output
# Outputs:
#   progress and a per-suite summary on stdout
# Returns:
#   1 if log_path cannot be entered or functions.txt is not readable,
#   otherwise the status of the final summary echo (0).
# Side effects:
#   changes the current directory to ${log_path}
#######################################
execute_func_list() {
  local case_script=$1
  local suite_name=$2
  local func_name attempt result

  cd "${log_path}" || { echo "Failed to enter log_path: $log_path" >&2; return 1; }

  total_count=0
  success_count=0
  runtime_fail_count=0
  verification_fail_count=0
  exit_250_count=0

  if [[ ! -r functions.txt ]]; then
    echo "Failed to read case list: ${log_path}/functions.txt" >&2
    return 1
  fi

  # The list is read on fd 3 rather than stdin: the child script inherits
  # stdin, and anything in it that reads stdin would otherwise consume the
  # remaining case names. "|| [[ -n ... ]]" keeps a last line that has no
  # trailing newline.
  while IFS= read -r -u 3 func_name || [[ -n "${func_name}" ]]; do
    # A blank line is not a case name.
    [[ -n "${func_name}" ]] || continue

    total_count=$((total_count + 1))
    global_total_count=$((global_total_count + 1))

    attempt=1
    while true; do
      # The FLAGS_* values are deliberately unquoted: FLAGS_download_data is
      # a space-separated list that callers expect to be word-split.
      # shellcheck disable=SC2086
      bash "${case_script}" exec_case "${func_name}" $FLAGS_install_deps $FLAGS_download_data
      result=$?
      if [[ ${result} -eq 250 && ${attempt} -eq 1 ]]; then
        # Exit 250 is treated as intermittent: give the case one more try.
        echo -e "\033[31m first time execute failed, try again!"
        attempt=2
        continue
      fi
      break
    done

    case ${result} in
      0)
        echo -e "\033[32m test success!"
        success_count=$((success_count + 1))
        global_success_count=$((global_success_count + 1))
        ;;
      2)
        echo -e "\033[31m verification failed!"
        verification_fail_count=$((verification_fail_count + 1))
        global_verification_fail_arr+=("${func_name}")
        ;;
      250)
        echo -e "\033[31m second time execute failed, exit!"
        exit_250_count=$((exit_250_count + 1))
        global_exit_250_arr+=("${func_name}")
        ;;
      *)
        echo "test failed!"
        # The case may have died before writing a log; only rename what exists.
        if [[ -f "${log_path}/${func_name}" ]]; then
          mv -- "${log_path}/${func_name}" "${log_path}/${func_name}_FAIL.log"
        fi
        # Braces are required: "$func_name_FAIL" would expand a different,
        # unset variable and print an empty name.
        echo -e "\033[31m ${log_path}/${func_name}_FAIL \033"
        if [[ -f "${log_path}/${func_name}_FAIL.log" ]]; then
          tail -n 15 -- "${log_path}/${func_name}_FAIL.log"
        fi
        runtime_fail_count=$((runtime_fail_count + 1))
        global_runtime_fail_arr+=("${func_name}")
        ;;
    esac
  done 3< functions.txt

  echo -e "\033[31m ${suite_name} test case has completed \033"
  echo -e "\033[31m \t total tests : ${total_count} \033"
  echo -e "\033[31m \t success tests : ${success_count} \033"
  echo -e "\033[31m \t runtime fail tests : ${runtime_fail_count} \033"
  echo -e "\033[31m \t verification fail tests : ${verification_fail_count} \033"
  echo -e "\033[31m \t exit 250 tests(intermittent issue) : ${exit_250_count} \033"
}
140+
84141
# Get the list of pending cases
85142
get_diff_TO_case
86143
# Remove duplicates and store the results back to the original list
@@ -101,24 +158,10 @@ if [[ ${#case_list[*]} -ne 0 ]];then
101158
export FLAGS_install_deps=0
102159
for case in ${case_list[*]};do
103160
echo -e "\033[31m ---- running case $case_num/${#case_list[*]}: ${case} \033"
104-
if [[ ${case} == "llama_auto" ]];then
105-
bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
106-
print_info $? `ls -lt ${log_path} | grep "llama" | head -n 1 | awk '{print $9}'` ${case}
107-
export FLAGS_install_deps=1
108-
export FLAGS_download_data="llama ""$FLAGS_download_data"
109-
let case_num++
110-
elif [[ ${case} == "auto_unit_test" ]];then
161+
if [[ ${case} == "auto_unit_test" ]];then
111162
bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh auto_unit_test
112163
print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
113164
let case_num++
114-
elif [[ ${case} == "gpt-3_auto" ]];then
115-
bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
116-
print_info $? `ls -lt ${log_path} | grep "llm_gpt_dygraph_auto_" | head -n 1 | awk '{print $9}'` ${case}
117-
let case_num++
118-
elif [[ ${case} == "gpt-3_dygraph" ]];then
119-
bash /workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
120-
print_info $? `ls -lt ${log_path} | grep "llm_gpt" | head -n 1 | awk '{print $9}'` ${case}
121-
let case_num++
122165
elif [[ ${case} == "dygraph_unit_test" ]];then
123166
bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh dygraph_unit_test
124167
print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
@@ -127,22 +170,51 @@ if [[ ${#case_list[*]} -ne 0 ]];then
127170
bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh llama_auto_unit_test
128171
print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
129172
let case_num++
173+
elif [[ ${case} == "llama_auto" ]];then
174+
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
175+
bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
176+
execute_func_list $cmd llama_auto
177+
export FLAGS_install_deps=1
178+
export FLAGS_download_data="llama ""$FLAGS_download_data"
179+
let case_num++
180+
elif [[ ${case} == "gpt-3_auto" ]];then
181+
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
182+
bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
183+
execute_func_list $cmd gpt-3_auto
184+
let case_num++
185+
elif [[ ${case} == "gpt-3_dygraph" ]];then
186+
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh
187+
bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
188+
execute_func_list $cmd gpt-3_dygraph
189+
let case_num++
130190
else
131191
echo -e "\033[31m ---- no ${case} \033"
132192
let case_num++
133193
fi
134194
done
135195
echo -e "\033[31m ---- end run case \033"
136-
cd ${log_path}
137-
if [ ! -f *FAIL* ];then
138-
FF=0
196+
197+
echo -e "\033[31m ---- total tests : $global_total_count \033"
198+
if [ ${#global_exit_250_arr[@]} -ne 0 ]; then
199+
echo -e "\033[32m ---- exit 250 test : ${#global_exit_250_arr[@]} \033"
200+
for case in "${global_exit_250_arr[@]}"; do
201+
echo -e "\t$case(exit 250)"
202+
done
203+
fi
204+
205+
if [ ${#global_runtime_fail_arr[@]} -eq 0 ] && [ ${#global_verification_fail_arr[@]} -eq 0 ]; then
206+
echo -e "\033[32m ---- all cases Success \033"
139207
EXCODE=0
140-
echo -e "\033[32m ---- all case Success \033"
141208
else
142-
FF=`ls *FAIL*|wc -l`
143-
EXCODE=2
144-
echo -e "\033[31m ---- case Failed number: ${FF} \033"
145-
ls *_FAIL*
209+
echo -e "\033[32m ---- runtime failed test : ${#global_runtime_fail_arr[@]} \033"
210+
for case in "${global_runtime_fail_arr[@]}"; do
211+
echo -e "\t$case(failed)"
212+
done
213+
echo -e "\033[32m ---- verification failed test : ${#global_verification_fail_arr[@]} \033"
214+
for case in "${global_verification_fail_arr[@]}"; do
215+
echo -e "\t$case(failed)"
216+
done
217+
EXCODE=1
146218
fi
147219
else
148220
echo -e "\033[32m Changed Not CI case, Skips \033"

0 commit comments

Comments
 (0)