[CI] Remind re-run when auto_parallel CI exit -6 (#69212)

waliwali777 · web-flow · commit 5313be944182 · 2024-12-04T14:14:42.000+08:00
diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh
@@ -21,6 +21,12 @@ mkdir -p /workspace/case_logs
 export log_path=/workspace/case_logs
 export case_list=()
 
+global_total_count=0
+global_success_count=0
+global_exit_250_arr=()
+global_runtime_fail_arr=()
+global_verification_fail_arr=()
+
 install_paddle(){
     echo -e "\033[31m ---- Install paddlepaddle-gpu  \033"
     if [ -n "$paddle" ];then
@@ -70,17 +76,68 @@ if [[ $1 -ne 0 ]] && [[ $1 -ne 250 ]];then
     EXCODE=2
     if [ ! -f ${log_path}/$2 ];then
         echo -e "\033[31m run $2 CI FAIL \033"
-    else
-        mv ${log_path}/$2 ${log_path}/$2_FAIL.log
-        echo -e "\033[31m ${log_path}/$2_FAIL \033"
-        tail -70 ${log_path}/$2_FAIL.log
+else
+    mv ${log_path}/$2 ${log_path}/$2_FAIL.log
+    echo -e "\033[31m ${log_path}/$2_FAIL \033"
+    tail -70 ${log_path}/$2_FAIL.log
     fi
     exit $EXCODE
 else
-    echo -e "\033[32m run $3 CI SUCCESS \033"
+    echo -e "\033[32m The $3 CI has completed \033"
 fi
 }
 
+function execute_func_list(){
+    cd ${log_path} || { echo "Failed to enter log_path: $log_path"; return 1; }
+    total_count=0
+    success_count=0
+    runtime_fail_count=0
+    verification_fail_count=0
+    exit_250_count=0
+    while IFS= read -r func_name; do
+        let total_count++
+        let global_total_count++
+        execute_num=1
+        while true; do
+            bash $1 exec_case $func_name $FLAGS_install_deps $FLAGS_download_data
+            result=$?
+            if [ $result -eq 0 ]; then
+                echo -e "\033[32m test success!"
+                let success_count++
+                let global_success_count++
+            elif [ $result -eq 2 ]; then
+                echo -e "\033[31m verification failed!"
+                let verification_fail_count++
+                global_verification_fail_arr+=("$func_name")
+            elif [ $result -eq 250 ]; then
+                if [ $execute_num -eq 1 ]; then
+                    echo -e "\033[31m fist time execute failed, try again!"
+                    let execute_num++
+                    continue
+                else
+                    echo -e "\033[31m second time execute failed, exit!"
+                    let exit_250_count++
+                    global_exit_250_arr+=("$func_name")
+                fi
+            else
+                echo "test failed!"
+                mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log
+                echo -e "\033[31m ${log_path}/$func_name_FAIL \033"
+                tail -15 ${log_path}/${func_name}_FAIL.log
+                let runtime_fail_count++
+                global_runtime_fail_arr+=("$func_name")
+            fi
+            break
+        done
+    done < functions.txt
+    echo -e "\033[31m $2 test case has complicated \033"
+    echo -e "\033[31m $(printf '\t')  total tests :  $total_count \033"
+    echo -e "\033[31m $(printf '\t')  success tests :  $success_count \033"
+    echo -e "\033[31m $(printf '\t')  runtime fail tests :  $runtime_fail_count \033"
+    echo -e "\033[31m $(printf '\t')  verification fail tests :  $verification_fail_count \033"
+    echo -e "\033[31m $(printf '\t')  exit 250 tests(intermittent issue) :  $exit_250_count \033"
+}
+
 # Get the list of pending cases
 get_diff_TO_case
 # Remove duplicates and store the results back to the original list
@@ -101,24 +158,10 @@ if [[ ${#case_list[*]} -ne 0 ]];then
     export FLAGS_install_deps=0
     for case in ${case_list[*]};do
         echo -e "\033[31m ---- running case $case_num/${#case_list[*]}: ${case} \033"
-        if [[ ${case} == "llama_auto" ]];then
-            bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
-            print_info $? `ls -lt ${log_path} | grep "llama" | head -n 1 | awk '{print $9}'` ${case}
-            export FLAGS_install_deps=1
-            export FLAGS_download_data="llama ""$FLAGS_download_data"
-            let case_num++
-        elif [[ ${case} == "auto_unit_test" ]];then
+        if [[ ${case} == "auto_unit_test" ]];then
             bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh auto_unit_test
             print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
             let case_num++
-        elif [[ ${case} == "gpt-3_auto" ]];then
-            bash /workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
-            print_info $? `ls -lt ${log_path} | grep "llm_gpt_dygraph_auto_" | head -n 1 | awk '{print $9}'` ${case}
-            let case_num++
-        elif [[ ${case} == "gpt-3_dygraph" ]];then
-            bash /workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
-            print_info $? `ls -lt ${log_path} | grep "llm_gpt" | head -n 1 | awk '{print $9}'` ${case}
-            let case_num++
         elif [[ ${case} == "dygraph_unit_test" ]];then
             bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh dygraph_unit_test
             print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
@@ -127,22 +170,51 @@ if [[ ${#case_list[*]} -ne 0 ]];then
             bash /workspace/Paddle/tools/auto_parallel/ci_case_unit.sh llama_auto_unit_test
             print_info $? `ls -lt ${log_path} | grep "test" | head -n 1 | awk '{print $9}'` ${case}
             let case_num++
+        elif [[ ${case} == "llama_auto" ]];then
+            cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
+            bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
+            execute_func_list $cmd llama_auto
+            export FLAGS_install_deps=1
+            export FLAGS_download_data="llama ""$FLAGS_download_data"
+            let case_num++
+        elif [[ ${case} == "gpt-3_auto" ]];then
+            cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
+            bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
+            execute_func_list $cmd gpt-3_auto
+            let case_num++
+        elif [[ ${case} == "gpt-3_dygraph" ]];then
+            cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh
+            bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
+            execute_func_list $cmd gpt-3_dygraph
+            let case_num++
         else
             echo -e "\033[31m ---- no ${case} \033"
             let case_num++
         fi
     done
     echo -e "\033[31m ---- end run case  \033"
-    cd ${log_path}
-    if [ ! -f *FAIL* ];then
-        FF=0
+
+    echo -e "\033[31m ---- total tests :  $global_total_count \033"
+    if [ ${#global_exit_250_arr[@]} -ne 0 ]; then
+        echo -e "\033[32m ---- exit 250 test  :  ${#global_exit_250_arr[@]} \033"
+        for case in "${global_exit_250_arr[@]}"; do
+            echo -e "\t$case(exit 250)"
+        done
+    fi
+
+    if [ ${#global_runtime_fail_arr[@]} -eq 0 ] && [ ${#global_verification_fail_arr[@]} -eq 0 ]; then
+        echo -e "\033[32m ---- all cases Success  \033"
         EXCODE=0
-        echo -e "\033[32m ---- all case Success \033"
     else
-        FF=`ls *FAIL*|wc -l`
-        EXCODE=2
-        echo -e "\033[31m ---- case Failed number: ${FF} \033"
-        ls *_FAIL*
+        echo -e "\033[32m ---- runtime failed test  :  ${#global_runtime_fail_arr[@]} \033"
+        for case in "${global_runtime_fail_arr[@]}"; do
+            echo -e "\t$case(failed)"
+        done
+        echo -e "\033[32m ---- verification failed test  :  ${#global_verification_fail_arr[@]} \033"
+        for case in "${global_verification_fail_arr[@]}"; do
+            echo -e "\t$case(failed)"
+        done
+        EXCODE=1
     fi
 else
     echo -e "\033[32m Changed Not CI case, Skips \033"