datquocnguyen · datquocnguyen · Sep 20, 2022 · Sep 19, 2022 · Sep 19, 2022 · Sep 19, 2022
diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml
@@ -0,0 +1,57 @@
+name: Self-hosted runner (check runner status)
+
+# Note that each job's dependencies go into a corresponding docker file.
+#
+# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
+# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
+# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
+
+on:
+  repository_dispatch:
+  schedule:
+    # run per hour
+    - cron: "* */1 * * *"
+
+env:
+  TRANSFORMERS_IS_CI: yes
+
+jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-latest
+    needs: check_runner_status
+    if: ${{ failure() }}
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+
+      - uses: actions/checkout@v2
+      - uses: actions/download-artifact@v2
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_EVENT: runner status check
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
+        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
+        run: |
+          pip install slack_sdk
+          python utils/notification_service.py
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
@@ -23,8 +23,21 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
-  run_check_runners:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
     name: Check Runners
+    needs: check_runner_status
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -39,7 +52,7 @@ jobs:
 
   setup:
     name: Setup
-    needs: run_check_runners
+    needs: check_runners
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
@@ -83,7 +96,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -136,7 +149,7 @@ jobs:
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [run_check_runners, setup]
+    needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
@@ -185,7 +198,7 @@ jobs:
       matrix:
         machine_type: [single-gpu, multi-gpu]
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
-    needs: [run_check_runners, setup]
+    needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -236,13 +249,21 @@ jobs:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_all_tests_torch_cuda_extensions_gpu
+    ]
     steps:
       - name: Preliminary job status
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
           echo "Setup status: ${{ needs.setup.result }}"
 
       - uses: actions/checkout@v2
@@ -255,8 +276,9 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
           CI_EVENT: nightly-build
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |

diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
@@ -27,9 +27,43 @@ env:
   RUN_PT_TF_CROSS_TESTS: 1
 
 jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
   setup:
     name: Setup
-    runs-on: ubuntu-latest
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -50,21 +84,6 @@ jobs:
           cd tests
           echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"
 
-  run_check_runners:
-    name: Check Runners
-    needs: setup
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
-    container:
-      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
   run_tests_single_gpu:
     name: Model tests
     strategy:
@@ -76,7 +95,7 @@ jobs:
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -129,7 +148,7 @@ jobs:
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: [setup, run_check_runners]
+    needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
@@ -175,13 +194,14 @@ jobs:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
+    needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
     steps:
       - name: Preliminary job status
         shell: bash
         # For the meaning of these environment variables, see the job `Setup`
         run: |
-          echo "Runner status: ${{ needs.run_check_runners.result }}"
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
           echo "Setup status: ${{ needs.setup.result }}"
 
       - uses: actions/checkout@v2
@@ -199,8 +219,9 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
           CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
-          RUNNER_STATUS: ${{ needs.run_check_runners.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |