Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions .github/workflows/check_runner_status.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
name: Self-hosted runner (check runner status)

# Note that each job's dependencies go into a corresponding docker file.
#
# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`

on:
repository_dispatch:
schedule:
# run per hour
- cron: "* */1 * * *"

env:
TRANSFORMERS_IS_CI: yes

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

send_results:
name: Send results to webhook
runs-on: ubuntu-latest
needs: check_runner_status
if: ${{ failure() }}
steps:
- name: Preliminary job status
shell: bash
run: |
echo "Runner availability: ${{ needs.check_runner_status.result }}"

- uses: actions/checkout@v2
- uses: actions/download-artifact@v2
- name: Send message to Slack
env:
CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
CI_EVENT: runner status check
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
pip install slack_sdk
python utils/notification_service.py
38 changes: 30 additions & 8 deletions .github/workflows/self-nightly-scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,21 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
run_check_runners:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
Expand All @@ -39,7 +52,7 @@ jobs:

setup:
name: Setup
needs: run_check_runners
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
Expand Down Expand Up @@ -83,7 +96,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
Expand Down Expand Up @@ -136,7 +149,7 @@ jobs:
container:
image: huggingface/transformers-all-latest-torch-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [run_check_runners, setup]
needs: setup
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
Expand Down Expand Up @@ -185,7 +198,7 @@ jobs:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
needs: [run_check_runners, setup]
needs: setup
container:
image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
Expand Down Expand Up @@ -236,13 +249,21 @@ jobs:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [run_check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu, run_all_tests_torch_cuda_extensions_gpu]
needs: [
check_runner_status,
check_runners,
setup,
run_tests_single_gpu,
run_tests_multi_gpu,
run_all_tests_torch_cuda_extensions_gpu
]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v2
Expand All @@ -255,8 +276,9 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
CI_EVENT: nightly-build
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
Expand Down
63 changes: 42 additions & 21 deletions .github/workflows/self-past.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,43 @@ env:
RUN_PT_TF_CROSS_TESTS: 1

jobs:
check_runner_status:
name: Check Runner Status
runs-on: ubuntu-latest
steps:
- name: Checkout transformers
uses: actions/checkout@v2
with:
fetch-depth: 2

- name: Check Runner Status
run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}

check_runners:
name: Check Runners
needs: check_runner_status
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

setup:
name: Setup
runs-on: ubuntu-latest
needs: check_runners
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
Expand All @@ -50,21 +84,6 @@ jobs:
cd tests
echo "::set-output name=matrix::$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')"

run_check_runners:
name: Check Runners
needs: setup
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: NVIDIA-SMI
run: |
nvidia-smi

run_tests_single_gpu:
name: Model tests
strategy:
Expand All @@ -76,7 +95,7 @@ jobs:
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [setup, run_check_runners]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
Expand Down Expand Up @@ -129,7 +148,7 @@ jobs:
container:
image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
needs: [setup, run_check_runners]
needs: setup
steps:
- name: Update clone
working-directory: /transformers
Expand Down Expand Up @@ -175,13 +194,14 @@ jobs:
name: Send results to webhook
runs-on: ubuntu-latest
if: always()
needs: [setup, run_check_runners, run_tests_single_gpu, run_tests_multi_gpu]
needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
steps:
- name: Preliminary job status
shell: bash
# For the meaning of these environment variables, see the job `Setup`
run: |
echo "Runner status: ${{ needs.run_check_runners.result }}"
echo "Runner availability: ${{ needs.check_runner_status.result }}"
echo "Runner status: ${{ needs.check_runners.result }}"
echo "Setup status: ${{ needs.setup.result }}"

- uses: actions/checkout@v2
Expand All @@ -199,8 +219,9 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
RUNNER_STATUS: ${{ needs.check_runner_status.result }}
RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
SETUP_STATUS: ${{ needs.setup.result }}
RUNNER_STATUS: ${{ needs.run_check_runners.result }}
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
run: |
Expand Down
Loading