Skip to content

Commit 0749265

Browse files
authored
Upgrade PyTorch version to v1.13.0 (#2082)
* Upgrade PyTorch version to v1.13.0 Signed-off-by: Yuki Iwai <[email protected]> * Build container images using minikube in E2E tests Signed-off-by: Yuki Iwai <[email protected]> Signed-off-by: Yuki Iwai <[email protected]>
1 parent 6bcbd25 commit 0749265

File tree

5 files changed

+24
-35
lines changed

5 files changed

+24
-35
lines changed

examples/v1beta1/trial-images/darts-cnn-cifar10/Dockerfile.gpu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
# Pytorch=1.11.0, cuda=11.6.0
2-
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
3-
FROM nvcr.io/nvidia/pytorch:22.02-py3
1+
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
2+
# PyTorch=1.13.0, cuda=11.8.0
3+
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
4+
FROM nvcr.io/nvidia/pytorch:22.11-py3
45

56
ENV TARGET_DIR /opt/darts-cnn-cifar10
67

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
torch==1.11.0
2-
torchvision==0.12.0
1+
torch==1.13.1
2+
torchvision==0.14.1
33
Pillow>=9.1.1

examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
# Pytorch=1.11.0, cuda=11.6.0
2-
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_22-08.html#rel_22-08
3-
FROM nvcr.io/nvidia/pytorch:22.02-py3
1+
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
2+
# PyTorch=1.13.0, cuda=11.8.0
3+
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html#rel-22-11
4+
FROM nvcr.io/nvidia/pytorch:22.11-py3
45

56
ADD examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
67

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
cloudml-hypertune==0.1.0.dev6
2-
torch==1.11.0
3-
torchvision==0.12.0
2+
torch==1.13.1
3+
torchvision==0.14.1
44
Pillow>=9.1.1

test/e2e/v1beta1/scripts/gh-actions/build-load.sh

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@
2020
set -o errexit
2121
set -o pipefail
2222
set -o nounset
23-
cd "$(dirname "$0")"
23+
24+
pushd .
25+
cd "$(dirname "$0")/../../../../.."
26+
trap popd EXIT
2427

2528
TRIAL_IMAGES=${1:-""}
2629
EXPERIMENTS=${2:-""}
@@ -48,14 +51,7 @@ _build_containers() {
4851
done
4952

5053
echo -e "\nBuilding $CONTAINER_NAME image with $DOCKERFILE...\n"
51-
docker buildx build --platform "$(uname -m)" --load -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../
52-
}
53-
54-
_load_minikube_cluster() {
55-
CONTAINER_NAME=${1:-"katib-controller"}
56-
57-
echo -e "\n\nLoading $CONTAINER_NAME image...\n\n"
58-
minikube image load "$REGISTRY/$CONTAINER_NAME:$TAG"
54+
DOCKER_BUILDKIT=1 minikube image build --build-opt platform=linux/amd64 --all -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "$DOCKERFILE" .
5955
}
6056

6157
_install_tools() {
@@ -66,11 +62,6 @@ _install_tools() {
6662
fi
6763
}
6864

69-
cleanup_build_cache() {
70-
echo -e "\nCleanup Build Cache...\n"
71-
docker builder prune
72-
}
73-
7465
run() {
7566
CONTAINER_NAME=${1:-"katib-controller"}
7667
DOCKERFILE=${2:-"$CMD_PREFIX/katib-controller/$VERSION/Dockerfile"}
@@ -85,10 +76,10 @@ run() {
8576
# Search for Suggestion Images required for Trial.
8677
for exp_name in "${EXPERIMENT_ARRAY[@]}"; do
8778

88-
exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml")
79+
exp_path=$(find examples/v1beta1 -name "${exp_name}.yaml")
8980
algorithm_name="$(yq eval '.spec.algorithm.algorithmName' "$exp_path")"
9081

91-
suggestion_image_name="$(yq eval '.data.suggestion' ../../../../../manifests/v1beta1/components/controller/katib-config.yaml |
82+
suggestion_image_name="$(yq eval '.data.suggestion' manifests/v1beta1/components/controller/katib-config.yaml |
9283
algorithm_name=$algorithm_name yq eval '.[env(algorithm_name)].image' | cut -d: -f1)"
9384
suggestion_name="$(basename "$suggestion_image_name")"
9485

@@ -99,7 +90,6 @@ run() {
9990
for s in "${suggestions[@]}"; do
10091
if [ "$s" == "$CONTAINER_NAME" ]; then
10192
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
102-
_load_minikube_cluster "$CONTAINER_NAME"
10393
break
10494
fi
10595
done
@@ -112,10 +102,10 @@ run() {
112102
# Search for EarlyStopping Images required for Trial.
113103
for exp_name in "${EXPERIMENT_ARRAY[@]}"; do
114104

115-
exp_path=$(find ../../../../../examples/v1beta1 -name "${exp_name}.yaml")
105+
exp_path=$(find examples/v1beta1 -name "${exp_name}.yaml")
116106
algorithm_name="$(yq eval '.spec.earlyStopping.algorithmName' "$exp_path")"
117107

118-
earlystopping_image_name="$(yq eval '.data.early-stopping' ../../../../../manifests/v1beta1/components/controller/katib-config.yaml |
108+
earlystopping_image_name="$(yq eval '.data.early-stopping' manifests/v1beta1/components/controller/katib-config.yaml |
119109
algorithm_name=$algorithm_name yq eval '.[env(algorithm_name)].image' | cut -d: -f1)"
120110
earlystopping_name="$(basename "$earlystopping_image_name")"
121111

@@ -126,15 +116,13 @@ run() {
126116
for e in "${earlystoppings[@]}"; do
127117
if [ "$e" == "$CONTAINER_NAME" ]; then
128118
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
129-
_load_minikube_cluster "$CONTAINER_NAME"
130119
break
131120
fi
132121
done
133122

134123
# Others
135124
else
136125
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
137-
_load_minikube_cluster "$CONTAINER_NAME"
138126
fi
139127
}
140128

@@ -153,7 +141,6 @@ fi
153141
run "cert-generator" "$CMD_PREFIX/cert-generator/$VERSION/Dockerfile"
154142
run "file-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/file-metricscollector/Dockerfile"
155143
run "tfevent-metrics-collector" "$CMD_PREFIX/metricscollector/$VERSION/tfevent-metricscollector/Dockerfile"
156-
cleanup_build_cache
157144

158145
# Suggestion images
159146
echo -e "\nBuilding suggestion images..."
@@ -165,18 +152,18 @@ run "suggestion-optuna" "$CMD_PREFIX/suggestion/optuna/$VERSION/Dockerfile"
165152
run "suggestion-pbt" "$CMD_PREFIX/suggestion/pbt/$VERSION/Dockerfile"
166153
run "suggestion-enas" "$CMD_PREFIX/suggestion/nas/enas/$VERSION/Dockerfile"
167154
run "suggestion-darts" "$CMD_PREFIX/suggestion/nas/darts/$VERSION/Dockerfile"
168-
cleanup_build_cache
169155

170156
# Early stopping images
171157
echo -e "\nBuilding early stopping images...\n"
172158
run "earlystopping-medianstop" "$CMD_PREFIX/earlystopping/medianstop/$VERSION/Dockerfile"
173-
cleanup_build_cache
174159

175160
# Training container images
176161
echo -e "\nBuilding training container images..."
177162
for name in "${TRIAL_IMAGE_ARRAY[@]}"; do
178163
run "$name" "examples/$VERSION/trial-images/$name/Dockerfile"
179164
done
180-
cleanup_build_cache
165+
166+
echo -e "\nCleanup Build Cache...\n"
167+
docker buildx prune -f
181168

182169
echo -e "\nAll Katib images with ${TAG} tag have been built successfully!\n"

0 commit comments

Comments
 (0)