Skip to content

Commit 4ce703c

Browse files
Jenkinsfile Fixes Part 2 (#627)
1. Trigger a conda build by updating setup.py. 2. Trigger a docker build by updating the Dockerfile. 3. Exclude the Jenkinsfile repo from the changesets. 4. Update the build matrix to also tag the latest image. 5. Fix a race condition with the JUnit merge: each rank now has its own junitxml file to which it writes; the composer launcher was modified to use a shell, so the distributed environment variables are available when specifying the junitxml path. 6. Disable DeepSpeed tests — they are too flaky. When this PR is merged, the Dockerfiles will be updated on Docker Hub.
1 parent 6e923fa commit 4ce703c

File tree

8 files changed

+47
-23
lines changed

8 files changed

+47
-23
lines changed

.ci/Jenkinsfile

Lines changed: 27 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ def cloneJenkinsfilesRepo() {
3838
doGenerateSubmoduleConfigurations: false,
3939
extensions: [[$class: 'RelativeTargetDirectory', relativeTargetDir: jenkinsfileRepoTargetDir]],
4040
submoduleCfg: [],
41+
changelog: false,
4142
userRemoteConfigs: [[url: jenkinsfileRepo, credentialsId: gitCredentialsId]]
4243
])
4344
return "$WORKSPACE_TMP/$jenkinsfileRepoTargetDir"
@@ -78,7 +79,7 @@ def runLint(pDockerImage) {
7879
string(name: 'P_CLOUD', value: pCloud),
7980
string(name: 'P_GIT_REPO', value: gitUrl),
8081
string(name: 'P_GIT_COMMIT', value: gitCommit),
81-
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
82+
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '7Gi'),
8283
string(name: 'P_DOCKER_IMAGE', value: pDockerImage),
8384
string(name: 'P_TIMEOUT', value: pTimeout),
8485
string(name: 'P_CPU_LIMIT', value: "2"),
@@ -117,7 +118,7 @@ def runPytest(pDockerImage, gpu, extraDeps) {
117118
string(name: 'P_MEM_LIMIT', value: memLimit),
118119
string(name: 'P_TIMEOUT', value: pTimeout),
119120
string(name: 'P_N_GPUS', value: nGpus),
120-
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
121+
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
121122
text(name: 'P_COMMAND', value: "./.ci/test.sh '$extraDeps' '$markers'"),
122123
string(name: 'P_ARTIFACTS_GLOB', value: artifactsGlob),
123124
string(name: 'P_JUNIT_GLOB', value: junitGlob),
@@ -179,16 +180,16 @@ stage('Build') {
179180
pytorchDockerBuildMatrix.each { entry ->
180181
def command = entry[0] // command is the command to run
181182
def stagingImage = entry[1] // stagingImage is where the built docker image is pushed
182-
def buildArgs = entry[2] // buildArgs is a map of the build arguments passed to kaniko
183-
jobs << [ "$buildArgs": { ->
183+
def buildConfigListOfTuples = entry[2] // buildConfigListOfTuples is a list of (key, value) pairs of the build args from the matrix
184+
jobs << [ "$buildConfigListOfTuples": { ->
184185
trackBuild(
185186
job: jenkinsShellJobName,
186187
parameters: [
187188
string(name: 'P_CLOUD', value: pCloud),
188189
string(name: 'P_GIT_REPO', value: gitUrl),
189190
string(name: 'P_GIT_COMMIT', value: gitCommit),
190191
string(name: 'P_DOCKER_IMAGE', value: kanikoDockerImage),
191-
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '16Gi'),
192+
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
192193
text(name: 'P_COMMAND', value: command),
193194
string(name: 'P_TIMEOUT', value: pTimeout),
194195
string(name: 'P_CPU_LIMIT', value: '4'),
@@ -199,16 +200,31 @@ stage('Build') {
199200
// no need to run tests again
200201
return
201202
}
202-
def tag = buildArgs['TAG']
203-
def gpu = buildArgs['CUDA_VERSION'] != 'cpu'
203+
def gpu = false
204+
def isLintImage = false
205+
def tag = null
206+
buildConfigListOfTuples.each { item ->
207+
def key = item[0]
208+
def val = item[1]
209+
210+
if (key == 'CUDA_VERSION') {
211+
gpu = val != 'cpu'
212+
}
213+
if (key == 'TAG') {
214+
tag = val
215+
// there could be multiple tags
216+
isLintImage = isLintImage || tag == lintImage
217+
}
218+
219+
}
204220
def extraDeps = 'all'
205221
def subJobs = [
206222
"Pytest - ${tag}" : { -> runPytest(stagingImage, gpu, extraDeps) }
207223
]
208-
if (tag == lintImage) {
224+
if (isLintImage) {
209225
// and run lint and a dev install on this image
210226
subJobs << [
211-
"Pytest - ${tag}, extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
227+
"Pytest - extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
212228
"Lint": { -> runLint(stagingImage) },
213229
]
214230
}
@@ -244,9 +260,9 @@ stage('Build') {
244260
string(name: 'P_CLOUD', value: pCloud),
245261
string(name: 'P_GIT_REPO', value: gitUrl),
246262
string(name: 'P_GIT_COMMIT', value: gitCommit),
247-
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '8Gi'),
263+
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
248264
string(name: 'P_DOCKER_IMAGE', value: condaBuildDockerImage),
249-
string(name: 'P_TIMEOUT', value: pTimeout),
265+
string(name: 'P_TIMEOUT', value: '3600'), // Conda builds take longer
250266
string(name: 'P_CPU_LIMIT', value: "4"),
251267
string(name: 'P_MEM_LIMIT', value: "8Gi"),
252268
string(name: 'P_COMMAND', value: "./.ci/build_conda.sh")

.ci/test.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,8 @@ fi
1919
JUNIT_PREFIX=build/output/${BUILD_NUMBER}
2020
mkdir -p $(dirname $JUNIT_PREFIX)
2121
make test PYTEST="coverage run -m pytest" DURATION=all EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n0.junit.xml -v -m '$MARKERS'"
22-
make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n2.junit.xml -v -m '$MARKERS'"
22+
RANK_ARG='\$${RANK}' # escape RANK from the makefile and the makefile shell command
23+
make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.${RANK_ARG}_n2.junit.xml -v -m '$MARKERS'"
2324

2425
# Combine the coverage reports
2526
python -m coverage combine

composer/cli/launcher.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -122,10 +122,12 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int
122122

123123
for local_rank in range(nproc):
124124
global_rank = base_rank + local_rank
125+
cmd = f"{sys.executable} -u"
125126
if module_mode:
126-
cmd = [sys.executable, '-u', '-m', training_script, *training_script_args]
127-
else:
128-
cmd = [sys.executable, '-u', training_script, *training_script_args]
127+
cmd += " -m"
128+
training_script_args_quoted = [f'"{arg}"' for arg in training_script_args]
129+
130+
cmd += f" {training_script} {' '.join(training_script_args_quoted)}"
129131

130132
current_env = os.environ.copy()
131133
current_env["RANK"] = str(global_rank)
@@ -137,15 +139,17 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int
137139
current_env["MASTER_PORT"] = str(master_port)
138140
current_env["COMPOSER_RUN_DIRECTORY"] = run_directory
139141

140-
log.info("Launching process for local_rank(%s), global_rank(%s)", local_rank, global_rank)
142+
log.info("Launching process for local_rank(%s), global_rank(%s) with command(%s)", local_rank, global_rank, cmd)
141143

142144
if local_rank == 0:
143-
process = subprocess.Popen(cmd, env=current_env, text=True)
145+
process = subprocess.Popen(cmd, env=current_env, text=True, shell=True)
144146
else:
145147
logs_dir = os.path.join(run_directory, f"rank_{global_rank}", "logs")
146148
os.makedirs(logs_dir, exist_ok=True)
147149
process = subprocess.Popen(
148150
cmd,
151+
# Using a shell to execute the command, so the env variables will be available to the CLI arguments
152+
shell=True,
149153
env=current_env,
150154
stdout=open(os.path.join(logs_dir, f"rank_{global_rank}.stdout.txt"), "x"),
151155
stderr=open(os.path.join(logs_dir, f"rank_{global_rank}.stderr.txt"), "x"),

docker/pytorch/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ ARG DEBIAN_FRONTEND=noninteractive
66
# remove a bad symlink from the base composer image
77
# If this file is present after the first command, kaniko
88
# won't be able to build the docker image.
9-
RUN rm -f /usr/local/cuda-11.3/cuda-11.3 && mkdir -p /usr/local/cuda-11.3 && touch /usr/local/cuda-11.3/cuda-11.3
9+
RUN rm -f /usr/local/cuda-11.3/cuda-11.3
1010

1111
RUN apt-get update && \
1212
apt-get install -y --no-install-recommends \

docker/pytorch/build_matrix.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.7-ubuntu20.04' BASE_IMAGE='nvidi
77
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.7-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.7' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
88
echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.8-ubuntu20.04' BASE_IMAGE='nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cu111' CUDA_VERSION='11.1.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
99
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.8-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
10-
echo "TAG='mosaicml/pytorch:1.10.0_cu113-python3.9-ubuntu20.04' BASE_IMAGE='nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cu113' CUDA_VERSION='11.3.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
10+
echo "TAG='mosaicml/pytorch:1.10.0_cu113-python3.9-ubuntu20.04' TAG='mosaicml/pytorch:latest' BASE_IMAGE='nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cu113' CUDA_VERSION='11.3.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
1111
echo "TAG='mosaicml/pytorch:1.10.0_cpu-python3.9-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
1212
echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.8-ubuntu18.04' BASE_IMAGE='nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cu111' CUDA_VERSION='11.1.1' LINUX_DISTRO='ubuntu:18.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
1313
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.8-ubuntu18.04' BASE_IMAGE='ubuntu:18.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:18.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"

setup.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,9 @@ def package_files(directory: str):
122122
version="0.4.0",
123123
author="MosaicML",
124124
author_email="[email protected]",
125-
description="composing methods for ML training efficiency",
125+
description=
126+
"Composer provides well-engineered implementations of efficient training methods to give "
127+
"the tools that help you train a better model for cheaper.",
126128
long_description=long_description,
127129
long_description_content_type="text/markdown",
128130
url="https://github.com/mosaicml/composer",

tests/trainer/test_checkpoint.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -266,13 +266,12 @@ def test_checkpoint(
266266
- assert that the checkpoint from the new trainer at the end is the same as the checkpoint from the first trainer at the end.
267267
"""
268268
del world_size # unused. Read via env variable
269+
if deepspeed_enabled:
270+
pytest.skip("Deepspeed tests are unstable. See https://github.com/mosaicml/composer/issues/610.")
269271

270272
if not isinstance(device_hparams, GPUDeviceHparams) and deepspeed_enabled:
271273
pytest.skip("DeepSpeed tests must be ran on GPU")
272274

273-
if model_name == "resnet50_synthetic" and deepspeed_enabled:
274-
pytest.skip("Skipping tests timing out on jenkins. TODO: fix.")
275-
276275
if model_name is not None:
277276
if not isinstance(device_hparams, GPUDeviceHparams):
278277
pytest.skip("Real models require a GPU -- otherwise they take too long")

tests/trainer/test_ddp.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,8 @@ def test_ddp(device: DeviceHparams, world_size: int, composer_trainer_hparams: T
138138
We assert that each of these tensors are different to ensure that 1) random seeding works properly,
139139
and 2) each ddp process is indeed getting different data.
140140
"""
141+
if deepspeed:
142+
pytest.skip("Deepspeed tests are unstable. See https://github.com/mosaicml/composer/issues/610.")
141143

142144
hparams = composer_trainer_hparams
143145
model_hparams = hparams.model

0 commit comments

Comments (0)