Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
ebf6052
Jenkins + Docker Improvements
ravi-mosaicml Feb 28, 2022
f2cb6a1
Fixed typo
ravi-mosaicml Feb 28, 2022
6870eb0
Echoing job urls
ravi-mosaicml Feb 28, 2022
987ae95
Fix typo
ravi-mosaicml Feb 28, 2022
3940a6f
Testing...
ravi-mosaicml Mar 1, 2022
3f8ec23
Testing
ravi-mosaicml Mar 1, 2022
3984964
Added storage limit; moved the build matrix locally
ravi-mosaicml Mar 1, 2022
cf27afc
Fixes
ravi-mosaicml Mar 1, 2022
8d2fb3d
testing
ravi-mosaicml Mar 1, 2022
816c08e
testing
ravi-mosaicml Mar 1, 2022
2385489
Fixing for merge commits
ravi-mosaicml Mar 1, 2022
6da83a6
testing
ravi-mosaicml Mar 1, 2022
48f4d7a
Merge branch 'dev' into ravi/jenkinsfile_improvements
ravi-mosaicml Mar 1, 2022
1373409
testing
ravi-mosaicml Mar 1, 2022
4f2808a
Merge branch 'ravi/jenkinsfile_improvements' of github.com:mosaicml/c…
ravi-mosaicml Mar 1, 2022
48bf896
testing
ravi-mosaicml Mar 1, 2022
1a417e5
testing
ravi-mosaicml Mar 1, 2022
057ae81
testing
ravi-mosaicml Mar 1, 2022
80d12a0
testing
ravi-mosaicml Mar 1, 2022
1975611
Jenkinsfile cleanup
ravi-mosaicml Mar 1, 2022
203df72
Removed runWithChecks; fixed echoing of URL on subjob failures
ravi-mosaicml Mar 1, 2022
e646a56
Reconfigured docker builds
ravi-mosaicml Mar 1, 2022
9fd831b
testing
ravi-mosaicml Mar 1, 2022
a083a4d
Fixed typo
ravi-mosaicml Mar 1, 2022
0cd68d6
Parallelize the dockerbuilds
ravi-mosaicml Mar 1, 2022
737f810
Testing
ravi-mosaicml Mar 1, 2022
8dae40e
Fixed pytorchDockerBuildMatrix
ravi-mosaicml Mar 1, 2022
1c6d3d3
Bugfixes
ravi-mosaicml Mar 1, 2022
075e228
Added missing def
ravi-mosaicml Mar 1, 2022
0f6d896
testing
ravi-mosaicml Mar 1, 2022
48a9e9b
testing
ravi-mosaicml Mar 1, 2022
6e4214e
Reduce verbosity
ravi-mosaicml Mar 1, 2022
b88ffc2
Bugfixes
ravi-mosaicml Mar 1, 2022
084cba7
Remove echo
ravi-mosaicml Mar 1, 2022
d8df259
Added milestone
ravi-mosaicml Mar 1, 2022
37f136e
Fixed milestone
ravi-mosaicml Mar 1, 2022
73d0634
testing
ravi-mosaicml Mar 1, 2022
692b2dd
testing
ravi-mosaicml Mar 1, 2022
e721bb1
Updated the description in setup.py to match the readme.
ravi-mosaicml Mar 1, 2022
9db390a
testing
ravi-mosaicml Mar 1, 2022
d5700f3
Fixed build conda
ravi-mosaicml Mar 1, 2022
95f2a57
Merge branch 'ravi/jenkinsfile_improvements' into ravi/jenkinsfile_im…
ravi-mosaicml Mar 1, 2022
b5dc06b
Merge branch 'dev' into ravi/jenkinsfile_improvements
ravi-mosaicml Mar 1, 2022
500e8a3
Adjusted memory requirements
ravi-mosaicml Mar 1, 2022
3001287
Merge branch 'ravi/jenkinsfile_improvements' into ravi/jenkinsfile_im…
ravi-mosaicml Mar 1, 2022
b597021
testing
ravi-mosaicml Mar 1, 2022
80ad8e6
Merge branch 'ravi/jenkinsfile_improvements' into ravi/jenkinsfile_im…
ravi-mosaicml Mar 1, 2022
bba873b
Adjusted conda limits
ravi-mosaicml Mar 1, 2022
8b22a2a
Merge branch 'ravi/jenkinsfile_improvements' into ravi/jenkinsfile_im…
ravi-mosaicml Mar 1, 2022
7f1de0d
Merge branch 'dev' into ravi/jenkinsfile_improvments_test
ravi-mosaicml Mar 1, 2022
36c95ca
Increased conda memory limit
ravi-mosaicml Mar 1, 2022
0046f34
Excluding the jenkinsfile repo changes from the changelog
ravi-mosaicml Mar 1, 2022
29bd924
Fix the dockerfile once more
ravi-mosaicml Mar 1, 2022
e88fffe
Increase conda timeout
ravi-mosaicml Mar 1, 2022
131fb7c
Tagged the latest image
ravi-mosaicml Mar 1, 2022
b780e49
testing
ravi-mosaicml Mar 1, 2022
d8f923b
Increased docker build ephemeral storage limit
ravi-mosaicml Mar 1, 2022
9595664
Fixed a typo
ravi-mosaicml Mar 1, 2022
10f5316
Merge branch 'dev' into ravi/jenkinsfile_improvments_test
ravi-mosaicml Mar 1, 2022
4b942d4
Update .ci/Jenkinsfile
ravi-mosaicml Mar 1, 2022
771e4cd
Fixed a race condition where multiple pytests wrote to the same junitxml
ravi-mosaicml Mar 1, 2022
4b5e667
Merge branch 'dev' into ravi/jenkinsfile_improvments_test
ravi-mosaicml Mar 2, 2022
85dd68f
Skip all deepspeed tests
ravi-mosaicml Mar 2, 2022
d23bcaf
Merge branch 'dev' into ravi/jenkinsfile_improvments_test
ravi-mosaicml Mar 2, 2022
48e2d3d
Merge branch 'ravi/jenkinsfile_improvments_test' of github.com:mosaic…
ravi-mosaicml Mar 2, 2022
73aa462
Merge branch 'dev' into ravi/jenkinsfile_improvments_test
ravi-mosaicml Mar 2, 2022
8c315f4
Increased storage
ravi-mosaicml Mar 2, 2022
b4e8e9a
Merge branch 'ravi/jenkinsfile_improvments_test' of github.com:mosaic…
ravi-mosaicml Mar 2, 2022
4da5167
Increased ephemeral storage limit
ravi-mosaicml Mar 2, 2022
e0ad4d4
Increased storage to 32Gi
ravi-mosaicml Mar 2, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions .ci/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def cloneJenkinsfilesRepo() {
doGenerateSubmoduleConfigurations: false,
extensions: [[$class: 'RelativeTargetDirectory', relativeTargetDir: jenkinsfileRepoTargetDir]],
submoduleCfg: [],
changelog: false,
userRemoteConfigs: [[url: jenkinsfileRepo, credentialsId: gitCredentialsId]]
])
return "$WORKSPACE_TMP/$jenkinsfileRepoTargetDir"
Expand Down Expand Up @@ -78,7 +79,7 @@ def runLint(pDockerImage) {
string(name: 'P_CLOUD', value: pCloud),
string(name: 'P_GIT_REPO', value: gitUrl),
string(name: 'P_GIT_COMMIT', value: gitCommit),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '7Gi'),
string(name: 'P_DOCKER_IMAGE', value: pDockerImage),
string(name: 'P_TIMEOUT', value: pTimeout),
string(name: 'P_CPU_LIMIT', value: "2"),
Expand Down Expand Up @@ -117,7 +118,7 @@ def runPytest(pDockerImage, gpu, extraDeps) {
string(name: 'P_MEM_LIMIT', value: memLimit),
string(name: 'P_TIMEOUT', value: pTimeout),
string(name: 'P_N_GPUS', value: nGpus),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
text(name: 'P_COMMAND', value: "./.ci/test.sh '$extraDeps' '$markers'"),
string(name: 'P_ARTIFACTS_GLOB', value: artifactsGlob),
string(name: 'P_JUNIT_GLOB', value: junitGlob),
Expand Down Expand Up @@ -179,16 +180,16 @@ stage('Build') {
pytorchDockerBuildMatrix.each { entry ->
def command = entry[0] // command is the command to run
def stagingImage = entry[1] // stagingImage is where the built docker image is pushed
def buildArgs = entry[2] // buildArgs is a map of the build arguments passed to kaniko
jobs << [ "$buildArgs": { ->
def buildConfigListOfTuples = entry[2] // buildConfigListOfTuples is a list of (key, value) pairs of the build args from the matrix
jobs << [ "$buildConfigListOfTuples": { ->
trackBuild(
job: jenkinsShellJobName,
parameters: [
string(name: 'P_CLOUD', value: pCloud),
string(name: 'P_GIT_REPO', value: gitUrl),
string(name: 'P_GIT_COMMIT', value: gitCommit),
string(name: 'P_DOCKER_IMAGE', value: kanikoDockerImage),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '16Gi'),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
text(name: 'P_COMMAND', value: command),
string(name: 'P_TIMEOUT', value: pTimeout),
string(name: 'P_CPU_LIMIT', value: '4'),
Expand All @@ -199,16 +200,31 @@ stage('Build') {
// no need to run tests again
return
}
def tag = buildArgs['TAG']
def gpu = buildArgs['CUDA_VERSION'] != 'cpu'
def gpu = false
def isLintImage = false
def tag = null
buildConfigListOfTuples.each { item ->
def key = item[0]
def val = item[1]

if (key == 'CUDA_VERSION') {
gpu = val != 'cpu'
}
if (key == 'TAG') {
tag = val
// there could be multiple tags
isLintImage = isLintImage || tag == lintImage
}

}
def extraDeps = 'all'
def subJobs = [
"Pytest - ${tag}" : { -> runPytest(stagingImage, gpu, extraDeps) }
]
if (tag == lintImage) {
if (isLintImage) {
// and run lint and a dev install on this image
subJobs << [
"Pytest - ${tag}, extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
"Pytest - extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
"Lint": { -> runLint(stagingImage) },
]
}
Expand Down Expand Up @@ -244,9 +260,9 @@ stage('Build') {
string(name: 'P_CLOUD', value: pCloud),
string(name: 'P_GIT_REPO', value: gitUrl),
string(name: 'P_GIT_COMMIT', value: gitCommit),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '8Gi'),
string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
string(name: 'P_DOCKER_IMAGE', value: condaBuildDockerImage),
string(name: 'P_TIMEOUT', value: pTimeout),
string(name: 'P_TIMEOUT', value: '3600'), // Conda builds take longer
string(name: 'P_CPU_LIMIT', value: "4"),
string(name: 'P_MEM_LIMIT', value: "8Gi"),
string(name: 'P_COMMAND', value: "./.ci/build_conda.sh")
Expand Down
3 changes: 2 additions & 1 deletion .ci/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ fi
JUNIT_PREFIX=build/output/${BUILD_NUMBER}
mkdir -p $(dirname $JUNIT_PREFIX)
make test PYTEST="coverage run -m pytest" DURATION=all EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n0.junit.xml -v -m '$MARKERS'"
make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n2.junit.xml -v -m '$MARKERS'"
RANK_ARG='\$${RANK}' # escape RANK from the makefile and the makefile shell command
make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.${RANK_ARG}_n2.junit.xml -v -m '$MARKERS'"

# Combine the coverage reports
python -m coverage combine
Expand Down
14 changes: 9 additions & 5 deletions composer/cli/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,12 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int

for local_rank in range(nproc):
global_rank = base_rank + local_rank
cmd = f"{sys.executable} -u"
if module_mode:
cmd = [sys.executable, '-u', '-m', training_script, *training_script_args]
else:
cmd = [sys.executable, '-u', training_script, *training_script_args]
cmd += " -m"
training_script_args_quoted = [f'"{arg}"' for arg in training_script_args]

cmd += f" {training_script} {' '.join(training_script_args_quoted)}"

current_env = os.environ.copy()
current_env["RANK"] = str(global_rank)
Expand All @@ -137,15 +139,17 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int
current_env["MASTER_PORT"] = str(master_port)
current_env["COMPOSER_RUN_DIRECTORY"] = run_directory

log.info("Launching process for local_rank(%s), global_rank(%s)", local_rank, global_rank)
log.info("Launching process for local_rank(%s), global_rank(%s) with command(%s)", local_rank, global_rank, cmd)

if local_rank == 0:
process = subprocess.Popen(cmd, env=current_env, text=True)
process = subprocess.Popen(cmd, env=current_env, text=True, shell=True)
else:
logs_dir = os.path.join(run_directory, f"rank_{global_rank}", "logs")
os.makedirs(logs_dir, exist_ok=True)
process = subprocess.Popen(
cmd,
# Using a shell to execute the command, so the env variables will be available to the CLI arguments
shell=True,
env=current_env,
stdout=open(os.path.join(logs_dir, f"rank_{global_rank}.stdout.txt"), "x"),
stderr=open(os.path.join(logs_dir, f"rank_{global_rank}.stderr.txt"), "x"),
Expand Down
2 changes: 1 addition & 1 deletion docker/pytorch/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG DEBIAN_FRONTEND=noninteractive
# remove a bad symlink from the base composer image
# If this file is present after the first command, kaniko
# won't be able to build the docker image.
RUN rm -f /usr/local/cuda-11.3/cuda-11.3 && mkdir -p /usr/local/cuda-11.3 && touch /usr/local/cuda-11.3/cuda-11.3
RUN rm -f /usr/local/cuda-11.3/cuda-11.3

RUN apt-get update && \
apt-get install -y --no-install-recommends \
Expand Down
2 changes: 1 addition & 1 deletion docker/pytorch/build_matrix.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.7-ubuntu20.04' BASE_IMAGE='nvidi
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.7-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.7' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.8-ubuntu20.04' BASE_IMAGE='nvidia/cuda:11.1.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cu111' CUDA_VERSION='11.1.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.8-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
echo "TAG='mosaicml/pytorch:1.10.0_cu113-python3.9-ubuntu20.04' BASE_IMAGE='nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cu113' CUDA_VERSION='11.3.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
echo "TAG='mosaicml/pytorch:1.10.0_cu113-python3.9-ubuntu20.04' TAG='mosaicml/pytorch:latest' BASE_IMAGE='nvidia/cuda:11.3.1-cudnn8-devel-ubuntu20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cu113' CUDA_VERSION='11.3.1' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
echo "TAG='mosaicml/pytorch:1.10.0_cpu-python3.9-ubuntu20.04' BASE_IMAGE='ubuntu:20.04' PYTHON_VERSION='3.9' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:20.04' PYTORCH_VERSION='1.10.0' TORCHVISION_VERSION='0.11.1'"
echo "TAG='mosaicml/pytorch:1.9.1_cu111-python3.8-ubuntu18.04' BASE_IMAGE='nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cu111' CUDA_VERSION='11.1.1' LINUX_DISTRO='ubuntu:18.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
echo "TAG='mosaicml/pytorch:1.9.1_cpu-python3.8-ubuntu18.04' BASE_IMAGE='ubuntu:18.04' PYTHON_VERSION='3.8' CUDA_VERSION_TAG='cpu' CUDA_VERSION='cpu' LINUX_DISTRO='ubuntu:18.04' PYTORCH_VERSION='1.9.1' TORCHVISION_VERSION='0.10.1'"
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,9 @@ def package_files(directory: str):
version="0.4.0",
author="MosaicML",
author_email="[email protected]",
description="composing methods for ML training efficiency",
description=
"Composer provides well-engineered implementations of efficient training methods to give "
"the tools that help you train a better model for cheaper.",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/mosaicml/composer",
Expand Down
5 changes: 2 additions & 3 deletions tests/trainer/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,13 +266,12 @@ def test_checkpoint(
- assert that the checkpoint from the new trainer at the end is the same as the checkpoint from the first trainer at the end.
"""
del world_size # unused. Read via env variable
if deepspeed_enabled:
pytest.skip("Deepspeed tests are unstable. See https://github.com/mosaicml/composer/issues/610.")

if not isinstance(device_hparams, GPUDeviceHparams) and deepspeed_enabled:
pytest.skip("DeepSpeed tests must be ran on GPU")

if model_name == "resnet50_synthetic" and deepspeed_enabled:
pytest.skip("Skipping tests timing out on jenkins. TODO: fix.")

if model_name is not None:
if not isinstance(device_hparams, GPUDeviceHparams):
pytest.skip("Real models require a GPU -- otherwise they take too long")
Expand Down
2 changes: 2 additions & 0 deletions tests/trainer/test_ddp.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ def test_ddp(device: DeviceHparams, world_size: int, composer_trainer_hparams: T
We assert that each of these tensors are different to ensure that 1) random seeding works properly,
and 2) each ddp process is indeed getting different data.
"""
if deepspeed:
pytest.skip("Deepspeed tests are unstable. See https://github.com/mosaicml/composer/issues/610.")

hparams = composer_trainer_hparams
model_hparams = hparams.model
Expand Down