Commit 7d06c64: merge dev

Parent: 77b2d88
23 files changed, +502 −411 lines

.ci/Jenkinsfile

Lines changed: 27 additions & 11 deletions

@@ -38,6 +38,7 @@ def cloneJenkinsfilesRepo() {
         doGenerateSubmoduleConfigurations: false,
         extensions: [[$class: 'RelativeTargetDirectory', relativeTargetDir: jenkinsfileRepoTargetDir]],
         submoduleCfg: [],
+        changelog: false,
         userRemoteConfigs: [[url: jenkinsfileRepo, credentialsId: gitCredentialsId]]
     ])
     return "$WORKSPACE_TMP/$jenkinsfileRepoTargetDir"
@@ -78,7 +79,7 @@ def runLint(pDockerImage) {
         string(name: 'P_CLOUD', value: pCloud),
         string(name: 'P_GIT_REPO', value: gitUrl),
         string(name: 'P_GIT_COMMIT', value: gitCommit),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '7Gi'),
         string(name: 'P_DOCKER_IMAGE', value: pDockerImage),
         string(name: 'P_TIMEOUT', value: pTimeout),
         string(name: 'P_CPU_LIMIT', value: "2"),
@@ -117,7 +118,7 @@ def runPytest(pDockerImage, gpu, extraDeps) {
         string(name: 'P_MEM_LIMIT', value: memLimit),
         string(name: 'P_TIMEOUT', value: pTimeout),
         string(name: 'P_N_GPUS', value: nGpus),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
         text(name: 'P_COMMAND', value: "./.ci/test.sh '$extraDeps' '$markers'"),
         string(name: 'P_ARTIFACTS_GLOB', value: artifactsGlob),
         string(name: 'P_JUNIT_GLOB', value: junitGlob),
@@ -179,16 +180,16 @@ stage('Build') {
     pytorchDockerBuildMatrix.each { entry ->
         def command = entry[0] // command is the command to run
         def stagingImage = entry[1] // stagingImage is where the built docker image is pushed
-        def buildArgs = entry[2] // buildArgs is a map of the build arguments passed to kaniko
-        jobs << [ "$buildArgs": { ->
+        def buildConfigListOfTuples = entry[2] // buildConfigListOfTuples is a list of (key, value) pairs of the build args from the matrix
+        jobs << [ "$buildConfigListOfTuples": { ->
             trackBuild(
                 job: jenkinsShellJobName,
                 parameters: [
                     string(name: 'P_CLOUD', value: pCloud),
                     string(name: 'P_GIT_REPO', value: gitUrl),
                     string(name: 'P_GIT_COMMIT', value: gitCommit),
                     string(name: 'P_DOCKER_IMAGE', value: kanikoDockerImage),
-                    string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '16Gi'),
+                    string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
                     text(name: 'P_COMMAND', value: command),
                     string(name: 'P_TIMEOUT', value: pTimeout),
                     string(name: 'P_CPU_LIMIT', value: '4'),
@@ -199,16 +200,31 @@ stage('Build') {
             // no need to run tests again
             return
         }
-        def tag = buildArgs['TAG']
-        def gpu = buildArgs['CUDA_VERSION'] != 'cpu'
+        def gpu = false
+        def isLintImage = false
+        def tag = null
+        buildConfigListOfTuples.each { item ->
+            def key = item[0]
+            def val = item[1]
+
+            if (key == 'CUDA_VERSION') {
+                gpu = val != 'cpu'
+            }
+            if (key == 'TAG') {
+                tag = val
+                // there could be multiple tags
+                isLintImage = isLintImage || tag == lintImage
+            }
+
+        }
         def extraDeps = 'all'
         def subJobs = [
             "Pytest - ${tag}" : { -> runPytest(stagingImage, gpu, extraDeps) }
         ]
-        if (tag == lintImage) {
+        if (isLintImage) {
             // and run lint and a dev install on this image
             subJobs << [
-                "Pytest - ${tag}, extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
+                "Pytest - extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
                 "Lint": { -> runLint(stagingImage) },
             ]
         }
@@ -244,9 +260,9 @@ stage('Build') {
         string(name: 'P_CLOUD', value: pCloud),
         string(name: 'P_GIT_REPO', value: gitUrl),
         string(name: 'P_GIT_COMMIT', value: gitCommit),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '8Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
         string(name: 'P_DOCKER_IMAGE', value: condaBuildDockerImage),
-        string(name: 'P_TIMEOUT', value: pTimeout),
+        string(name: 'P_TIMEOUT', value: '3600'), // Conda builds take longer
         string(name: 'P_CPU_LIMIT', value: "4"),
         string(name: 'P_MEM_LIMIT', value: "8Gi"),
         string(name: 'P_COMMAND', value: "./.ci/build_conda.sh")
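
The core change above is that a build-matrix entry is now a list of (key, value) pairs rather than a map, so the Groovy scans the list to derive the GPU flag and lint-image flag. As a minimal Python sketch of the same scan (the sample pairs and the lint image name here are made up for illustration, not taken from the repo):

```python
# Illustrative equivalent of the new Groovy loop over buildConfigListOfTuples.
build_config_list_of_tuples = [("CUDA_VERSION", "cpu"), ("TAG", "lint-image")]
lint_image = "lint-image"  # hypothetical value of the Jenkinsfile's lintImage

gpu = False
is_lint_image = False
tag = None
for key, val in build_config_list_of_tuples:
    if key == "CUDA_VERSION":
        gpu = val != "cpu"
    if key == "TAG":
        tag = val
        # an entry can carry multiple TAG pairs; remember if any of them matches
        is_lint_image = is_lint_image or tag == lint_image

print(gpu, tag, is_lint_image)  # False lint-image True
```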

.ci/test.sh

Lines changed: 2 additions & 1 deletion

@@ -19,7 +19,8 @@ fi
 JUNIT_PREFIX=build/output/${BUILD_NUMBER}
 mkdir -p $(dirname $JUNIT_PREFIX)
 make test PYTEST="coverage run -m pytest" DURATION=all EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n0.junit.xml -v -m '$MARKERS'"
-make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n2.junit.xml -v -m '$MARKERS'"
+RANK_ARG='\$${RANK}' # escape RANK from the makefile and the makefile shell command
+make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.${RANK_ARG}_n2.junit.xml -v -m '$MARKERS'"

 # Combine the coverage reports
 python -m coverage combine
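
The double escape in `RANK_ARG` exists because the string passes through several expansion layers before each rank's process sees it. A rough model of those layers, as a Python sketch (the exact layering is our reading of the inline "escape RANK" comment, not something the diff spells out):

```python
# Rough model of how '\$${RANK}' survives each expansion layer.
s = r"\$${RANK}"          # test.sh single-quotes the value, so bash leaves it alone
s = s.replace("$$", "$")  # make collapses '$$' to '$'         -> '\${RANK}'
s = s.replace(r"\$", "$") # the recipe's shell strips the '\'  -> '${RANK}'
# '${RANK}' then reaches the composer launcher, which (see launcher.py below)
# now runs each rank's command through a shell, so the per-rank RANK env var
# finally expands inside the junit XML filename.
print(s)  # ${RANK}
```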

Makefile

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ dirs := composer examples tests

 # run this to autoformat your code
 style:
-	$(PYTHON) -m isort -i $(dirs)
+	$(PYTHON) -m isort $(dirs)
 	$(PYTHON) -m yapf -rip $(dirs)
 	$(PYTHON) -m docformatter -ri --wrap-summaries 120 --wrap-descriptions 120 $(dirs)

README.md

Lines changed: 42 additions & 6 deletions

@@ -30,7 +30,7 @@
 <a href="https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg">
   <img alt="Chat @ Slack" src="https://img.shields.io/badge/slack-chat-2eb67d.svg?logo=slack">
 </a>
-<a href="https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg">
+<a href="https://github.com/mosaicml/composer/blob/dev/LICENSE">
   <img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-green.svg?logo=slack">
 </a>
 </p>
@@ -40,8 +40,8 @@
 <p align="center">Composer provides well-engineered implementations of efficient training methods to give the tools that help you train <b>a better model for cheaper</b>.</p>

 <p align="center">
-  <img src="docs/images/cost_graph_light.svg#gh-light-mode-only" width="80%"/>
-  <img src="docs/images/cost_graph_dark.svg#gh-dark-mode-only" width="80%"/>
+  <img src="docs/images/cost_graph_light.svg#gh-light-mode-only" width="100%"/>
+  <img src="docs/images/cost_graph_dark.svg#gh-dark-mode-only" width="100%"/>
 </p>

 Using Composer, you can:
@@ -78,7 +78,7 @@ conda install mosaicml

 Composer provides both a **Functional API** (similar to `torch.nn.functional`) and a **Trainer** (that abstracts away the training loop) to provide flexibility to users.

-#### Example: Functional API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1HIxLs61pyf0ln7MlnrGYvkNHq1uVbNWu?usp=sharing)
+#### Example: Functional API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb)

 For users who choose to use their own training loop, we provide state-less functional implementations of our algorithms for a end-user to integrate.
@@ -104,9 +104,9 @@ for epoch in range(NUM_EPOCHS):
     optimizer.step()
 ```

-See the official [Composer Functional API Colab notebook](https://colab.research.google.com/drive/1HIxLs61pyf0ln7MlnrGYvkNHq1uVbNWu?usp=sharing) for more.
+See the official [Composer Functional API Colab notebook](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb) for more.

-#### Example: Trainer [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/12Dl0NVDaj4tf4gfpfg-rkIAoO_H7edo3/edit)
+#### Example: Trainer [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/up_and_running_with_composer.ipynb)

 For maximal speedups, we recommend using our Trainer, which manages handling user state, performant algorithm implementations, and provides useful engineering abstractions to permit rapid experimentation.
@@ -285,6 +285,42 @@ We welcome any comments, feedback, or contributions to Composer! Please do not h

 ## Learn More

+Here's some resources actively maintained by the Composer community to help you get started:
+<table>
+<thead>
+  <tr>
+    <th><b>Resource</b></th>
+    <th><b>Details</b></th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td><a href="https://www.mosaicml.com/blog/founders-blog" target="_blank" rel="noopener noreferrer">Founder's Blog</a></td>
+    <td>A blog post by our founders highlighting why MosaicML exists</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/up_and_running_with_composer.ipynb" target="_blank" rel="noopener noreferrer">Getting started with our Trainer</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about our Trainer</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb" target="_blank" rel="noopener noreferrer">Getting started with our Functional API</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about our Functional API</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/custom_method_tutorial.ipynb" target="_blank" rel="noopener noreferrer">Building Speedup Methods</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about building speedup methods on top of Composer</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/nlp_notebook_tutorial.ipynb" target="_blank" rel="noopener noreferrer">Training BERTs with Composer</a></td>
+    <td>An interactive Colab Notebook aimed at helping users learn how to train BERT models with Composer!</td>
+  </tr>
+  <tr>
+    <td><a href="https://mosaicml.com/jobs" target="_blank" rel="noopener noreferrer">We're Hiring!</a></td>
+    <td>Join us! 🤩</td>
+  </tr>
+</tbody>
+</table>
+
 If you have any questions, please feel free to reach out to us on [Twiter](https://twitter.com/mosaicml), [email](mailto:[email protected]), or our [Community Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg)!

 ## Contributors

composer/algorithms/cutmix/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# 🎃 CutMix
+# ✂️ CutMix

 [\[How to Use\]](#how-to-use) - [\[Suggested Hyperparameters\]](#suggested-hyperparameters) - [\[Technical Details\]](#technical-details) - [\[Attribution\]](#attribution)

composer/algorithms/cutout/README.md

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-# ✂️ Cutout
+# 🎃 Cutout

 [\[How to Use\]](#how-to-use) - [\[Suggested Hyperparameters\]](#suggested-hyperparameters) - [\[Technical Details\]](#technical-details) - [\[Attribution\]](#attribution)

@@ -16,7 +16,7 @@ It is a regularization technique that improves the accuracy of models for comput
 ### Functional Interface

 ```python
-# Run the CutOut algorithm directly on the batch data using the Composer functional API 
+# Run the CutOut algorithm directly on the batch data using the Composer functional API

 from composer import functional as cf

(The comment line change is whitespace-only.)

composer/algorithms/scale_schedule/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
-# ️ Scale Schedule
+# 🗜️ Scale Schedule

 This method is deprecated. It has been replaced by the `scale_schedule_ratio` param supported directly by the Composer Trainer. For backwards compatibility, the Composer Trainer detects when this algorithm has been initialized and pulls the `ratio` param accordingly.

composer/cli/launcher.py

Lines changed: 9 additions & 5 deletions

@@ -122,10 +122,12 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int

     for local_rank in range(nproc):
         global_rank = base_rank + local_rank
+        cmd = f"{sys.executable} -u"
         if module_mode:
-            cmd = [sys.executable, '-u', '-m', training_script, *training_script_args]
-        else:
-            cmd = [sys.executable, '-u', training_script, *training_script_args]
+            cmd += " -m"
+        training_script_args_quoted = [f'"{arg}"' for arg in training_script_args]
+
+        cmd += f" {training_script} {' '.join(training_script_args_quoted)}"

         current_env = os.environ.copy()
         current_env["RANK"] = str(global_rank)
@@ -137,15 +139,17 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int
         current_env["MASTER_PORT"] = str(master_port)
         current_env["COMPOSER_RUN_DIRECTORY"] = run_directory

-        log.info("Launching process for local_rank(%s), global_rank(%s)", local_rank, global_rank)
+        log.info("Launching process for local_rank(%s), global_rank(%s) with command(%s)", local_rank, global_rank, cmd)

         if local_rank == 0:
-            process = subprocess.Popen(cmd, env=current_env, text=True)
+            process = subprocess.Popen(cmd, env=current_env, text=True, shell=True)
         else:
            logs_dir = os.path.join(run_directory, f"rank_{global_rank}", "logs")
            os.makedirs(logs_dir, exist_ok=True)
            process = subprocess.Popen(
                cmd,
+                # Using a shell to execute the command, so the env variables will be available to the CLI arguments
+                shell=True,
                env=current_env,
                stdout=open(os.path.join(logs_dir, f"rank_{global_rank}.stdout.txt"), "x"),
                stderr=open(os.path.join(logs_dir, f"rank_{global_rank}.stderr.txt"), "x"),
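
The switch from an argv list to a single command string run with `shell=True` is what lets environment variables embedded in the training script's arguments (such as the `${RANK}` placeholder from `.ci/test.sh` above) expand per rank. A small self-contained sketch of the difference on a POSIX shell (the echo command is illustrative, not from the repo):

```python
import os
import subprocess

env = os.environ.copy()
env["RANK"] = "1"  # the launcher sets this per process

# argv list without a shell: the placeholder is passed through literally
subprocess.run(["echo", "rank=${RANK}"], env=env)  # prints: rank=${RANK}

# command string with shell=True: the shell expands the per-rank variable
subprocess.run('echo "rank=${RANK}"', env=env, shell=True)  # prints: rank=1
```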

composer/core/state.py

Lines changed: 3 additions & 3 deletions

@@ -81,7 +81,7 @@ class State(Serializable):
         precision_context ((precision: Precision) -> ContextManager): Function to produce a context manager to mandate precision.

         optimizers (types.Optimizers, optional): The optimizers being used to train the model. Multiple optimizers are not currently supported.
-        schedulers (types.Schedulers, optional): The learning rate schedulers, typically wrapped in :class:`ComposableScheduler`.
+        schedulers (PyTorchScheduler, optional): The learning rate scheduler (can also be a list or tuple of schedulers).
         scaler (torch.cuda.amp.GradScaler, optional): The gradient scaler in use for mixed precision training.

         algorithms (Sequence[Algorithm]): The algorithms used for training.
@@ -108,7 +108,7 @@ class State(Serializable):
     batch_num_tokens: int
     loss: types.Tensors
     outputs: types.Tensors
-    _schedulers: List[types.Scheduler]
+    _schedulers: List[types.PyTorchScheduler]

     # These attributes will be serialized using .state_dict(), and loaded with .load_state_dict()
     # All other attributes will not be serialized.
@@ -212,7 +212,7 @@ def schedulers(self):
         return self._schedulers

     @schedulers.setter
-    def schedulers(self, schedulers: types.Schedulers):
+    def schedulers(self, schedulers: types.PyTorchScheduler):
         self._schedulers[:] = ensure_tuple(schedulers)

     @property

composer/core/types.py

Lines changed: 1 addition & 2 deletions

@@ -166,8 +166,7 @@ def __len__(self) -> int:
 Metrics = Union[Metric, MetricCollection]
 Optimizer = torch.optim.Optimizer
 Optimizers = Many[Optimizer]
-Scheduler = torch.optim.lr_scheduler._LRScheduler
-Schedulers = Many[Scheduler]
+PyTorchScheduler = torch.optim.lr_scheduler._LRScheduler

 Scaler = torch.cuda.amp.grad_scaler.GradScaler
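
Taken with the `state.py` changes above, the rename drops the plural `Schedulers` alias; `State.schedulers` still accepts one scheduler or a sequence, since its setter normalizes with `ensure_tuple`. A minimal sketch of the alias in use (the SGD/StepLR objects are illustrative, not from the diff):

```python
import torch

# The renamed alias, as defined in composer/core/types.py after this commit
PyTorchScheduler = torch.optim.lr_scheduler._LRScheduler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler: PyTorchScheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

# State.schedulers normalizes either form into a tuple via ensure_tuple:
# state.schedulers = scheduler      # a single scheduler
# state.schedulers = [scheduler]    # or a list/tuple of schedulers
```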
