Commit 7d06c64: merge dev

Parent: 77b2d88
23 files changed, +502 −411 lines

.ci/Jenkinsfile

Lines changed: 27 additions & 11 deletions

@@ -38,6 +38,7 @@ def cloneJenkinsfilesRepo() {
         doGenerateSubmoduleConfigurations: false,
         extensions: [[$class: 'RelativeTargetDirectory', relativeTargetDir: jenkinsfileRepoTargetDir]],
         submoduleCfg: [],
+        changelog: false,
         userRemoteConfigs: [[url: jenkinsfileRepo, credentialsId: gitCredentialsId]]
     ])
     return "$WORKSPACE_TMP/$jenkinsfileRepoTargetDir"
@@ -78,7 +79,7 @@ def runLint(pDockerImage) {
         string(name: 'P_CLOUD', value: pCloud),
         string(name: 'P_GIT_REPO', value: gitUrl),
         string(name: 'P_GIT_COMMIT', value: gitCommit),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '7Gi'),
         string(name: 'P_DOCKER_IMAGE', value: pDockerImage),
         string(name: 'P_TIMEOUT', value: pTimeout),
         string(name: 'P_CPU_LIMIT', value: "2"),
@@ -117,7 +118,7 @@ def runPytest(pDockerImage, gpu, extraDeps) {
         string(name: 'P_MEM_LIMIT', value: memLimit),
         string(name: 'P_TIMEOUT', value: pTimeout),
         string(name: 'P_N_GPUS', value: nGpus),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '2Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
         text(name: 'P_COMMAND', value: "./.ci/test.sh '$extraDeps' '$markers'"),
         string(name: 'P_ARTIFACTS_GLOB', value: artifactsGlob),
         string(name: 'P_JUNIT_GLOB', value: junitGlob),
@@ -179,16 +180,16 @@ stage('Build') {
     pytorchDockerBuildMatrix.each { entry ->
         def command = entry[0] // command is the command to run
         def stagingImage = entry[1] // stagingImage is where the built docker image is pushed
-        def buildArgs = entry[2] // buildArgs is a map of the build arguments passed to kaniko
-        jobs << [ "$buildArgs": { ->
+        def buildConfigListOfTuples = entry[2] // buildConfigListOfTuples is a list of (key, value) pairs of the build args from the matrix
+        jobs << [ "$buildConfigListOfTuples": { ->
             trackBuild(
                 job: jenkinsShellJobName,
                 parameters: [
                     string(name: 'P_CLOUD', value: pCloud),
                     string(name: 'P_GIT_REPO', value: gitUrl),
                     string(name: 'P_GIT_COMMIT', value: gitCommit),
                     string(name: 'P_DOCKER_IMAGE', value: kanikoDockerImage),
-                    string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '16Gi'),
+                    string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
                     text(name: 'P_COMMAND', value: command),
                     string(name: 'P_TIMEOUT', value: pTimeout),
                     string(name: 'P_CPU_LIMIT', value: '4'),
@@ -199,16 +200,31 @@ stage('Build') {
             // no need to run tests again
             return
         }
-        def tag = buildArgs['TAG']
-        def gpu = buildArgs['CUDA_VERSION'] != 'cpu'
+        def gpu = false
+        def isLintImage = false
+        def tag = null
+        buildConfigListOfTuples.each { item ->
+            def key = item[0]
+            def val = item[1]
+
+            if (key == 'CUDA_VERSION') {
+                gpu = val != 'cpu'
+            }
+            if (key == 'TAG') {
+                tag = val
+                // there could be multiple tags
+                isLintImage = isLintImage || tag == lintImage
+            }
+
+        }
         def extraDeps = 'all'
         def subJobs = [
             "Pytest - ${tag}" : { -> runPytest(stagingImage, gpu, extraDeps) }
         ]
-        if (tag == lintImage) {
+        if (isLintImage) {
             // and run lint and a dev install on this image
             subJobs << [
-                "Pytest - ${tag}, extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
+                "Pytest - extraDeps=dev": { -> runPytest(stagingImage, false, 'dev') },
                 "Lint": { -> runLint(stagingImage) },
             ]
         }
@@ -244,9 +260,9 @@ stage('Build') {
         string(name: 'P_CLOUD', value: pCloud),
         string(name: 'P_GIT_REPO', value: gitUrl),
         string(name: 'P_GIT_COMMIT', value: gitCommit),
-        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '8Gi'),
+        string(name: 'P_EPHEMERAL_STORAGE_LIMIT', value: '32Gi'),
         string(name: 'P_DOCKER_IMAGE', value: condaBuildDockerImage),
-        string(name: 'P_TIMEOUT', value: pTimeout),
+        string(name: 'P_TIMEOUT', value: '3600'), // Conda builds take longer
         string(name: 'P_CPU_LIMIT', value: "4"),
         string(name: 'P_MEM_LIMIT', value: "8Gi"),
         string(name: 'P_COMMAND', value: "./.ci/build_conda.sh")
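
The core change above is that a build-matrix entry is now a list of (key, value) pairs rather than a map, so the Groovy scans the list to derive the GPU flag and lint-image flag. As a minimal Python sketch of the same scan (the sample pairs and the lint image name here are made up for illustration, not taken from the repo):

```python
# Illustrative equivalent of the new Groovy loop over buildConfigListOfTuples.
build_config_list_of_tuples = [("CUDA_VERSION", "cpu"), ("TAG", "lint-image")]
lint_image = "lint-image"  # hypothetical value of the Jenkinsfile's lintImage

gpu = False
is_lint_image = False
tag = None
for key, val in build_config_list_of_tuples:
    if key == "CUDA_VERSION":
        gpu = val != "cpu"
    if key == "TAG":
        tag = val
        # an entry can carry multiple TAG pairs; remember if any of them matches
        is_lint_image = is_lint_image or tag == lint_image

print(gpu, tag, is_lint_image)  # False lint-image True
```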

.ci/test.sh

Lines changed: 2 additions & 1 deletion

@@ -19,7 +19,8 @@ fi
 JUNIT_PREFIX=build/output/${BUILD_NUMBER}
 mkdir -p $(dirname $JUNIT_PREFIX)
 make test PYTEST="coverage run -m pytest" DURATION=all EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n0.junit.xml -v -m '$MARKERS'"
-make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.n2.junit.xml -v -m '$MARKERS'"
+RANK_ARG='\$${RANK}' # escape RANK from the makefile and the makefile shell command
+make test-dist PYTEST="coverage run -m pytest" DURATION=all WORLD_SIZE=2 EXTRA_ARGS="--junitxml $JUNIT_PREFIX.${RANK_ARG}_n2.junit.xml -v -m '$MARKERS'"

 # Combine the coverage reports
 python -m coverage combine
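
The double escape in `RANK_ARG` exists because the string passes through several expansion layers before each rank's process sees it. A rough model of those layers, as a Python sketch (the exact layering is our reading of the inline "escape RANK" comment, not something the diff spells out):

```python
# Rough model of how '\$${RANK}' survives each expansion layer.
s = r"\$${RANK}"          # test.sh single-quotes the value, so bash leaves it alone
s = s.replace("$$", "$")  # make collapses '$$' to '$'         -> '\${RANK}'
s = s.replace(r"\$", "$") # the recipe's shell strips the '\'  -> '${RANK}'
# '${RANK}' then reaches the composer launcher, which (see launcher.py below)
# now runs each rank's command through a shell, so the per-rank RANK env var
# finally expands inside the junit XML filename.
print(s)  # ${RANK}
```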

Makefile

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@ dirs := composer examples tests

 # run this to autoformat your code
 style:
-	$(PYTHON) -m isort -i $(dirs)
+	$(PYTHON) -m isort $(dirs)
 	$(PYTHON) -m yapf -rip $(dirs)
 	$(PYTHON) -m docformatter -ri --wrap-summaries 120 --wrap-descriptions 120 $(dirs)

README.md

Lines changed: 42 additions & 6 deletions

@@ -30,7 +30,7 @@
 <a href="https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg">
   <img alt="Chat @ Slack" src="https://img.shields.io/badge/slack-chat-2eb67d.svg?logo=slack">
 </a>
-<a href="https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg">
+<a href="https://github.com/mosaicml/composer/blob/dev/LICENSE">
   <img alt="License" src="https://img.shields.io/badge/License-Apache%202.0-green.svg?logo=slack">
 </a>
 </p>
@@ -40,8 +40,8 @@
 <p align="center">Composer provides well-engineered implementations of efficient training methods to give the tools that help you train <b>a better model for cheaper</b>.</p>

 <p align="center">
-  <img src="docs/images/cost_graph_light.svg#gh-light-mode-only" width="80%"/>
-  <img src="docs/images/cost_graph_dark.svg#gh-dark-mode-only" width="80%"/>
+  <img src="docs/images/cost_graph_light.svg#gh-light-mode-only" width="100%"/>
+  <img src="docs/images/cost_graph_dark.svg#gh-dark-mode-only" width="100%"/>
 </p>

 Using Composer, you can:
@@ -78,7 +78,7 @@ conda install mosaicml

 Composer provides both a **Functional API** (similar to `torch.nn.functional`) and a **Trainer** (that abstracts away the training loop) to provide flexibility to users.

-#### Example: Functional API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1HIxLs61pyf0ln7MlnrGYvkNHq1uVbNWu?usp=sharing)
+#### Example: Functional API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb)

 For users who choose to use their own training loop, we provide state-less functional implementations of our algorithms for a end-user to integrate.
@@ -104,9 +104,9 @@ for epoch in range(NUM_EPOCHS):
     optimizer.step()
 ```

-See the official [Composer Functional API Colab notebook](https://colab.research.google.com/drive/1HIxLs61pyf0ln7MlnrGYvkNHq1uVbNWu?usp=sharing) for more.
+See the official [Composer Functional API Colab notebook](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb) for more.

-#### Example: Trainer [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/12Dl0NVDaj4tf4gfpfg-rkIAoO_H7edo3/edit)
+#### Example: Trainer [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/up_and_running_with_composer.ipynb)

 For maximal speedups, we recommend using our Trainer, which manages handling user state, performant algorithm implementations, and provides useful engineering abstractions to permit rapid experimentation.
@@ -285,6 +285,42 @@ We welcome any comments, feedback, or contributions to Composer! Please do not h

 ## Learn More

+Here's some resources actively maintained by the Composer community to help you get started:
+<table>
+<thead>
+  <tr>
+    <th><b>Resource</b></th>
+    <th><b>Details</b></th>
+  </tr>
+</thead>
+<tbody>
+  <tr>
+    <td><a href="https://www.mosaicml.com/blog/founders-blog" target="_blank" rel="noopener noreferrer">Founder's Blog</a></td>
+    <td>A blog post by our founders highlighting why MosaicML exists</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/up_and_running_with_composer.ipynb" target="_blank" rel="noopener noreferrer">Getting started with our Trainer</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about our Trainer</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/Composer_Functional.ipynb" target="_blank" rel="noopener noreferrer">Getting started with our Functional API</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about our Functional API</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/custom_method_tutorial.ipynb" target="_blank" rel="noopener noreferrer">Building Speedup Methods</a></td>
+    <td>An interactive Colab Notebook aimed at teaching users about building speedup methods on top of Composer</td>
+  </tr>
+  <tr>
+    <td><a href="https://colab.research.google.com/github/mosaicml/composer/blob/dev/notebooks/nlp_notebook_tutorial.ipynb" target="_blank" rel="noopener noreferrer">Training BERTs with Composer</a></td>
+    <td>An interactive Colab Notebook aimed at helping users learn how to train BERT models with Composer!</td>
+  </tr>
+  <tr>
+    <td><a href="https://mosaicml.com/jobs" target="_blank" rel="noopener noreferrer">We're Hiring!</a></td>
+    <td>Join us! 🤩</td>
+  </tr>
+</tbody>
+</table>
+
 If you have any questions, please feel free to reach out to us on [Twiter](https://twitter.com/mosaicml), [email](mailto:[email protected]), or our [Community Slack](https://join.slack.com/t/mosaicml-community/shared_invite/zt-w0tiddn9-WGTlRpfjcO9J5jyrMub1dg)!

 ## Contributors

composer/algorithms/cutmix/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-# 🎃 CutMix
+# ✂️ CutMix

 [\[How to Use\]](#how-to-use) - [\[Suggested Hyperparameters\]](#suggested-hyperparameters) - [\[Technical Details\]](#technical-details) - [\[Attribution\]](#attribution)

composer/algorithms/cutout/README.md

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-# ✂️ Cutout
+# 🎃 Cutout

 [\[How to Use\]](#how-to-use) - [\[Suggested Hyperparameters\]](#suggested-hyperparameters) - [\[Technical Details\]](#technical-details) - [\[Attribution\]](#attribution)

@@ -16,7 +16,7 @@ It is a regularization technique that improves the accuracy of models for comput
 ### Functional Interface

 ```python
-# Run the CutOut algorithm directly on the batch data using the Composer functional API 
+# Run the CutOut algorithm directly on the batch data using the Composer functional API

 from composer import functional as cf

(The comment line change is whitespace-only.)

composer/algorithms/scale_schedule/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,3 +1,3 @@
-# ️ Scale Schedule
+# 🗜️ Scale Schedule

 This method is deprecated. It has been replaced by the `scale_schedule_ratio` param supported directly by the Composer Trainer. For backwards compatibility, the Composer Trainer detects when this algorithm has been initialized and pulls the `ratio` param accordingly.

composer/cli/launcher.py

Lines changed: 9 additions & 5 deletions

@@ -122,10 +122,12 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int

     for local_rank in range(nproc):
         global_rank = base_rank + local_rank
+        cmd = f"{sys.executable} -u"
         if module_mode:
-            cmd = [sys.executable, '-u', '-m', training_script, *training_script_args]
-        else:
-            cmd = [sys.executable, '-u', training_script, *training_script_args]
+            cmd += " -m"
+        training_script_args_quoted = [f'"{arg}"' for arg in training_script_args]
+
+        cmd += f" {training_script} {' '.join(training_script_args_quoted)}"

         current_env = os.environ.copy()
         current_env["RANK"] = str(global_rank)
@@ -137,15 +139,17 @@ def launch_processes(nproc: int, world_size: int, base_rank: int, node_rank: int
         current_env["MASTER_PORT"] = str(master_port)
         current_env["COMPOSER_RUN_DIRECTORY"] = run_directory

-        log.info("Launching process for local_rank(%s), global_rank(%s)", local_rank, global_rank)
+        log.info("Launching process for local_rank(%s), global_rank(%s) with command(%s)", local_rank, global_rank, cmd)

         if local_rank == 0:
-            process = subprocess.Popen(cmd, env=current_env, text=True)
+            process = subprocess.Popen(cmd, env=current_env, text=True, shell=True)
         else:
            logs_dir = os.path.join(run_directory, f"rank_{global_rank}", "logs")
            os.makedirs(logs_dir, exist_ok=True)
            process = subprocess.Popen(
                cmd,
+                # Using a shell to execute the command, so the env variables will be available to the CLI arguments
+                shell=True,
                env=current_env,
                stdout=open(os.path.join(logs_dir, f"rank_{global_rank}.stdout.txt"), "x"),
                stderr=open(os.path.join(logs_dir, f"rank_{global_rank}.stderr.txt"), "x"),
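
The switch from an argv list to a single command string run with `shell=True` is what lets environment variables embedded in the training script's arguments (such as the `${RANK}` placeholder from `.ci/test.sh` above) expand per rank. A small self-contained sketch of the difference on a POSIX shell (the echo command is illustrative, not from the repo):

```python
import os
import subprocess

env = os.environ.copy()
env["RANK"] = "1"  # the launcher sets this per process

# argv list without a shell: the placeholder is passed through literally
subprocess.run(["echo", "rank=${RANK}"], env=env)  # prints: rank=${RANK}

# command string with shell=True: the shell expands the per-rank variable
subprocess.run('echo "rank=${RANK}"', env=env, shell=True)  # prints: rank=1
```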

composer/core/state.py

Lines changed: 3 additions & 3 deletions

@@ -81,7 +81,7 @@ class State(Serializable):
         precision_context ((precision: Precision) -> ContextManager): Function to produce a context manager to mandate precision.

         optimizers (types.Optimizers, optional): The optimizers being used to train the model. Multiple optimizers are not currently supported.
-        schedulers (types.Schedulers, optional): The learning rate schedulers, typically wrapped in :class:`ComposableScheduler`.
+        schedulers (PyTorchScheduler, optional): The learning rate scheduler (can also be a list or tuple of schedulers).
         scaler (torch.cuda.amp.GradScaler, optional): The gradient scaler in use for mixed precision training.

         algorithms (Sequence[Algorithm]): The algorithms used for training.
@@ -108,7 +108,7 @@ class State(Serializable):
     batch_num_tokens: int
     loss: types.Tensors
     outputs: types.Tensors
-    _schedulers: List[types.Scheduler]
+    _schedulers: List[types.PyTorchScheduler]

     # These attributes will be serialized using .state_dict(), and loaded with .load_state_dict()
     # All other attributes will not be serialized.
@@ -212,7 +212,7 @@ def schedulers(self):
         return self._schedulers

     @schedulers.setter
-    def schedulers(self, schedulers: types.Schedulers):
+    def schedulers(self, schedulers: types.PyTorchScheduler):
         self._schedulers[:] = ensure_tuple(schedulers)

     @property

composer/core/types.py

Lines changed: 1 addition & 2 deletions

@@ -166,8 +166,7 @@ def __len__(self) -> int:
 Metrics = Union[Metric, MetricCollection]
 Optimizer = torch.optim.Optimizer
 Optimizers = Many[Optimizer]
-Scheduler = torch.optim.lr_scheduler._LRScheduler
-Schedulers = Many[Scheduler]
+PyTorchScheduler = torch.optim.lr_scheduler._LRScheduler

 Scaler = torch.cuda.amp.grad_scaler.GradScaler
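
Taken with the `state.py` changes above, the rename drops the plural `Schedulers` alias; `State.schedulers` still accepts one scheduler or a sequence, since its setter normalizes with `ensure_tuple`. A minimal sketch of the alias in use (the SGD/StepLR objects are illustrative, not from the diff):

```python
import torch

# The renamed alias, as defined in composer/core/types.py after this commit
PyTorchScheduler = torch.optim.lr_scheduler._LRScheduler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler: PyTorchScheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10)

# State.schedulers normalizes either form into a tuple via ensure_tuple:
# state.schedulers = scheduler      # a single scheduler
# state.schedulers = [scheduler]    # or a list/tuple of schedulers
```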
