Commit e87693b

Add defaults to the docstrings for algorithms (#662)
1 parent d98ef84 commit e87693b

20 files changed: +165 −145 lines changed

20 files changed

+165
-145
lines changed

composer/algorithms/alibi/alibi.py

Lines changed: 4 additions & 4 deletions
@@ -80,12 +80,12 @@ def apply_alibi(
  necessary for evaluating on sequence lengths longer than the model was
  initialized to accommodate. Takes positional arguments ``module`` and
  ``max_sequence_length``. For example,
- ``composer.algorithms.alibi._gpt2_alibi.enlarge_mask``. Default = ``None``,
+ ``composer.algorithms.alibi._gpt2_alibi.enlarge_mask``. Default: ``None``,
  which means no modification of the model's default attention mask.
  optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``.
  All optimizers that have already been constructed with
  ``model.parameters()`` must be specified here so they will optimize
- the correct parameters.
+ the correct parameters. Default: ``None``.

  If the optimizer(s) are constructed *after* calling this function,
  then it is safe to omit this parameter. These optimizers will see the correct

@@ -182,12 +182,12 @@ class Alibi(Algorithm):
  max_sequence_length (int): Maximum sequence length that the
  model will be able to accept. This is sometimes necessary for evaluating
  on sequence lengths longer than the model was initialized to
- accommodate.
+ accommodate. Default: ``8192``.
  train_sequence_length_scaling (float, optional): Amount by which to scale
  training sequence length. One batch of training data will be
  reshaped from shape :math:`(sequence\\_length, batch)` to
  :math:`(sequence\\_length \\times train\\_sequence\\_length\\_scaling,
- \\frac{batch}{train\\_sequence\\_length\\_scaling})`. Default = ``0.25``.
+ \\frac{batch}{train\\_sequence\\_length\\_scaling})`. Default: ``0.25``.
  """

  def __init__(self,
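
Illustration (not part of this commit): the reshape described by ``train_sequence_length_scaling``, using the documented default of ``0.25``. Plain PyTorch; the shapes below are hypothetical.

import torch

scaling = 0.25                                 # documented default for train_sequence_length_scaling
tokens = torch.randint(0, 50257, (1024, 8))    # (sequence_length, batch)
seq_len, batch = tokens.shape
# One batch is reshaped from (sequence_length, batch) to
# (sequence_length * scaling, batch / scaling); the element count is unchanged.
reshaped = tokens.reshape(int(seq_len * scaling), int(batch / scaling))
print(reshaped.shape)                          # torch.Size([256, 32])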

composer/algorithms/augmix/augmix.py

Lines changed: 19 additions & 19 deletions
@@ -32,7 +32,7 @@ def augmix_image(img: ImgT,
  augmentation_set: List = augmentation_sets["all"]) -> ImgT:
  """Applies AugMix (`Hendrycks et al, 2020 <http://arxiv.org/abs/1912.02781>`_) data
  augmentation to a single image or batch of images. See
- :class:`~composer.algorithms.augmix.augmix.AugMix` and the
+ :class:`.AugMix` and the
  :doc:`Method Card </method_cards/augmix>` for details. This function only acts on a
  single image (or batch) per call and is unlikely to be used in a training loop. Use
  :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as

@@ -56,12 +56,12 @@ def augmix_image(img: ImgT,

  Args:
  img (PIL.Image): Image or batch of images to be AugMix'd.
- severity (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- depth (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- width (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- alpha (float, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
+ severity (int, optional): See :class:`.AugMix`.
+ depth (int, optional): See :class:`.AugMix`.
+ width (int, optional): See :class:`.AugMix`.
+ alpha (float, optional): See :class:`.AugMix`.
  augmentation_set (str, optional): See
- :class:`~composer.algorithms.augmix.augmix.AugMix`.
+ :class:`.AugMix`.

  Returns:
  PIL.Image: AugMix'd image.

@@ -102,7 +102,7 @@ def _augmix_pil_image(img_pil: PillowImage, severity: int, depth: int, width: in
  class AugmentAndMixTransform(torch.nn.Module):
  """Wrapper module for :func:`~composer.algorithms.augmix.augmix.augmix_image` that can
  be passed to :class:`torchvision.transforms.Compose`. See
- :class:`~composer.algorithms.augmix.augmix.AugMix` and the :doc:`Method Card
+ :class:`.AugMix` and the :doc:`Method Card
  </method_cards/augmix>` for details.

  Example:

@@ -123,12 +123,12 @@ class AugmentAndMixTransform(torch.nn.Module):
  transformed_image = composed(image)

  Args:
- severity (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- depth (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- width (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
- alpha (float, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`.
+ severity (int, optional): See :class:`.AugMix`.
+ depth (int, optional): See :class:`.AugMix`.
+ width (int, optional): See :class:`.AugMix`.
+ alpha (float, optional): See :class:`.AugMix`.
  augmentation_set (str, optional): See
- :class:`~composer.algorithms.augmix.augmix.AugMix`.
+ :class:`.AugMix`.
  """

  def __init__(self,

@@ -167,8 +167,8 @@ class AugMix(Algorithm):
  ``Dirichlet(alpha, alpha, ...)`` distribution. The coefficient for mixing the combined augmented image and the
  original image is drawn from a ``Beta(alpha, alpha)`` distribution, using the same ``alpha``.

- This algorithm runs on on :attr:`~composer.core.event.Event.FIT_START` to insert a dataset transformation. It is a no-op if this algorithm already
- applied itself on the :attr:`State.train_dataloader.dataset`.
+ This algorithm runs on on :attr:`~composer.core.event.Event.FIT_START` to insert a dataset transformation.
+ It is a no-op if this algorithm already applied itself on the :attr:`State.train_dataloader.dataset`.

  See the :doc:`Method Card </method_cards/augmix>` for more details.


@@ -196,14 +196,14 @@ class AugMix(Algorithm):

  Args:
  severity (int, optional): Severity of augmentations; ranges from 0
- (no augmentation) to 10 (most severe). Default = ``3``.
+ (no augmentation) to 10 (most severe). Default: ``3``.
  depth (int, optional): Number of augmentations per sequence. -1 enables stochastic
- depth sampled uniformly from [1, 3]. Default = ``-1``.
- width (int, optional): Number of augmentation sequences. Default = ``3``.
+ depth sampled uniformly from [1, 3]. Default: ``-1``.
+ width (int, optional): Number of augmentation sequences. Default: ``3``.
  alpha (float, optional): Pseudocount for Beta and Dirichlet distributions. Must be
  > 0. Higher values yield mixing coefficients closer to uniform weighting. As
  the value approaches 0, the mixing coefficients approach using only one
- version of each image. Default = ``1.0``.
+ version of each image. Default: ``1.0``.
  augmentation_set (str, optional): Must be one of the following options:

  * ``"augmentations_all"``

@@ -225,7 +225,7 @@ class AugMix(Algorithm):
  "sharpness", and "brightness" that account for diverging effects around 0
  (or 1).

- Default = ``"all"``.
+ Default: ``"all"``.
  """

  # TODO document each value of augmentation_set in more detail; i.e.,
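
Illustration (not part of this commit): a usage sketch of ``AugmentAndMixTransform`` with the documented defaults (``severity=3``, ``depth=-1``, ``width=3``, ``alpha=1.0``, ``augmentation_set="all"``), assuming the class is importable from the module shown above.

from PIL import Image
from torchvision import transforms
from composer.algorithms.augmix import AugmentAndMixTransform

# Keyword names follow the Args list above; all values shown are the documented defaults.
augmix = AugmentAndMixTransform(severity=3, depth=-1, width=3, alpha=1.0, augmentation_set="all")
composed = transforms.Compose([augmix, transforms.ToTensor()])

image = Image.new("RGB", (224, 224))   # placeholder image
transformed_image = composed(image)    # AugMix'd image as a tensor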

composer/algorithms/blurpool/blurpool.py

Lines changed: 6 additions & 6 deletions
@@ -31,15 +31,15 @@ def apply_blurpool(model: torch.nn.Module,
  Args:
  model (torch.nn.Module): the model to modify in-place
  replace_convs (bool, optional): replace strided :class:`torch.nn.Conv2d` modules with
- :class:`.BlurConv2d` modules
+ :class:`.BlurConv2d` modules. Default: ``True``.
  replace_maxpools (bool, optional): replace eligible :class:`torch.nn.MaxPool2d` modules
- with :class:`.BlurMaxPool2d` modules.
+ with :class:`.BlurMaxPool2d` modules. Default: ``True``.
  blur_first (bool, optional): for ``replace_convs``, blur input before the associated
  convolution. When set to ``False``, the convolution is applied with
  a stride of 1 before the blurring, resulting in significant
  overhead (though more closely matching
  `the paper <http://proceedings.mlr.press/v97/zhang19a.html>`_).
- See :class:`.BlurConv2d` for further discussion.
+ See :class:`.BlurConv2d` for further discussion. Default: ``True``.
  optimizers (Optimizers, optional): Existing optimizers bound to
  ``model.parameters()``. All optimizers that have already been
  constructed with ``model.parameters()`` must be specified here so

@@ -82,14 +82,14 @@ class BlurPool(Algorithm):

  Args:
  replace_convs (bool): replace strided :class:`torch.nn.Conv2d` modules with
- :class:`.BlurConv2d` modules
+ :class:`.BlurConv2d` modules. Default: ``True``.
  replace_maxpools (bool): replace eligible :class:`torch.nn.MaxPool2d` modules
- with :class:`.BlurMaxPool2d` modules.
+ with :class:`.BlurMaxPool2d` modules. Default: ``True``.
  blur_first (bool): when ``replace_convs`` is ``True``, blur input before the
  associated convolution. When set to ``False``, the convolution is
  applied with a stride of 1 before the blurring, resulting in
  significant overhead (though more closely matching the paper).
- See :class:`.BlurConv2d` for further discussion.
+ See :class:`.BlurConv2d` for further discussion. Default: ``True``.
  """

  def __init__(self, replace_convs: bool, replace_maxpools: bool, blur_first: bool) -> None:
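
Illustration (not part of this commit): applying the documented defaults (``replace_convs=True``, ``replace_maxpools=True``, ``blur_first=True``) to a torchvision model, assuming ``apply_blurpool`` is importable from the module shown above.

import torchvision.models as models
from composer.algorithms.blurpool import apply_blurpool

model = models.resnet18()
# Replaces strided Conv2d modules with BlurConv2d and eligible MaxPool2d
# modules with BlurMaxPool2d, in-place.
apply_blurpool(model, replace_convs=True, replace_maxpools=True, blur_first=True)
# Construct optimizers after this call, or pass them via ``optimizers=`` so
# they track the replaced parameters.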

composer/algorithms/colout/colout.py

Lines changed: 7 additions & 7 deletions
@@ -45,8 +45,8 @@ def colout_batch(X: ImgT, p_row: float = 0.15, p_col: float = 0.15) -> ImgT:
  X: :class:`PIL.Image.Image` or :class:`torch.Tensor` of image data. In
  the latter case, must be a single image of shape ``CHW`` or a batch
  of images of shape ``NCHW``.
- p_row: Fraction of rows to drop (drop along H).
- p_col: Fraction of columns to drop (drop along W).
+ p_row: Fraction of rows to drop (drop along H). Default: ``0.15``.
+ p_col: Fraction of columns to drop (drop along W). Default: ``0.15``.

  Returns:
  torch.Tensor: Input batch tensor with randomly dropped columns and rows.

@@ -94,8 +94,8 @@ class ColOutTransform:
  transforms = transforms.Compose([colout_transform, transforms.ToTensor()])

  Args:
- p_row (float): Fraction of rows to drop (drop along H).
- p_col (float): Fraction of columns to drop (drop along W).
+ p_row (float): Fraction of rows to drop (drop along H). Default: ``0.15``.
+ p_col (float): Fraction of columns to drop (drop along W). Default: ``0.15``.
  """

  def __init__(self, p_row: float = 0.15, p_col: float = 0.15):

@@ -142,9 +142,9 @@ class ColOut(Algorithm):
  )

  Args:
- p_row (float): Fraction of rows to drop (drop along H).
- p_col (float): Fraction of columns to drop (drop along W).
- batch (bool): Run ColOut at the batch level.
+ p_row (float): Fraction of rows to drop (drop along H). Default: ``0.15``.
+ p_col (float): Fraction of columns to drop (drop along W). Default: ``0.15``.
+ batch (bool): Run ColOut at the batch level. Default: ``True``.
  """

  def __init__(self, p_row: float = 0.15, p_col: float = 0.15, batch: bool = True):
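
Illustration (not part of this commit): dropping rows and columns with the documented defaults (``p_row=0.15``, ``p_col=0.15``), assuming ``colout_batch`` is importable from the module shown above.

import torch
from composer.algorithms.colout import colout_batch

images = torch.rand(8, 3, 224, 224)                      # NCHW batch
smaller = colout_batch(images, p_row=0.15, p_col=0.15)
# About 15% of rows (along H) and columns (along W) are dropped, so the
# spatial size shrinks to roughly 190 x 190; which rows/columns go is random.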

composer/algorithms/cutmix/cutmix.py

Lines changed: 10 additions & 10 deletions
@@ -51,16 +51,16 @@ def cutmix_batch(X: Tensor,
  )

  Args:
- X: input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn
+ X (Tensor): input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn
  are feature dimensions.
- y: target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn
+ y (Tensor): target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn
  are possible target dimensions.
- n_classes: total number of classes.
- alpha: parameter for the beta distribution of the cutmix region size.
- cutmix_lambda: optional, fixed size of cutmix region.
- bbox: optional, predetermined (rx1, ry1, rx2, ry2) coords of the bounding box.
- indices: Permutation of the batch indices `1..B`. Used
- for permuting without randomness.
+ n_classes (int): total number of classes.
+ alpha (float, optional): parameter for the beta distribution of the cutmix region size. Default: ``1``.
+ cutmix_lambda (float, optional): fixed size of cutmix region. Default: ``None``.
+ bbox (tuple, optional): predetermined (rx1, ry1, rx2, ry2) coords of the bounding box. Default: ``None``.
+ indices (Tensor, optional): Permutation of the batch indices `1..B`. Used
+ for permuting without randomness. Default: ``None``.

  Returns:
  X_cutmix: batch of inputs after cutmix has been applied.

@@ -132,11 +132,11 @@ class CutMix(Algorithm):

  Args:
  num_classes (int): the number of classes in the task labels.
- alpha (float): the psuedocount for the Beta distribution used to sample
+ alpha (float, optional): the psuedocount for the Beta distribution used to sample
  area parameters. As ``alpha`` grows, the two samples
  in each pair tend to be weighted more equally. As ``alpha``
  approaches 0 from above, the combination approaches only using
- one element of the pair.
+ one element of the pair. Default: ``1``.
  """

  def __init__(self, num_classes: int, alpha: float = 1.):
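
Illustration (not part of this commit): constructing the algorithm with the documented default ``alpha=1.0``; ``num_classes`` is required and the value below is hypothetical. Assumes ``CutMix`` is importable from ``composer.algorithms``.

from composer.algorithms import CutMix

# alpha=1.0 makes the Beta(alpha, alpha) draw for the cutmix region size uniform.
cutmix = CutMix(num_classes=1000, alpha=1.0)
# Pass the instance to the Trainer's ``algorithms`` list so training batches
# are mixed as described above.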

composer/algorithms/cutout/cutout.py

Lines changed: 4 additions & 2 deletions
@@ -38,11 +38,12 @@ def cutout_batch(X: ImgT, n_holes: int = 1, length: Union[int, float] = 0.5) ->
  X: :class:`PIL.Image.Image` or :class:`torch.Tensor` of image data. In
  the latter case, must be a single image of shape ``CHW`` or a batch
  of images of shape ``NCHW``.
- n_holes: Integer number of holes to cut out
+ n_holes: Integer number of holes to cut out. Default: ``1``.
  length: Side length of the square holes to cut out. Must be greater than
  0. If ``0 < length < 1``, ``length`` is interpreted as a fraction
  of ``min(H, W)`` and converted to ``int(length * min(H, W))``.
  If ``length >= 1``, ``length`` is used as an integer size directly.
+ Default: ``0.5``.

  Returns:
  X_cutout: Batch of images with ``n_holes`` holes of dimension

@@ -91,11 +92,12 @@ class CutOut(Algorithm):

  Args:
  X (Tensor): Batch Tensor image of size (B, C, H, W).
- n_holes: Integer number of holes to cut out
+ n_holes: Integer number of holes to cut out. Default: ``1``.
  length: Side length of the square holes to cut out. Must be greater than
  0. If ``0 < length < 1``, ``length`` is interpreted as a fraction
  of ``min(H, W)`` and converted to ``int(length * min(H, W))``.
  If ``length >= 1``, ``length`` is used as an integer size directly.
+ Default: ``0.5``.
  """

  def __init__(self, n_holes: int = 1, length: Union[int, float] = 0.5):
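
Illustration (not part of this commit): the ``length`` conversion documented above with the default ``0.5`` on 224x224 inputs, assuming ``cutout_batch`` is importable from the module shown above.

import torch
from composer.algorithms.cutout import cutout_batch

images = torch.rand(16, 3, 224, 224)   # NCHW batch
# 0 < length < 1, so the hole side is int(0.5 * min(224, 224)) = 112 pixels.
masked = cutout_batch(images, n_holes=1, length=0.5)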

composer/algorithms/factorize/factorize.py

Lines changed: 12 additions & 8 deletions
@@ -38,27 +38,29 @@ def apply_factorization(model: torch.nn.Module,
  Args:
  model (torch.nn.Module): the model to modify in-place
  factorize_convs (bool, optional): whether to try factorizing :class:`~torch.nn.Conv2d` modules.
+ Default: ``True``.
  factorize_linears (bool, optional): whether to try factorizing :class:`~torch.nn.Linear` modules.
+ Default: ``True``.
  min_channels (int, optional): if a :class:`~torch.nn.Conv2d` module does not have at least
  this many input and output channels, it will be ignored. Modules with
  few channels are unlikely to be accelerated by factorization due
- to poor hardware utilization.
+ to poor hardware utilization. Default: ``512``.
  latent_channels (int or float, optional): number of latent channels to use in factorized
  convolutions. Can be specified as either an integer > 1 or as
  float within [0, 1). In the latter case, the value is
  interpreted as a fraction of ``min(in_channels, out_channels)``
  for each :class:`~torch.nn.Conv2d` module, and is converted to
- the equivalent integer value, with a minimum of 1.
+ the equivalent integer value, with a minimum of 1. Default: ``0.25``.
  min_features (int, optional): if a :class:`~torch.nn.Linear` module does not have at least
  this many input and output features, it will be ignored. Modules with
  few features are unlikely to be accelerated by factorization due
- to poor hardware utilization.
+ to poor hardware utilization. Default: ``512``.
  latent_features (int or float, optional): size of the latent space for factorized linear modules.
  Can be specified as either an integer > 1 or as a float within [0, 0.5).
  In the latter case, the value is interpreted as a fraction of
  ``min(in_features, out_features)`` for each :class:`~torch.nn.Linear`
  module, and is converted to the equivalent integer value, with a
- minimum of 1.
+ minimum of 1. Default: ``0.25``.
  optimizers (Optimizers, optional): Existing optimizers bound to
  ``model.parameters()``. All optimizers that have already been
  constructed with ``model.parameters()`` must be specified here so

@@ -123,27 +125,29 @@ class Factorize(Algorithm):

  Args:
  factorize_convs (bool): whether to try factorizing :class:`~torch.nn.Conv2d` modules.
+ Default: ``True``.
  factorize_linears (bool): whether to try factorizing :class:`~torch.nn.Linear` modules.
+ Default: ``True``.
  min_channels (int): if a :class:`~torch.nn.Conv2d` module does not have at least
  this many input and output channels, it will be ignored. Modules with
  few channels are unlikely to be accelerated by factorization due
- to poor hardware utilization.
+ to poor hardware utilization. Default: ``256``.
  latent_channels (int, float): number of latent channels to use in factorized
  convolutions. Can be specified as either an integer > 1 or as
  float within [0, 1). In the latter case, the value is
  interpreted as a fraction of ``min(in_channels, out_channels)``
  for each :class:`~torch.nn.Conv2d` module, and is converted to
- the equivalent integer value, with a minimum of 1.
+ the equivalent integer value, with a minimum of 1. Default: ``0.25``.
  min_features (int): if a :class:`~torch.nn.Linear` module does not have at least
  this many input and output features, it will be ignored. Modules with
  few features are unlikely to be accelerated by factorization due
- to poor hardware utilization.
+ to poor hardware utilization. Default: ``256``.
  latent_features (int, float): size of the latent space for factorized linear modules.
  Can be specified as either an integer > 1 or as a float within [0, 0.5).
  In the latter case, the value is interpreted as a fraction of
  ``min(in_features, out_features)`` for each :class:`~torch.nn.Linear`
  module, and is converted to the equivalent integer value, with a
- minimum of 1.
+ minimum of 1. Default: ``128``.
  """

  def __init__(self,
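
Illustration (not part of this commit): the fractional ``latent_channels`` conversion documented above, written as a hypothetical helper; ``latent_size`` is not a Composer function.

def latent_size(in_channels: int, out_channels: int, latent_channels: float = 0.25) -> int:
    # A float in [0, 1) is a fraction of min(in_channels, out_channels), floored at 1.
    return max(1, int(latent_channels * min(in_channels, out_channels)))

print(latent_size(512, 512))   # 128 latent channels for a 512 -> 512 Conv2d
print(latent_size(3, 64))      # 1 (the floor applies to very small layers)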
