diff --git a/composer/algorithms/alibi/alibi.py b/composer/algorithms/alibi/alibi.py index 0f8e8e7ed7..98dac340db 100755 --- a/composer/algorithms/alibi/alibi.py +++ b/composer/algorithms/alibi/alibi.py @@ -80,12 +80,12 @@ def apply_alibi( necessary for evaluating on sequence lengths longer than the model was initialized to accommodate. Takes positional arguments ``module`` and ``max_sequence_length``. For example, - ``composer.algorithms.alibi._gpt2_alibi.enlarge_mask``. Default = ``None``, + ``composer.algorithms.alibi._gpt2_alibi.enlarge_mask``. Default: ``None``, which means no modification of the model's default attention mask. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so they will optimize - the correct parameters. + the correct parameters. Default: ``None``. If the optimizer(s) are constructed *after* calling this function, then it is safe to omit this parameter. These optimizers will see the correct @@ -182,12 +182,12 @@ class Alibi(Algorithm): max_sequence_length (int): Maximum sequence length that the model will be able to accept. This is sometimes necessary for evaluating on sequence lengths longer than the model was initialized to - accommodate. + accommodate. Default: ``8192``. train_sequence_length_scaling (float, optional): Amount by which to scale training sequence length. One batch of training data will be reshaped from shape :math:`(sequence\\_length, batch)` to :math:`(sequence\\_length \\times train\\_sequence\\_length\\_scaling, - \\frac{batch}{train\\_sequence\\_length\\_scaling})`. Default = ``0.25``. + \\frac{batch}{train\\_sequence\\_length\\_scaling})`. Default: ``0.25``. """ def __init__(self, diff --git a/composer/algorithms/augmix/augmix.py b/composer/algorithms/augmix/augmix.py index bfe6917d19..e63f8fbc6b 100755 --- a/composer/algorithms/augmix/augmix.py +++ b/composer/algorithms/augmix/augmix.py @@ -32,7 +32,7 @@ def augmix_image(img: ImgT, augmentation_set: List = augmentation_sets["all"]) -> ImgT: """Applies AugMix (`Hendrycks et al, 2020 `_) data augmentation to a single image or batch of images. See - :class:`~composer.algorithms.augmix.augmix.AugMix` and the + :class:`.AugMix` and the :doc:`Method Card ` for details. This function only acts on a single image (or batch) per call and is unlikely to be used in a training loop. Use :class:`~composer.algorithms.augmix.augmix.AugmentAndMixTransform` to use AugMix as @@ -56,12 +56,12 @@ def augmix_image(img: ImgT, Args: img (PIL.Image): Image or batch of images to be AugMix'd. - severity (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - depth (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - width (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - alpha (float, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. + severity (int, optional): See :class:`.AugMix`. + depth (int, optional): See :class:`.AugMix`. + width (int, optional): See :class:`.AugMix`. + alpha (float, optional): See :class:`.AugMix`. augmentation_set (str, optional): See - :class:`~composer.algorithms.augmix.augmix.AugMix`. + :class:`.AugMix`. Returns: PIL.Image: AugMix'd image. 
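For context on the AugMix hunks above: the mixing arithmetic the docstring describes fits in a few lines. The following is a minimal sketch assuming NumPy, not the library's implementation; `_augment_once` is a hypothetical stand-in for one op from the augmentation set.

```python
import numpy as np

def _augment_once(img: np.ndarray, severity: int) -> np.ndarray:
    # Hypothetical stand-in for one augmentation op: a brightness jitter
    # whose magnitude grows with severity. The real op set is much richer.
    jitter = 1.0 + 0.03 * severity * (np.random.rand() - 0.5)
    return np.clip(img * jitter, 0.0, 255.0)

def augmix_sketch(img: np.ndarray, severity: int = 3, depth: int = -1,
                  width: int = 3, alpha: float = 1.0) -> np.ndarray:
    # Per-chain mixing weights are drawn from Dirichlet(alpha, ..., alpha).
    chain_weights = np.random.dirichlet([alpha] * width)
    mixed = np.zeros_like(img, dtype=np.float64)
    for w in chain_weights:
        chain = img.astype(np.float64)
        # depth == -1 means a stochastic depth sampled uniformly from [1, 3].
        n_ops = depth if depth > 0 else np.random.randint(1, 4)
        for _ in range(n_ops):
            chain = _augment_once(chain, severity)
        mixed += w * chain
    # The final image interpolates the original with the mix using a
    # Beta(alpha, alpha) coefficient, reusing the same alpha.
    m = np.random.beta(alpha, alpha)
    return m * img.astype(np.float64) + (1.0 - m) * mixed
```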
@@ -102,7 +102,7 @@ def _augmix_pil_image(img_pil: PillowImage, severity: int, depth: int, width: in class AugmentAndMixTransform(torch.nn.Module): """Wrapper module for :func:`~composer.algorithms.augmix.augmix.augmix_image` that can be passed to :class:`torchvision.transforms.Compose`. See - :class:`~composer.algorithms.augmix.augmix.AugMix` and the :doc:`Method Card + :class:`.AugMix` and the :doc:`Method Card ` for details. Example: @@ -123,12 +123,12 @@ class AugmentAndMixTransform(torch.nn.Module): transformed_image = composed(image) Args: - severity (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - depth (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - width (int, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. - alpha (float, optional): See :class:`~composer.algorithms.augmix.augmix.AugMix`. + severity (int, optional): See :class:`.AugMix`. + depth (int, optional): See :class:`.AugMix`. + width (int, optional): See :class:`.AugMix`. + alpha (float, optional): See :class:`.AugMix`. augmentation_set (str, optional): See - :class:`~composer.algorithms.augmix.augmix.AugMix`. + :class:`.AugMix`. """ def __init__(self, @@ -167,8 +167,8 @@ class AugMix(Algorithm): ``Dirichlet(alpha, alpha, ...)`` distribution. The coefficient for mixing the combined augmented image and the original image is drawn from a ``Beta(alpha, alpha)`` distribution, using the same ``alpha``. - This algorithm runs on on :attr:`~composer.core.event.Event.FIT_START` to insert a dataset transformation. It is a no-op if this algorithm already - applied itself on the :attr:`State.train_dataloader.dataset`. + This algorithm runs on :attr:`~composer.core.event.Event.FIT_START` to insert a dataset transformation. + It is a no-op if this algorithm has already applied itself to the :attr:`State.train_dataloader.dataset`. See the :doc:`Method Card ` for more details. @@ -196,14 +196,14 @@ class AugMix(Algorithm): Args: severity (int, optional): Severity of augmentations; ranges from 0 - (no augmentation) to 10 (most severe). Default = ``3``. + (no augmentation) to 10 (most severe). Default: ``3``. depth (int, optional): Number of augmentations per sequence. -1 enables stochastic - depth sampled uniformly from [1, 3]. Default = ``-1``. - width (int, optional): Number of augmentation sequences. Default = ``3``. + depth sampled uniformly from [1, 3]. Default: ``-1``. + width (int, optional): Number of augmentation sequences. Default: ``3``. alpha (float, optional): Pseudocount for Beta and Dirichlet distributions. Must be > 0. Higher values yield mixing coefficients closer to uniform weighting. As the value approaches 0, the mixing coefficients approach using only one - version of each image. Default = ``1.0``. + version of each image. Default: ``1.0``. augmentation_set (str, optional): Must be one of the following options: * ``"augmentations_all"`` @@ -225,7 +225,7 @@ class AugMix(Algorithm): "sharpness", and "brightness" that account for diverging effects around 0 (or 1). - Default = ``"all"``. + Default: ``"all"``.
""" # TODO document each value of augmentation_set in more detail; i.e., diff --git a/composer/algorithms/blurpool/blurpool.py b/composer/algorithms/blurpool/blurpool.py index b49e7e4bd3..ba162a9728 100644 --- a/composer/algorithms/blurpool/blurpool.py +++ b/composer/algorithms/blurpool/blurpool.py @@ -31,15 +31,15 @@ def apply_blurpool(model: torch.nn.Module, Args: model (torch.nn.Module): the model to modify in-place replace_convs (bool, optional): replace strided :class:`torch.nn.Conv2d` modules with - :class:`.BlurConv2d` modules + :class:`.BlurConv2d` modules. Default: ``True``. replace_maxpools (bool, optional): replace eligible :class:`torch.nn.MaxPool2d` modules - with :class:`.BlurMaxPool2d` modules. + with :class:`.BlurMaxPool2d` modules. Default: ``True``. blur_first (bool, optional): for ``replace_convs``, blur input before the associated convolution. When set to ``False``, the convolution is applied with a stride of 1 before the blurring, resulting in significant overhead (though more closely matching `the paper `_). - See :class:`.BlurConv2d` for further discussion. + See :class:`.BlurConv2d` for further discussion. Default: ``True``. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so @@ -82,14 +82,14 @@ class BlurPool(Algorithm): Args: replace_convs (bool): replace strided :class:`torch.nn.Conv2d` modules with - :class:`.BlurConv2d` modules + :class:`.BlurConv2d` modules. Default: ``True``. replace_maxpools (bool): replace eligible :class:`torch.nn.MaxPool2d` modules - with :class:`.BlurMaxPool2d` modules. + with :class:`.BlurMaxPool2d` modules. Default: ``True``. blur_first (bool): when ``replace_convs`` is ``True``, blur input before the associated convolution. When set to ``False``, the convolution is applied with a stride of 1 before the blurring, resulting in significant overhead (though more closely matching the paper). - See :class:`.BlurConv2d` for further discussion. + See :class:`.BlurConv2d` for further discussion. Default: ``True``. """ def __init__(self, replace_convs: bool, replace_maxpools: bool, blur_first: bool) -> None: diff --git a/composer/algorithms/colout/colout.py b/composer/algorithms/colout/colout.py index e5cbc50bc8..7952beab10 100644 --- a/composer/algorithms/colout/colout.py +++ b/composer/algorithms/colout/colout.py @@ -45,8 +45,8 @@ def colout_batch(X: ImgT, p_row: float = 0.15, p_col: float = 0.15) -> ImgT: X: :class:`PIL.Image.Image` or :class:`torch.Tensor` of image data. In the latter case, must be a single image of shape ``CHW`` or a batch of images of shape ``NCHW``. - p_row: Fraction of rows to drop (drop along H). - p_col: Fraction of columns to drop (drop along W). + p_row: Fraction of rows to drop (drop along H). Default: ``0.15``. + p_col: Fraction of columns to drop (drop along W). Default: ``0.15``. Returns: torch.Tensor: Input batch tensor with randomly dropped columns and rows. @@ -94,8 +94,8 @@ class ColOutTransform: transforms = transforms.Compose([colout_transform, transforms.ToTensor()]) Args: - p_row (float): Fraction of rows to drop (drop along H). - p_col (float): Fraction of columns to drop (drop along W). + p_row (float): Fraction of rows to drop (drop along H). Default: ``0.15``. + p_col (float): Fraction of columns to drop (drop along W). Default: ``0.15``. 
""" def __init__(self, p_row: float = 0.15, p_col: float = 0.15): @@ -142,9 +142,9 @@ class ColOut(Algorithm): ) Args: - p_row (float): Fraction of rows to drop (drop along H). - p_col (float): Fraction of columns to drop (drop along W). - batch (bool): Run ColOut at the batch level. + p_row (float): Fraction of rows to drop (drop along H). Default: ``0.15``. + p_col (float): Fraction of columns to drop (drop along W). Default: ``0.15``. + batch (bool): Run ColOut at the batch level. Default: ``True``. """ def __init__(self, p_row: float = 0.15, p_col: float = 0.15, batch: bool = True): diff --git a/composer/algorithms/cutmix/cutmix.py b/composer/algorithms/cutmix/cutmix.py index 850b3dd500..64ab3367b2 100644 --- a/composer/algorithms/cutmix/cutmix.py +++ b/composer/algorithms/cutmix/cutmix.py @@ -51,16 +51,16 @@ def cutmix_batch(X: Tensor, ) Args: - X: input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn + X (Tensor): input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn are feature dimensions. - y: target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn + y (Tensor): target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn are possible target dimensions. - n_classes: total number of classes. - alpha: parameter for the beta distribution of the cutmix region size. - cutmix_lambda: optional, fixed size of cutmix region. - bbox: optional, predetermined (rx1, ry1, rx2, ry2) coords of the bounding box. - indices: Permutation of the batch indices `1..B`. Used - for permuting without randomness. + n_classes (int): total number of classes. + alpha (float, optional): parameter for the beta distribution of the cutmix region size. Default: ``1``. + cutmix_lambda (float, optional): fixed size of cutmix region. Default: ``None``. + bbox (tuple, optional): predetermined (rx1, ry1, rx2, ry2) coords of the bounding box. Default: ``None``. + indices (Tensor, optional): Permutation of the batch indices `1..B`. Used + for permuting without randomness. Default: ``None``. Returns: X_cutmix: batch of inputs after cutmix has been applied. @@ -132,11 +132,11 @@ class CutMix(Algorithm): Args: num_classes (int): the number of classes in the task labels. - alpha (float): the psuedocount for the Beta distribution used to sample + alpha (float, optional): the psuedocount for the Beta distribution used to sample area parameters. As ``alpha`` grows, the two samples in each pair tend to be weighted more equally. As ``alpha`` approaches 0 from above, the combination approaches only using - one element of the pair. + one element of the pair. Default: ``1``. """ def __init__(self, num_classes: int, alpha: float = 1.): diff --git a/composer/algorithms/cutout/cutout.py b/composer/algorithms/cutout/cutout.py index 65d18a1c28..fc491c8c76 100644 --- a/composer/algorithms/cutout/cutout.py +++ b/composer/algorithms/cutout/cutout.py @@ -38,11 +38,12 @@ def cutout_batch(X: ImgT, n_holes: int = 1, length: Union[int, float] = 0.5) -> X: :class:`PIL.Image.Image` or :class:`torch.Tensor` of image data. In the latter case, must be a single image of shape ``CHW`` or a batch of images of shape ``NCHW``. - n_holes: Integer number of holes to cut out + n_holes: Integer number of holes to cut out. Default: ``1``. length: Side length of the square holes to cut out. Must be greater than 0. If ``0 < length < 1``, ``length`` is interpreted as a fraction of ``min(H, W)`` and converted to ``int(length * min(H, W))``. If ``length >= 1``, ``length`` is used as an integer size directly. 
+ Default: ``0.5``. Returns: X_cutout: Batch of images with ``n_holes`` holes of dimension @@ -91,11 +92,12 @@ class CutOut(Algorithm): Args: X (Tensor): Batch Tensor image of size (B, C, H, W). - n_holes: Integer number of holes to cut out + n_holes: Integer number of holes to cut out. Default: ``1``. length: Side length of the square holes to cut out. Must be greater than 0. If ``0 < length < 1``, ``length`` is interpreted as a fraction of ``min(H, W)`` and converted to ``int(length * min(H, W))``. If ``length >= 1``, ``length`` is used as an integer size directly. + Default: ``0.5``. """ def __init__(self, n_holes: int = 1, length: Union[int, float] = 0.5): diff --git a/composer/algorithms/factorize/factorize.py b/composer/algorithms/factorize/factorize.py index a9a865f41e..e76d96daef 100644 --- a/composer/algorithms/factorize/factorize.py +++ b/composer/algorithms/factorize/factorize.py @@ -38,27 +38,29 @@ def apply_factorization(model: torch.nn.Module, Args: model (torch.nn.Module): the model to modify in-place factorize_convs (bool, optional): whether to try factorizing :class:`~torch.nn.Conv2d` modules. + Default: ``True``. factorize_linears (bool, optional): whether to try factorizing :class:`~torch.nn.Linear` modules. + Default: ``True``. min_channels (int, optional): if a :class:`~torch.nn.Conv2d` module does not have at least this many input and output channels, it will be ignored. Modules with few channels are unlikely to be accelerated by factorization due - to poor hardware utilization. + to poor hardware utilization. Default: ``512``. latent_channels (int or float, optional): number of latent channels to use in factorized convolutions. Can be specified as either an integer > 1 or as float within [0, 1). In the latter case, the value is interpreted as a fraction of ``min(in_channels, out_channels)`` for each :class:`~torch.nn.Conv2d` module, and is converted to - the equivalent integer value, with a minimum of 1. + the equivalent integer value, with a minimum of 1. Default: ``0.25``. min_features (int, optional): if a :class:`~torch.nn.Linear` module does not have at least this many input and output features, it will be ignored. Modules with few features are unlikely to be accelerated by factorization due - to poor hardware utilization. + to poor hardware utilization. Default: ``512``. latent_features (int or float, optional): size of the latent space for factorized linear modules. Can be specified as either an integer > 1 or as a float within [0, 0.5). In the latter case, the value is interpreted as a fraction of ``min(in_features, out_features)`` for each :class:`~torch.nn.Linear` module, and is converted to the equivalent integer value, with a - minimum of 1. + minimum of 1. Default: ``0.25``. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so @@ -123,27 +125,29 @@ class Factorize(Algorithm): Args: factorize_convs (bool): whether to try factorizing :class:`~torch.nn.Conv2d` modules. + Default: ``True``. factorize_linears (bool): whether to try factorizing :class:`~torch.nn.Linear` modules. + Default: ``True``. min_channels (int): if a :class:`~torch.nn.Conv2d` module does not have at least this many input and output channels, it will be ignored. Modules with few channels are unlikely to be accelerated by factorization due - to poor hardware utilization. + to poor hardware utilization. Default: ``256``. 
latent_channels (int, float): number of latent channels to use in factorized convolutions. Can be specified as either an integer > 1 or as float within [0, 1). In the latter case, the value is interpreted as a fraction of ``min(in_channels, out_channels)`` for each :class:`~torch.nn.Conv2d` module, and is converted to - the equivalent integer value, with a minimum of 1. + the equivalent integer value, with a minimum of 1. Default: ``0.25``. min_features (int): if a :class:`~torch.nn.Linear` module does not have at least this many input and output features, it will be ignored. Modules with few features are unlikely to be accelerated by factorization due - to poor hardware utilization. + to poor hardware utilization. Default: ``256``. latent_features (int, float): size of the latent space for factorized linear modules. Can be specified as either an integer > 1 or as a float within [0, 0.5). In the latter case, the value is interpreted as a fraction of ``min(in_features, out_features)`` for each :class:`~torch.nn.Linear` module, and is converted to the equivalent integer value, with a - minimum of 1. + minimum of 1. Default: ``128``. """ def __init__(self, diff --git a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py index 82cf891b45..07170a6ccb 100644 --- a/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py +++ b/composer/algorithms/ghost_batchnorm/ghost_batchnorm.py @@ -14,7 +14,6 @@ log = logging.getLogger(__name__) -_DEFAULT_GHOST_BATCH_SIZE = 32 _TORCH_BATCHNORM_BASE_CLASS = torch.nn.modules.batchnorm._BatchNorm @@ -29,7 +28,7 @@ def apply_ghost_batchnorm(model: torch.nn.Module, Args: model (torch.nn.Module): the model to modify in-place - ghost_batch_size (int, optional): size of sub-batches to normalize over + ghost_batch_size (int, optional): size of sub-batches to normalize over. Default: ``32``. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so they will optimize @@ -73,10 +72,10 @@ class GhostBatchNorm(Algorithm): Runs on :attr:`~composer.core.event.Event.INIT`. Args: - ghost_batch_size (int): size of sub-batches to normalize over + ghost_batch_size (int, optional): size of sub-batches to normalize over. Default: ``32``. """ - def __init__(self, ghost_batch_size: int = _DEFAULT_GHOST_BATCH_SIZE): + def __init__(self, ghost_batch_size: int = 32): self.ghost_batch_size = ghost_batch_size def match(self, event: Event, state: State) -> bool: @@ -140,9 +139,9 @@ class _GhostBatchNorm(torch.nn.Module): `torch.nn.BatchNorm3d `_. Args: - ghost_batch_size: the size of the chunks passed into the underlying - batch normalization base_batchnorm: A batch normalization module to be applied to each chunk + ghost_batch_size (int, optional): the size of the chunks passed into the underlying + batch normalization. Default: ``32``. Raises: ValueError: If ``ghost_batch_size`` exceeds the number of samples in @@ -151,7 +150,7 @@ class _GhostBatchNorm(torch.nn.Module): much smaller than the overall batch size. 
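To make the ghost-batch semantics documented above concrete: during training, normalization statistics come from each ``ghost_batch_size`` chunk rather than from the full batch. A minimal forward-pass sketch assuming PyTorch; not the library's ``_GhostBatchNorm`` implementation:

```python
import math
import torch

def ghost_batchnorm_forward(bn: torch.nn.Module, x: torch.Tensor,
                            ghost_batch_size: int = 32) -> torch.Tensor:
    if ghost_batch_size > x.shape[0]:
        # Mirrors the ValueError described in the docstring above.
        raise ValueError("ghost_batch_size exceeds the number of samples in the batch")
    n_chunks = math.ceil(x.shape[0] / ghost_batch_size)
    # Each chunk is normalized using its own batch statistics.
    return torch.cat([bn(chunk) for chunk in x.chunk(n_chunks, dim=0)], dim=0)
```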
""" - def __init__(self, base_batchnorm: _TORCH_BATCHNORM_BASE_CLASS, ghost_batch_size: int = _DEFAULT_GHOST_BATCH_SIZE): + def __init__(self, base_batchnorm: _TORCH_BATCHNORM_BASE_CLASS, ghost_batch_size: int = 32): super().__init__() self.ghost_batch_size = ghost_batch_size self.batchnorm = base_batchnorm diff --git a/composer/algorithms/label_smoothing/label_smoothing.py b/composer/algorithms/label_smoothing/label_smoothing.py index bd073d2bf9..7fba264d67 100644 --- a/composer/algorithms/label_smoothing/label_smoothing.py +++ b/composer/algorithms/label_smoothing/label_smoothing.py @@ -32,11 +32,11 @@ def smooth_labels(logits: Tensor, targets: Tensor, alpha: float): ) Args: - logits: Output of the model. Tensor of shape (N, C, d1, ..., dn) for + logits (Tensor): Output of the model. Tensor of shape (N, C, d1, ..., dn) for N examples and C classes, and d1, ..., dn extra dimensions. - targets: Tensor of shape (N) containing integers 0 <= i <= C-1 + targets (Tensor): Tensor of shape (N) containing integers 0 <= i <= C-1 specifying the target labels for each example. - alpha: Strength of the label smoothing, in [0, 1]. ``alpha=0`` + alpha (float): Strength of the label smoothing, in [0, 1]. ``alpha=0`` means no label smoothing, and ``alpha=1`` means maximal smoothing (targets are ignored). """ @@ -71,7 +71,7 @@ class LabelSmoothing(Algorithm): ) Args: - alpha: Strength of the label smoothing, in [0, 1]. ``alpha=0`` + alpha (float): Strength of the label smoothing, in [0, 1]. ``alpha=0`` means no label smoothing, and ``alpha=1`` means maximal smoothing (targets are ignored). """ diff --git a/composer/algorithms/layer_freezing/layer_freezing.py b/composer/algorithms/layer_freezing/layer_freezing.py index 082afac739..73dc0b3c62 100644 --- a/composer/algorithms/layer_freezing/layer_freezing.py +++ b/composer/algorithms/layer_freezing/layer_freezing.py @@ -22,8 +22,8 @@ def freeze_layers( model: Model, optimizers: Optimizers, current_duration: float, - freeze_start: float, - freeze_level: float, + freeze_start: float = 0.5, + freeze_level: float = 1.0, ) -> Tuple[int, float]: """Progressively freeze the layers of the network in-place during training, starting with the earlier layers. @@ -44,8 +44,10 @@ def freeze_layers( model (Model): The model being trained. optimizers (Optimizers): The optimizers used during training. current_duration (float): The fraction on [0; 1) of the training process complete. - freeze_start (float): The fraction of the training process on [0; 1) to run before freezing begins. - freeze_level (float): The maximum fraction of layers on [0; 1) to freeze. + freeze_start (float, optional): The fraction of the training process on [0; 1) to run + before freezing begins. Default: ``0.5``. + freeze_level (float, optional): The maximum fraction of layers on [0; 1) to freeze. + Default: ``1.0``. Return: (int, float): The number of layers frozen, and the percentage of the total model frozen. @@ -105,8 +107,8 @@ class LayerFreezing(Algorithm): ) Args: - freeze_start (float): The fraction of training to run before freezing begins. - freeze_level (float): The maximum fraction of layers to freeze. + freeze_start (float): The fraction of training to run before freezing begins. Default: ``0.5``. + freeze_level (float): The maximum fraction of layers to freeze. Default: ``1.0``. 
""" def __init__(self, freeze_start: float = 0.5, freeze_level: float = 1.0): diff --git a/composer/algorithms/mixup/mixup.py b/composer/algorithms/mixup/mixup.py index 1a6d102891..c4ed4e3798 100644 --- a/composer/algorithms/mixup/mixup.py +++ b/composer/algorithms/mixup/mixup.py @@ -48,24 +48,24 @@ def mixup_batch(x: Tensor, ) Args: - x: input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn + x (Tensor): input tensor of shape (B, d1, d2, ..., dn), B is batch size, d1-dn are feature dimensions. - y: target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn + y (Tensor): target tensor of shape (B, f1, f2, ..., fm), B is batch size, f1-fn are possible target dimensions. - interpolation_lambda: coefficient used to interpolate between the + n_classes: total number of classes. + interpolation_lambda (float, optional): coefficient used to interpolate between the two examples. If provided, must be in ``[0, 1]``. If ``None``, - value is drawn from a ``Beta(alpha, alpha)`` distribution. - alpha: parameter for the beta distribution over the + value is drawn from a ``Beta(alpha, alpha)`` distribution. Default: ``None``. + alpha (float, optional): parameter for the beta distribution over the ``interpolation_lambda``. Only used if ``interpolation_lambda`` - is not provided. - n_classes: total number of classes. - indices: Permutation of the batch indices `1..B`. Used + is not provided. Default: ``0.2``. + indices (Tensor, optional): Permutation of the batch indices `1..B`. Used for permuting without randomness. Returns: - x_mix: batch of inputs after mixup has been applied - y_mix: labels after mixup has been applied - perm: the permutation used + x_mix (Tensor): batch of inputs after mixup has been applied + y_mix (Tensor): labels after mixup has been applied + perm (Tuple[Tensor]): the permutation used """ if interpolation_lambda is None: interpolation_lambda = _gen_interpolation_lambda(alpha) @@ -118,11 +118,11 @@ class MixUp(Algorithm): Args: num_classes (int): the number of classes in the task labels. - alpha (float): the psuedocount for the Beta distribution used to sample + alpha (float, optional): the psuedocount for the Beta distribution used to sample interpolation parameters. As ``alpha`` grows, the two samples in each pair tend to be weighted more equally. As ``alpha`` approaches 0 from above, the combination approaches only using - one element of the pair. + one element of the pair. Default: ``0.2``. """ def __init__(self, num_classes: int, alpha: float = 0.2): diff --git a/composer/algorithms/progressive_resizing/progressive_resizing.py b/composer/algorithms/progressive_resizing/progressive_resizing.py index cb03d0397c..da129e4ee2 100755 --- a/composer/algorithms/progressive_resizing/progressive_resizing.py +++ b/composer/algorithms/progressive_resizing/progressive_resizing.py @@ -46,16 +46,16 @@ def resize_batch(X: torch.Tensor, ) Args: - X: input tensor of shape (N, C, H, W). Resizing will be done along + X (Tensor): input tensor of shape (N, C, H, W). Resizing will be done along dimensions H and W using the constant factor ``scale_factor``. - y: output tensor of shape (N, H, W) or (N, C, H, W) that will also be resized if + y (Tensor): output tensor of shape (N, H, W) or (N, C, H, W) that will also be resized if ``resize_targets`` is ``True``, - scale_factor: scaling coefficient for the height and width of the + scale_factor (float): scaling coefficient for the height and width of the input/output tensor. 1.0 keeps the original size. 
- mode: type of scaling to perform. Value must be one of ``'crop'`` or + mode (str, optional): type of scaling to perform. Value must be one of ``'crop'`` or ``'resize'``. ``'crop'`` performs a random crop, whereas ``'resize'`` - performs a nearest neighbor interpolation. - resize_targets: whether to resize the targets, ``y``, as well + performs a nearest neighbor interpolation. Default: ``"resize"``. + resize_targets (bool, optional): whether to resize the targets, ``y``. Default: ``False``. Returns: X_sized: resized input tensor of shape ``(N, C, H * scale_factor, W * scale_factor)``. @@ -141,14 +141,14 @@ class ProgressiveResizing(Algorithm): ) Args: - mode: Type of scaling to perform. Value must be one of ``'crop'`` or ``'resize'``. + mode (str, optional): Type of scaling to perform. Value must be one of ``'crop'`` or ``'resize'``. ``'crop'`` performs a random crop, whereas ``'resize'`` performs a bilinear - interpolation. - initial_scale: Initial scale factor used to shrink the inputs. Must be a - value in between 0 and 1. - finetune_fraction: Fraction of training to reserve for finetuning on the - full-sized inputs. Must be a value in between 0 and 1. - resize_targets: If True, resize targets also. + interpolation. Default: ``"resize"``. + initial_scale (float, optional): Initial scale factor used to shrink the inputs. Must be a + value between 0 and 1. Default: ``0.5``. + finetune_fraction (float, optional): Fraction of training to reserve for finetuning on the + full-sized inputs. Must be a value between 0 and 1. Default: ``0.2``. + resize_targets (bool, optional): If True, also resize the targets. Default: ``False``. """ def __init__(self, diff --git a/composer/algorithms/randaugment/__init__.py b/composer/algorithms/randaugment/__init__.py index 122c6e766b..4b6f06e075 100755 --- a/composer/algorithms/randaugment/__init__.py +++ b/composer/algorithms/randaugment/__init__.py @@ -2,7 +2,7 @@ """Randomly applies a sequence of image data augmentations (`Cubuk et al, 2019 `_) to an image. See -:class:`~composer.algorithms.randaugment.randaugment.RandAugment` or the :doc:`Method Card +:class:`.RandAugment` or the :doc:`Method Card ` for details. """ diff --git a/composer/algorithms/randaugment/randaugment.py b/composer/algorithms/randaugment/randaugment.py index 0485bd28ce..4f258081f8 100755 --- a/composer/algorithms/randaugment/randaugment.py +++ b/composer/algorithms/randaugment/randaugment.py @@ -28,10 +28,10 @@ def randaugment_image(img: ImgT, augmentation_set: List = augmentation_sets["all"]) -> ImgT: """Randomly applies a sequence of image data augmentations (`Cubuk et al, 2019 `_) to an image or batch of - images. See :class:`~composer.algorithms.randaugment.randaugment.RandAugment` or the + images. See :class:`.RandAugment` or the :doc:`Method Card ` for details. This function only acts on a single image (or batch of images) per call and is unlikely to be used in a training - loop. Use :class:`~composer.algorithms.randaugment.randaugment.RandAugmentTransform` + loop. Use :class:`.RandAugmentTransform` to use RandAugment as part of a :class:`torchvision.datasets.VisionDataset`\\'s ``transform``. @@ -51,10 +51,10 @@ def randaugment_image(img: ImgT, Args: img (PIL.Image): Image or batch of images to be RandAugmented. - severity (int, optional): See :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. - depth (int, optional): See :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. + severity (int, optional): See :class:`.RandAugment`.
+ depth (int, optional): See :class:`.RandAugment`. augmentation_set (str, optional): See - :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. + :class:`.RandAugment`. Returns: PIL.Image: RandAugmented image. @@ -72,9 +72,9 @@ def _randaugment_pil_image(img: PillowImage, severity: int, depth: int, augmenta class RandAugmentTransform(torch.nn.Module): - """Wraps :func:`~composer.algorithms.randaugment.randaugment.randaugment_image` in a + """Wraps :func:`.randaugment_image` in a ``torchvision``-compatible transform. See - :class:`~composer.algorithms.randaugment.randaugment.RandAugment` or the :doc:`Method + :class:`.RandAugment` or the :doc:`Method Card ` for more details. Example: @@ -92,10 +92,10 @@ class RandAugmentTransform(torch.nn.Module): transformed_image = composed(image) Args: - severity (int, optional): See :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. - depth (int, optional): See :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. + severity (int, optional): See :class:`.RandAugment`. + depth (int, optional): See :class:`.RandAugment`. augmentation_set (str, optional): See - :class:`~composer.algorithms.randaugment.randaugment.RandAugment`. + :class:`.RandAugment`. """ def __init__(self, severity: int = 9, depth: int = 2, augmentation_set: str = "all"): @@ -149,9 +149,9 @@ class RandAugment(Algorithm): Args: severity (int, optional): Severity of augmentation operators (between 1 to 10). M - in the original paper. Default = ``9``. + in the original paper. Default: ``9``. depth (int, optional): Depth of augmentation chain. N in the original paper - Default = ``2``. + Default: ``2``. augmentation_set (str, optional): Must be one of the following options: * ``"augmentations_all"`` @@ -173,7 +173,7 @@ class RandAugment(Algorithm): "sharpness", and "brightness" that account for diverging effects around 0 (or 1). - Default = ``"all"``. + Default: ``"all"``. """ def __init__(self, severity: int = 9, depth: int = 2, augmentation_set: str = "all"): diff --git a/composer/algorithms/sam/sam.py b/composer/algorithms/sam/sam.py index 7fcea881ae..e072c225ae 100644 --- a/composer/algorithms/sam/sam.py +++ b/composer/algorithms/sam/sam.py @@ -94,11 +94,13 @@ class SAM(Algorithm): existing optimizer with a :class:`SAMOptimizer`. Args: - rho: The neighborhood size parameter of SAM. Must be greater than 0. - epsilon: A small value added to the gradient norm for numerical stability. - interval: SAM will run once per ``interval`` steps. A value of 1 will + rho (float, optional): The neighborhood size parameter of SAM. Must be greater than 0. + Default: ``0.05``. + epsilon (float, optional): A small value added to the gradient norm for numerical stability. + Default: ``1e-12``. + interval (int, optional): SAM will run once per ``interval`` steps. A value of 1 will cause SAM to run every step. Steps on which SAM runs take - roughly twice as much time to complete. + roughly twice as much time to complete. Default: ``1``. """ def __init__( diff --git a/composer/algorithms/scale_schedule/scale_schedule.py b/composer/algorithms/scale_schedule/scale_schedule.py index fb2f2d2d68..e4223ac9db 100644 --- a/composer/algorithms/scale_schedule/scale_schedule.py +++ b/composer/algorithms/scale_schedule/scale_schedule.py @@ -18,9 +18,9 @@ class ScaleSchedule(Algorithm): accordingly. Args: - ratio: The factor by which to scale the duration of the schedule. E.g., 0.5 + ratio (float, optional): The factor by which to scale the duration of the schedule. 
E.g., 0.5 makes the schedule take half as long and 2.0 makes it - take twice as long. default: 1.0. + take twice as long. Default: ``1.0``. See also: :func:`composer.trainer.scale_schedule.scale_scheduler` diff --git a/composer/algorithms/selective_backprop/selective_backprop.py b/composer/algorithms/selective_backprop/selective_backprop.py index 6269997921..7a9ab38fb5 100644 --- a/composer/algorithms/selective_backprop/selective_backprop.py +++ b/composer/algorithms/selective_backprop/selective_backprop.py @@ -34,9 +34,12 @@ def should_selective_backprop( Args: current_duration (float): The elapsed training duration, on [0.0; 1.0) batch_idx (int): The current batch within the epoch - start (float): The duration at which selective backprop should be enabled - end (float): The duration at which selective backprop should be disabled - interrupt (int): The number of batches between vanilla minibatch gradient updates + start (float, optional): The duration at which selective backprop should be enabled. + Default: ``0.5``. + end (float, optional): The duration at which selective backprop should be disabled. + Default: ``0.9``. + interrupt (int, optional): The number of batches between vanilla minibatch gradient updates. + Default: ``2``. Returns: bool: If selective backprop should be performed on this batch. @@ -75,7 +78,7 @@ def select_using_loss(X: torch.Tensor, to ensure that per-sample losses are returned. keep: Fraction of examples in the batch to keep scale_factor: Multiplier between 0 and 1 for spatial size. Downsampling - requires the input tensor to be at least 3D. + requires the input tensor to be at least 3D. Default: ``1``. Returns: (torch.Tensor, torch.Tensor): The pruned batch of inputs and targets @@ -159,12 +162,16 @@ class SelectiveBackprop(Algorithm): alternate with vanilla minibatch steps. Args: - start: SB interval start as fraction of training duration - end: SB interval end as fraction of training duration - keep: fraction of minibatch to select and keep for gradient computation - scale_factor: scale for downsampling input for selection forward pass - interrupt: interrupt SB with a vanilla minibatch step every - ``interrupt`` batches + start (float, optional): SB interval start as fraction of training duration. + Default: ``0.5``. + end (float, optional): SB interval end as fraction of training duration. + Default: ``0.9``. + keep (float, optional): fraction of minibatch to select and keep for gradient computation. + Default: ``0.5``. + scale_factor (float, optional): scale for downsampling input for selection forward pass. + Default: ``0.5``. + interrupt (int, optional): interrupt SB with a vanilla minibatch step every + ``interrupt`` batches. Default: ``2``. """ def __init__(self, diff --git a/composer/algorithms/seq_length_warmup/seq_length_warmup.py b/composer/algorithms/seq_length_warmup/seq_length_warmup.py index b0621128b6..3b77bf31e1 100644 --- a/composer/algorithms/seq_length_warmup/seq_length_warmup.py +++ b/composer/algorithms/seq_length_warmup/seq_length_warmup.py @@ -39,7 +39,7 @@ def set_batch_sequence_length(batch: Dict[str, Tensor], curr_seq_len: int, trunc batch (Dict[str, Tensor]): The input batch to the model, must be a dictionary. curr_seq_length (int): The desired sequence length to apply. truncate (bool, optional): Truncate sequences early, or reshape tensors to create - new examples out of the extra tokens. Default = ``True``. + new examples out of the extra tokens. Default: ``True``.
Returns: Dict[str, Tensor]: a Mapping of input tensors to the model, diff --git a/composer/algorithms/squeeze_excite/squeeze_excite.py b/composer/algorithms/squeeze_excite/squeeze_excite.py index 7b902540e0..6186e40299 100644 --- a/composer/algorithms/squeeze_excite/squeeze_excite.py +++ b/composer/algorithms/squeeze_excite/squeeze_excite.py @@ -31,10 +31,14 @@ def apply_squeeze_excite( like in a convolutional layer. Args: + model (torch.nn.Module): The module to apply squeeze excite replacement to. latent_channels (float, optional): Dimensionality of the hidden layer within the added MLP. If less than 1, interpreted as a fraction of the number of output channels in the :class:`~torch.nn.Conv2d` immediately - preceding each Squeeze-and-Excitation block. + preceding each Squeeze-and-Excitation block. Default: ``64``. + min_channels (int, optional): An SE block is added after a :class:`~torch.nn.Conv2d` + module ``conv`` only if ``min(conv.in_channels, conv.out_channels) >= min_channels``. + Default: ``128``. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so they will optimize @@ -79,7 +83,7 @@ class SqueezeExcite2d(torch.nn.Module): Args: num_features (int): Number of features or channels in the input latent_channels (float, optional): Dimensionality of the hidden layer within the added - MLP. If less than 1, interpreted as a fraction of ``num_features``. + MLP. If less than 1, interpreted as a fraction of ``num_features``. Default: ``0.125``. """ def __init__(self, num_features: int, latent_channels: float = .125): @@ -122,18 +126,18 @@ class SqueezeExcite(Algorithm): Runs on :attr:`~composer.core.event.Event.INIT`. See :class:`SqueezeExcite2d` for more information. Args: - latent_channels: Dimensionality of the hidden layer within the added + latent_channels (float, optional): Dimensionality of the hidden layer within the added MLP. If less than 1, interpreted as a fraction of the number of output channels in the :class:`~torch.nn.Conv2d` immediately - preceding each Squeeze-and-Excitation block. - min_channels: An SE block is added after a :class:`~torch.nn.Conv2d` + preceding each Squeeze-and-Excitation block. Default: ``64``. + min_channels (int, optional): An SE block is added after a :class:`~torch.nn.Conv2d` module ``conv`` only if ``min(conv.in_channels, conv.out_channels) >= min_channels``. For models that reduce spatial size and increase channel count deeper in the network, this parameter can be used to only add SE blocks deeper in the network. This may be desirable because SE blocks add less overhead when their inputs have - smaller spatial size. + smaller spatial size. Default: ``128``. """ def __init__( diff --git a/composer/algorithms/stochastic_depth/stochastic_depth.py b/composer/algorithms/stochastic_depth/stochastic_depth.py index 7db55f5241..a959ad3d08 100755 --- a/composer/algorithms/stochastic_depth/stochastic_depth.py +++ b/composer/algorithms/stochastic_depth/stochastic_depth.py @@ -58,18 +58,18 @@ def apply_stochastic_depth(model: torch.nn.Module, Currently, only :class:`torchvision.models.resnet.Bottleneck` is supported. stochastic_method (str, optional): The version of stochastic depth to use. ``"block"`` randomly drops blocks during training. ``"sample"`` randomly drops - samples within a block during training. + samples within a block during training. Default: ``"block"``.
drop_rate (float, optional): The base probability of dropping a layer or sample. Must be - between 0.0 and 1.0. + between 0.0 and 1.0. Default: ``0.2``. drop_distribution (str, optional): How ``drop_rate`` is distributed across layers. Value must be one of ``"uniform"`` or ``"linear"``. ``"uniform"`` assigns the same ``drop_rate`` across all layers. ``"linear"`` linearly increases the drop rate across layer depth - starting with 0 drop rate and ending with ``drop_rate``. + starting with 0 drop rate and ending with ``drop_rate``. Default: ``"linear"``. use_same_gpu_seed (bool, optional): Set to ``True`` to have the same layers dropped across GPUs when using multi-GPU training. Set to ``False`` to have each GPU drop a different set of layers. Only used - with ``"block"`` stochastic method. + with ``"block"`` stochastic method. Default: ``True``. optimizers (Optimizers, optional): Existing optimizers bound to ``model.parameters()``. All optimizers that have already been constructed with ``model.parameters()`` must be specified here so they will optimize @@ -138,21 +138,21 @@ class StochasticDepth(Algorithm): Currently, only :class:`torchvision.models.resnet.Bottleneck` is supported. stochastic_method (str, optional): The version of stochastic depth to use. ``"block"`` randomly drops blocks during training. ``"sample"`` randomly drops - samples within a block during training. + samples within a block during training. Default: ``"block"``. drop_rate (float, optional): The base probability of dropping a layer or sample. Must be - between 0.0 and 1.0. + between 0.0 and 1.0. Default: ``0.2``. drop_distribution (str, optional): How ``drop_rate`` is distributed across layers. Value must be one of ``"uniform"`` or ``"linear"``. ``"uniform"`` assigns the same ``drop_rate`` across all layers. ``"linear"`` linearly increases the drop rate across layer depth - starting with 0 drop rate and ending with ``drop_rate``. + starting with 0 drop rate and ending with ``drop_rate``. Default: ``"linear"``. drop_warmup (str | Time | float, optional): A :class:`Time` object, time-string, or float on [0.0; 1.0] representing the fraction of the training duration to linearly - increase the drop probability to `linear_drop_rate`. (default: ``0.0``) + increase the drop probability to ``linear_drop_rate``. Default: ``0.0``. use_same_gpu_seed (bool, optional): Set to ``True`` to have the same layers dropped across GPUs when using multi-GPU training. Set to ``False`` to have each GPU drop a different set of layers. Only used - with ``"block"`` stochastic method. + with ``"block"`` stochastic method. Default: ``True``. """ def __init__(self,
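A small sketch of the two ``drop_distribution`` options documented above, assuming blocks are indexed ``0 .. n_blocks - 1`` from input to output; illustrative only, not the library's code:

```python
def block_drop_rate(block_idx: int, n_blocks: int,
                    drop_rate: float = 0.2,
                    drop_distribution: str = "linear") -> float:
    if drop_distribution == "uniform":
        # Every block shares the same base drop rate.
        return drop_rate
    # "linear": ramp from 0 at the first block to drop_rate at the last.
    return drop_rate * block_idx / max(n_blocks - 1, 1)
```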