From 927285b1b03042cdd28c30a619175272e7be2b04 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Tue, 7 Jun 2022 12:49:39 -0700 Subject: [PATCH 01/16] recipes --- composer/yamls/recipes/resnet50_medium.yaml | 94 ++++++++++++++++++ composer/yamls/recipes/resnet50_mild.yaml | 87 +++++++++++++++++ composer/yamls/recipes/resnet50_spicy.yaml | 103 ++++++++++++++++++++ 3 files changed, 284 insertions(+) create mode 100644 composer/yamls/recipes/resnet50_medium.yaml create mode 100644 composer/yamls/recipes/resnet50_mild.yaml create mode 100644 composer/yamls/recipes/resnet50_spicy.yaml diff --git a/composer/yamls/recipes/resnet50_medium.yaml b/composer/yamls/recipes/resnet50_medium.yaml new file mode 100644 index 0000000000..646fa5b339 --- /dev/null +++ b/composer/yamls/recipes/resnet50_medium.yaml @@ -0,0 +1,94 @@ +algorithms: + blurpool: + blur_first: true + min_channels: 16 + replace_convs: true + replace_maxpools: true + channels_last: {} + ema: + half_life: 100ba + train_with_ema_weights: false + update_interval: 20ba + label_smoothing: + smoothing: 0.1 + mixup: + alpha: 0.2 + interpolate_loss: false + progressive_resizing: + delay_fraction: 0.4 + finetune_fraction: 0.2 + initial_scale: 0.5 + mode: resize + resize_targets: false + size_increment: 4 + sam: + epsilon: 1.0e-12 + interval: 10 + rho: 0.5 +callbacks: + lr_monitor: {} + speed_monitor: + window_size: 100 +dataloader: + num_workers: 8 + persistent_workers: true + pin_memory: true + prefetch_factor: 2 + timeout: 0.0 +device: + gpu: {} +eval_batch_size: 2048 +eval_interval: 1 +loggers: + progress_bar: + console_log_level: EPOCH + stream: stderr +model: + resnet: + initializers: + - KAIMING_NORMAL + - BN_UNIFORM + - LINEAR_LOG_CONSTANT_BIAS + loss_name: binary_cross_entropy_with_logits + model_name: resnet50 +optimizer: + decoupled_sgdw: + dampening: 0.0 + lr: 2.048 + momentum: 0.875 + nesterov: false + weight_decay: 0.0005 +precision: AMP +max_duration: 90ep +scale_schedule_ratio: # Fraction of 90 epochs to 
train for. 0.75-2.6 for medium recipe +schedulers: + cosine_decay_with_warmup: + alpha_f: 0.0 + t_max: 1dur + t_warmup: 8ep +seed: 42 +train_batch_size: 2048 +train_dataset: + imagenet: + crop_size: 176 + # datadir: not needed because we're using FFCV + drop_last: true + ffcv_dest: # imagenet_train.ffcv + ffcv_dir: # /path/to/ffcv/data/directory + ffcv_write_dataset: false + is_train: true + resize_size: -1 + shuffle: true + use_ffcv: true +val_dataset: + imagenet: + crop_size: 224 + # datadir: not needed because we're using FFCV + drop_last: false + ffcv_dest: # imagenet_val.ffcv + ffcv_dir: # /path/to/ffcv/data/directory + ffcv_write_dataset: false + is_train: false + resize_size: 232 + shuffle: false + use_ffcv: true \ No newline at end of file diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml new file mode 100644 index 0000000000..9e720c7dc1 --- /dev/null +++ b/composer/yamls/recipes/resnet50_mild.yaml @@ -0,0 +1,87 @@ +algorithms: + blurpool: + blur_first: true + min_channels: 16 + replace_convs: true + replace_maxpools: true + channels_last: {} + ema: + half_life: 100ba + train_with_ema_weights: false + update_interval: 20ba + label_smoothing: + smoothing: 0.08 + progressive_resizing: + delay_fraction: 0.4 + finetune_fraction: 0.2 + initial_scale: 0.5 + mode: resize + resize_targets: false + size_increment: 4 +callbacks: + lr_monitor: {} + speed_monitor: + window_size: 100 +dataloader: + num_workers: 8 + persistent_workers: true + pin_memory: true + prefetch_factor: 2 + timeout: 0.0 +device: + gpu: {} +eval_batch_size: 2048 +eval_interval: 1 +loggers: + progress_bar: + console_log_level: EPOCH + stream: stderr +model: + resnet: + initializers: + - KAIMING_NORMAL + - BN_UNIFORM + - LINEAR_LOG_CONSTANT_BIAS + loss_name: binary_cross_entropy_with_logits + model_name: resnet50 +optimizer: + decoupled_sgdw: + dampening: 0.0 + lr: 2.048 + momentum: 0.875 + nesterov: false + weight_decay: 0.0005 +precision: AMP 
+max_duration: 90ep +scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe +schedulers: + cosine_decay_with_warmup: + alpha_f: 0.0 + t_max: 1dur + t_warmup: 8ep +seed: 42 +train_batch_size: 2048 +train_dataset: + imagenet: + crop_size: 176 + # datadir: not needed because we're using FFCV + drop_last: true + ffcv_dest: # imagenet_train.ffcv + ffcv_dir: # /path/to/ffcv/data/directory + ffcv_write_dataset: false + is_train: true + resize_size: -1 + shuffle: true + use_ffcv: true +val_dataset: + imagenet: + crop_size: 224 + # datadir: not needed because we're using FFCV + drop_last: false + ffcv_dest: # imagenet_val.ffcv + ffcv_dir: # /path/to/ffcv/data/directory + ffcv_write_dataset: false + is_train: false + resize_size: 232 + shuffle: false + use_ffcv: true \ No newline at end of file diff --git a/composer/yamls/recipes/resnet50_spicy.yaml b/composer/yamls/recipes/resnet50_spicy.yaml new file mode 100644 index 0000000000..b8f20274e6 --- /dev/null +++ b/composer/yamls/recipes/resnet50_spicy.yaml @@ -0,0 +1,103 @@ +algorithms: + blurpool: + blur_first: true + min_channels: 16 + replace_convs: true + replace_maxpools: true + channels_last: {} + colout: + batch: true + p_col: 0.05 + p_row: 0.05 + resize_target: auto + ema: + half_life: 100ba + train_with_ema_weights: false + update_interval: 20ba + label_smoothing: + smoothing: 0.13 + mixup: + alpha: 0.25 + interpolate_loss: false + progressive_resizing: + delay_fraction: 0.2 + finetune_fraction: 0.2 + initial_scale: 0.6 + mode: resize + resize_targets: false + size_increment: 4 + randaugment: + augmentation_set: all + depth: 1 + severity: 10 + sam: + epsilon: 1.0e-12 + interval: 5 + rho: 0.5 + stochastic_depth: + drop_distribution: linear + drop_rate: 0.1 + drop_warmup: 0.0dur + stochastic_method: sample + target_layer_name: ResNetBottleneck + use_same_gpu_seed: false +callbacks: + lr_monitor: {} + speed_monitor: + window_size: 100 +dataloader: + num_workers: 8 + persistent_workers: true + 
pin_memory: true + prefetch_factor: 2 + timeout: 0.0 +device: + gpu: {} +loggers: + progress_bar: + console_log_level: EPOCH + stream: stderr +model: + resnet: + groups: 1 + initializers: + - KAIMING_NORMAL + - BN_UNIFORM + - LINEAR_LOG_CONSTANT_BIAS + loss_name: binary_cross_entropy_with_logits + model_name: resnet50 + num_classes: 1000 + pretrained: false + width_per_group: 64 +optimizer: + decoupled_sgdw: + dampening: 0.0 + lr: 2.048 + momentum: 0.875 + nesterov: false + weight_decay: 0.0003 +precision: AMP +max_duration: 90ep +scale_schedule_ratio: # Fraction of 90 epochs to train for. >2.6 for spicy recipe +schedulers: + cosine_decay_with_warmup: + alpha_f: 0.0 + t_max: 1dur + t_warmup: 8ep +seed: 42 +train_batch_size: 2048 +train_dataset: + imagenet: + crop_size: 176 + datadir: # /path/to/imagenet + drop_last: true + is_train: true +train_subset_num_batches: -1 +val_dataset: + imagenet: + crop_size: 224 + datadir: # /path/to/imagenet + drop_last: false + is_train: false + resize_size: 232 + shuffle: false \ No newline at end of file From 93ac0bececb5c2a02d994b2a9d87748239437aa1 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Tue, 7 Jun 2022 12:52:27 -0700 Subject: [PATCH 02/16] terminal newline --- composer/yamls/recipes/resnet50_medium.yaml | 2 +- composer/yamls/recipes/resnet50_mild.yaml | 2 +- composer/yamls/recipes/resnet50_spicy.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/composer/yamls/recipes/resnet50_medium.yaml b/composer/yamls/recipes/resnet50_medium.yaml index 646fa5b339..9a6285eae1 100644 --- a/composer/yamls/recipes/resnet50_medium.yaml +++ b/composer/yamls/recipes/resnet50_medium.yaml @@ -91,4 +91,4 @@ val_dataset: is_train: false resize_size: 232 shuffle: false - use_ffcv: true \ No newline at end of file + use_ffcv: true diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml index 9e720c7dc1..74a0dd48a0 100644 --- a/composer/yamls/recipes/resnet50_mild.yaml +++ 
b/composer/yamls/recipes/resnet50_mild.yaml @@ -84,4 +84,4 @@ val_dataset: is_train: false resize_size: 232 shuffle: false - use_ffcv: true \ No newline at end of file + use_ffcv: true diff --git a/composer/yamls/recipes/resnet50_spicy.yaml b/composer/yamls/recipes/resnet50_spicy.yaml index b8f20274e6..88d916f9a4 100644 --- a/composer/yamls/recipes/resnet50_spicy.yaml +++ b/composer/yamls/recipes/resnet50_spicy.yaml @@ -100,4 +100,4 @@ val_dataset: drop_last: false is_train: false resize_size: 232 - shuffle: false \ No newline at end of file + shuffle: false From 96d518b8cc590b6eafe43c4a9aad6cb568de288b Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Tue, 7 Jun 2022 13:18:53 -0700 Subject: [PATCH 03/16] precommit hooks --- composer/yamls/recipes/resnet50_mild.yaml | 2 +- composer/yamls/recipes/resnet50_spicy.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml index 74a0dd48a0..f74fcecc44 100644 --- a/composer/yamls/recipes/resnet50_mild.yaml +++ b/composer/yamls/recipes/resnet50_mild.yaml @@ -53,7 +53,7 @@ optimizer: weight_decay: 0.0005 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 diff --git a/composer/yamls/recipes/resnet50_spicy.yaml b/composer/yamls/recipes/resnet50_spicy.yaml index 88d916f9a4..725c9617eb 100644 --- a/composer/yamls/recipes/resnet50_spicy.yaml +++ b/composer/yamls/recipes/resnet50_spicy.yaml @@ -78,7 +78,7 @@ optimizer: weight_decay: 0.0003 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. >2.6 for spicy recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 
>2.6 for spicy recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 From 586a2a7ce8f34fd9c496e5ea258ee655fb2fac28 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Tue, 7 Jun 2022 13:29:34 -0700 Subject: [PATCH 04/16] yamls include ffcv data file names --- composer/yamls/recipes/resnet50_medium.yaml | 4 ++-- composer/yamls/recipes/resnet50_mild.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/composer/yamls/recipes/resnet50_medium.yaml b/composer/yamls/recipes/resnet50_medium.yaml index 9a6285eae1..e8cf4354fb 100644 --- a/composer/yamls/recipes/resnet50_medium.yaml +++ b/composer/yamls/recipes/resnet50_medium.yaml @@ -73,7 +73,7 @@ train_dataset: crop_size: 176 # datadir: not needed because we're using FFCV drop_last: true - ffcv_dest: # imagenet_train.ffcv + ffcv_dest: imagenet_train.ffcv ffcv_dir: # /path/to/ffcv/data/directory ffcv_write_dataset: false is_train: true @@ -85,7 +85,7 @@ val_dataset: crop_size: 224 # datadir: not needed because we're using FFCV drop_last: false - ffcv_dest: # imagenet_val.ffcv + ffcv_dest: imagenet_val.ffcv ffcv_dir: # /path/to/ffcv/data/directory ffcv_write_dataset: false is_train: false diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml index f74fcecc44..ae3b310954 100644 --- a/composer/yamls/recipes/resnet50_mild.yaml +++ b/composer/yamls/recipes/resnet50_mild.yaml @@ -66,7 +66,7 @@ train_dataset: crop_size: 176 # datadir: not needed because we're using FFCV drop_last: true - ffcv_dest: # imagenet_train.ffcv + ffcv_dest: imagenet_train.ffcv ffcv_dir: # /path/to/ffcv/data/directory ffcv_write_dataset: false is_train: true @@ -78,7 +78,7 @@ val_dataset: crop_size: 224 # datadir: not needed because we're using FFCV drop_last: false - ffcv_dest: # imagenet_val.ffcv + ffcv_dest: imagenet_val.ffcv ffcv_dir: # /path/to/ffcv/data/directory ffcv_write_dataset: false is_train: false From f313260ae56f78eee95206fdaaf4d38e09ab184d Mon Sep 17 00:00:00 2001 
From: Matthew Leavitt Date: Fri, 10 Jun 2022 16:47:11 -0700 Subject: [PATCH 05/16] dummy --- blorp.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 blorp.txt diff --git a/blorp.txt b/blorp.txt new file mode 100644 index 0000000000..9daeafb986 --- /dev/null +++ b/blorp.txt @@ -0,0 +1 @@ +test From 27418a3c65dca26d90ac09c6ae67cbd5d0202ccf Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Jun 2022 16:47:22 -0700 Subject: [PATCH 06/16] dummy --- blorp.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 blorp.txt diff --git a/blorp.txt b/blorp.txt deleted file mode 100644 index 9daeafb986..0000000000 --- a/blorp.txt +++ /dev/null @@ -1 +0,0 @@ -test From dd20f8e2f1f91e1b5dbcad82330fb31fb99c5428 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Thu, 30 Jun 2022 12:00:23 -0700 Subject: [PATCH 07/16] rebase --- composer/yamls/recipes/resnet50_mild.yaml | 2 +- composer/yamls/recipes/resnet50_spicy.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml index ae3b310954..f87592f28f 100644 --- a/composer/yamls/recipes/resnet50_mild.yaml +++ b/composer/yamls/recipes/resnet50_mild.yaml @@ -53,7 +53,7 @@ optimizer: weight_decay: 0.0005 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 diff --git a/composer/yamls/recipes/resnet50_spicy.yaml b/composer/yamls/recipes/resnet50_spicy.yaml index 725c9617eb..88d916f9a4 100644 --- a/composer/yamls/recipes/resnet50_spicy.yaml +++ b/composer/yamls/recipes/resnet50_spicy.yaml @@ -78,7 +78,7 @@ optimizer: weight_decay: 0.0003 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. >2.6 for spicy recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 
>2.6 for spicy recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 From a3d4bda5a5ba2b3562743c7f6ec4284c936bef7a Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Tue, 7 Jun 2022 13:18:53 -0700 Subject: [PATCH 08/16] precommit hooks --- composer/yamls/recipes/resnet50_mild.yaml | 2 +- composer/yamls/recipes/resnet50_spicy.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/yamls/recipes/resnet50_mild.yaml b/composer/yamls/recipes/resnet50_mild.yaml index f87592f28f..ae3b310954 100644 --- a/composer/yamls/recipes/resnet50_mild.yaml +++ b/composer/yamls/recipes/resnet50_mild.yaml @@ -53,7 +53,7 @@ optimizer: weight_decay: 0.0005 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 0-0.75 for mild recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 diff --git a/composer/yamls/recipes/resnet50_spicy.yaml b/composer/yamls/recipes/resnet50_spicy.yaml index 88d916f9a4..725c9617eb 100644 --- a/composer/yamls/recipes/resnet50_spicy.yaml +++ b/composer/yamls/recipes/resnet50_spicy.yaml @@ -78,7 +78,7 @@ optimizer: weight_decay: 0.0003 precision: AMP max_duration: 90ep -scale_schedule_ratio: # Fraction of 90 epochs to train for. >2.6 for spicy recipe +scale_schedule_ratio: # Fraction of 90 epochs to train for. 
>2.6 for spicy recipe schedulers: cosine_decay_with_warmup: alpha_f: 0.0 From cf0dabf1d29ad0058c41a7add8742ee9ac4fb313 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Jun 2022 16:47:11 -0700 Subject: [PATCH 09/16] dummy --- blorp.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 blorp.txt diff --git a/blorp.txt b/blorp.txt new file mode 100644 index 0000000000..9daeafb986 --- /dev/null +++ b/blorp.txt @@ -0,0 +1 @@ +test From efcdce45dccaf645b1068c7a5c57c527ee646eb0 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Jun 2022 16:47:22 -0700 Subject: [PATCH 10/16] dummy --- blorp.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 blorp.txt diff --git a/blorp.txt b/blorp.txt deleted file mode 100644 index 9daeafb986..0000000000 --- a/blorp.txt +++ /dev/null @@ -1 +0,0 @@ -test From 5928f83bcebc08dc8ad6091cb024f23a7a1201bd Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Mon, 31 Oct 2022 14:34:04 -0700 Subject: [PATCH 11/16] fixed algorithms documentation --- docs/source/trainer/algorithms.rst | 215 +++++++++++++++-------------- 1 file changed, 108 insertions(+), 107 deletions(-) diff --git a/docs/source/trainer/algorithms.rst b/docs/source/trainer/algorithms.rst index 4ad2352625..a82cf11f42 100644 --- a/docs/source/trainer/algorithms.rst +++ b/docs/source/trainer/algorithms.rst @@ -157,110 +157,111 @@ Two-way callbacks The way our algorithms insert themselves in our trainer is based on the two-way callbacks system developed by (`Howard et al, 2020 `__). Algorithms interact with the -training loop at various :class:`.Events` and effect their changes by modifing the trainer :class:`.State`. - -.. `Events` denote locations inside the training procedure where algorithms can be run. In pseudocode, -.. Composer’s `events` look as follows: - -.. ```python -.. EVENT.INIT -.. state.model = model() -.. state.train_dataloader = train_dataloader() -.. state.optimizers = optimizers() -.. load_checkpoint() -.. EVENT.AFTER_LOAD -.. 
EVENT.FIT_START -.. for epoch in epochs: -.. EVENT.EPOCH_START -.. for batch in state.train_dataloader: -.. EVENT.AFTER_DATALOADER -.. EVENT.BATCH_START -.. prepare_batch_for_training() -.. EVENT.BEFORE_TRAIN_BATCH - -.. EVENT.BEFORE_FORWARD -.. forward_pass() -.. EVENT.AFTER_FORWARD - -.. EVENT.BEFORE_LOSS -.. compute_loss() -.. EVENT.AFTER_LOSS - -.. EVENT.BEFORE_BACKWARD -.. backward_pass() -.. EVENT.AFTER_BACKWARD - -.. EVENT.AFTER_TRAIN_BATCH -.. optimizers.step() -.. EVENT.BATCH_END -.. EVENT.EPOCH_END -.. ``` - -.. Complete definitions of these events can be found [here](https://github.com/mosaicml/composer/blob/dev/composer/core/event.py). Some events have a `before` and `after` flavor. These events differ in the order that algorithms are run. For example, on `EVENT.BEFORE_X`, algorithms passed to the trainer in order `[A, B, C]` are also run in order `[A, B,C]`. On `EVENT.AFTER_X`, algorithms passed to the trainer in order `[A, B, C]` are run in order `[C, B, A]` . This allows algorithms to clean undo their effects on state if necessary. - -.. Composer’s `state` tracks relevant quantities for the training procedure. The code for `state` can be found [here](https://github.com/mosaicml/composer/blob/dev/composer/core/state.py). Algorithms can modify state, and therefore modify the training procedure. - -.. To implement a custom algorithm, one needs to create a class that inherits from Composer’s `Algorithm` class, and implements a `match` methods that specifies which event(s) the algorithm should run on, and an `apply` function that specifies how the custom algorithm should modify quantities in `state`. - -.. The `match` method simply takes `state` and the current `event` as an argument, determines whether or not the algorithm should run, and returns true if it should, false otherwise. In code, a simple `match` might look like this: - -.. ```python -.. def match(self, event, state): -.. return event in [Event.AFTER_DATALOADER, Event.AFTER_FORWARD] -.. 
``` - -.. This will cause the algorithm to run on the `AFTER_DATALOADER` and `AFTER_FORWARD` events. Note that a given algorithm might run on multiple events. - -.. The `apply` method also takes `state` and the current `event` as arguments. Based on this information, `apply` carries out the appropriate algorithm logic, and modifies `state` with the changes necessary. In code, an `apply` might look like this: - -.. ```python -.. def apply(self, event, state, logger): -.. if event == Event.AFTER_DATALOADER: -.. state.batch = process_inputs(state.batch) -.. if event == Event.AFTER_FORWARD: -.. state.output = process_outputs(state.outputs) -.. ``` - -.. Note that different logic can be used for different events. - -.. Packaging this all together into a class gives the object that Composer can run: - -.. ```python -.. from composer.core import Algoritm, Event - -.. class MyAlgorithm(Algorithm): -.. def __init__(self, hparam1=1): -.. self.hparam1 = hparam1 - -.. def match(self, event, state): -.. return event in [Event.AFTER_DATALOADER, Event.AFTER_FORWARD] - -.. def apply(self, event, state, logger): -.. if event == Event.AFTER_DATALOADER: -.. state.batch = process_inputs(state.batch, self.hparam1) -.. if event == Event.AFTER_FORWARD: -.. state.output = process_outputs(state.outputs) -.. ``` - -.. Using this in training can be done the same way as with Composer’s native algorithms. - -.. ```python -.. from composer import Trainer -.. from composer.algorithms.blurpool import BlurPool -.. from composer.algorithms.channels_last import ChannelsLast - -.. channels_last = ChannelsLast() -.. blurpool = BlurPool(replace_convs=True, -.. replace_maxpools=True, -.. blur_first=True) -.. custom_algorithm = MyAlgorithm(hparam1=1) - -.. trainer = Trainer(model=model, -.. train_dataloader=train_dataloader, -.. eval_dataloader=test_dataloader, -.. max_duration='90ep', -.. device='gpu', -.. algorithms=[channels_last, blurpool, custom_algorithm], -.. eval_interval="0ep", -.. seed=42) -.. 
``` +training loop at various :class:`Events <.Event>` and effect their changes by modifying the trainer :class:`.State`. + +`Events` denote locations inside the training procedure where algorithms can be run. In pseudocode, +Composer’s `events` look as follows: + +.. code-block:: python + + EVENT.INIT + state.model = model() + state.train_dataloader = train_dataloader() + state.optimizers = optimizers() + load_checkpoint() + EVENT.AFTER_LOAD + EVENT.FIT_START + for epoch in epochs: + EVENT.EPOCH_START + for batch in state.train_dataloader: + EVENT.AFTER_DATALOADER + EVENT.BATCH_START + prepare_batch_for_training() + EVENT.BEFORE_TRAIN_BATCH + + EVENT.BEFORE_FORWARD + forward_pass() + EVENT.AFTER_FORWARD + + EVENT.BEFORE_LOSS + compute_loss() + EVENT.AFTER_LOSS + + EVENT.BEFORE_BACKWARD + backward_pass() + EVENT.AFTER_BACKWARD + + EVENT.AFTER_TRAIN_BATCH + optimizers.step() + EVENT.BATCH_END + EVENT.EPOCH_END + + +Complete definitions of these events can be found `here <https://github.com/mosaicml/composer/blob/dev/composer/core/event.py>`__. Some events have a `before` and `after` flavor. These events differ in the order that algorithms are run. For example, on `EVENT.BEFORE_X`, algorithms passed to the trainer in order `[A, B, C]` are also run in order `[A, B, C]`. On `EVENT.AFTER_X`, algorithms passed to the trainer in order `[A, B, C]` are run in order `[C, B, A]`. This allows algorithms to cleanly undo their effects on state if necessary. + +Composer’s `state` tracks relevant quantities for the training procedure. The code for `state` can be found `here <https://github.com/mosaicml/composer/blob/dev/composer/core/state.py>`__. Algorithms can modify state, and therefore modify the training procedure. + +To implement a custom algorithm, one needs to create a class that inherits from Composer’s `Algorithm` class, and implements a `match` method that specifies which event(s) the algorithm should run on, and an `apply` function that specifies how the custom algorithm should modify quantities in `state`.
+ +The `match` method simply takes `state` and the current `event` as arguments, determines whether or not the algorithm should run, and returns true if it should, false otherwise. In code, a simple `match` might look like this: + +.. code-block:: python + + def match(self, event, state): + return event in [Event.AFTER_DATALOADER, Event.AFTER_FORWARD] + +This will cause the algorithm to run on the `AFTER_DATALOADER` and `AFTER_FORWARD` events. Note that a given algorithm might run on multiple events. + +The `apply` method also takes `state` and the current `event` as arguments. Based on this information, `apply` carries out the appropriate algorithm logic, and modifies `state` with the changes necessary. In code, an `apply` might look like this: + +.. code-block:: python + + def apply(self, event, state, logger): + if event == Event.AFTER_DATALOADER: + state.batch = process_inputs(state.batch) + if event == Event.AFTER_FORWARD: + state.outputs = process_outputs(state.outputs) + + +Note that different logic can be used for different events. + +Packaging this all together into a class gives the object that Composer can run: + +.. code-block:: python + + from composer.core import Algorithm, Event + + class MyAlgorithm(Algorithm): + def __init__(self, hparam1=1): + self.hparam1 = hparam1 + + def match(self, event, state): + return event in [Event.AFTER_DATALOADER, Event.AFTER_FORWARD] + + def apply(self, event, state, logger): + if event == Event.AFTER_DATALOADER: + state.batch = process_inputs(state.batch, self.hparam1) + if event == Event.AFTER_FORWARD: + state.outputs = process_outputs(state.outputs) + + +Using this in training can be done the same way as with Composer’s native algorithms. + +..
code-block:: python + + from composer import Trainer + from composer.algorithms.blurpool import BlurPool + from composer.algorithms.channels_last import ChannelsLast + + channels_last = ChannelsLast() + blurpool = BlurPool(replace_convs=True, replace_maxpools=True, blur_first=True) + custom_algorithm = MyAlgorithm(hparam1=1) + + trainer = Trainer(model=model, + train_dataloader=train_dataloader, + eval_dataloader=test_dataloader, + max_duration='90ep', + device='gpu', + algorithms=[channels_last, blurpool, custom_algorithm], + eval_interval="0ep", + seed=42) From 17f030a938d97dcd7eac9c76c7f81ae3337d3016 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Feb 2023 15:38:27 -0800 Subject: [PATCH 12/16] Save checkpoint as model w&b artifact --- composer/loggers/wandb_logger.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index d5fb641567..6ae9bcb00f 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -239,6 +239,9 @@ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Pa # the trainer is evaluating or predicting. Assuming evaluation in this case. 
metadata.update({f'eval_timestamp/{k}': v for (k, v) in state.eval_timestamp.state_dict().items()}) + if extension == '.pt': + extension = 'model' + wandb_artifact = wandb.Artifact( name=new_remote_file_name, type=extension, From 34b1cc7cf13376c5fa4ccc0c8a0bfe44019f6db8 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Feb 2023 17:33:41 -0800 Subject: [PATCH 13/16] debug wandb uplaod --- composer/loggers/wandb_logger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index 6ae9bcb00f..d5932ab606 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -239,8 +239,8 @@ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Pa # the trainer is evaluating or predicting. Assuming evaluation in this case. metadata.update({f'eval_timestamp/{k}': v for (k, v) in state.eval_timestamp.state_dict().items()}) - if extension == '.pt': - extension = 'model' + # if extension == '.pt': + # extension = 'model' wandb_artifact = wandb.Artifact( name=new_remote_file_name, From c82cdefc535908ad6ed4a91d7bc335c954853e9e Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Fri, 10 Feb 2023 17:46:03 -0800 Subject: [PATCH 14/16] changed back --- composer/loggers/wandb_logger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index d5932ab606..6ae9bcb00f 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -239,8 +239,8 @@ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Pa # the trainer is evaluating or predicting. Assuming evaluation in this case. 
metadata.update({f'eval_timestamp/{k}': v for (k, v) in state.eval_timestamp.state_dict().items()}) - # if extension == '.pt': - # extension = 'model' + if extension == '.pt': + extension = 'model' wandb_artifact = wandb.Artifact( name=new_remote_file_name, From 5c479db785ae79fcecb56309ba681f81607c0670 Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Mon, 13 Feb 2023 10:46:30 -0800 Subject: [PATCH 15/16] fixed model logic --- composer/loggers/wandb_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index 6ae9bcb00f..e08593295c 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -239,7 +239,7 @@ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Pa # the trainer is evaluating or predicting. Assuming evaluation in this case. metadata.update({f'eval_timestamp/{k}': v for (k, v) in state.eval_timestamp.state_dict().items()}) - if extension == '.pt': + if extension == 'pt': extension = 'model' wandb_artifact = wandb.Artifact( From 572957e24151d5a3200a8425f05aa686ffda645e Mon Sep 17 00:00:00 2001 From: Matthew Leavitt Date: Wed, 15 Feb 2023 15:54:16 -0800 Subject: [PATCH 16/16] added comment --- composer/loggers/wandb_logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/composer/loggers/wandb_logger.py b/composer/loggers/wandb_logger.py index e08593295c..e317fd2f93 100644 --- a/composer/loggers/wandb_logger.py +++ b/composer/loggers/wandb_logger.py @@ -239,6 +239,7 @@ def upload_file(self, state: State, remote_file_name: str, file_path: pathlib.Pa # the trainer is evaluating or predicting. Assuming evaluation in this case. metadata.update({f'eval_timestamp/{k}': v for (k, v) in state.eval_timestamp.state_dict().items()}) + # Change the extension so the checkpoint is compatible with W&B's model registry if extension == 'pt': extension = 'model'