Skip to content

Commit ee7b97f

Browse files
authored
Update FSDP checkpointing test to use UC Volumes and updated dockerfile for new composer version (#3865)
1 parent 0e932e0 commit ee7b97f

File tree

5 files changed

+36
-25
lines changed

5 files changed

+36
-25
lines changed

docker/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the
1515
<!-- BEGIN_COMPOSER_BUILD_MATRIX -->
1616
| Composer Version | CUDA Support | Docker Tag |
1717
|--------------------|----------------|----------------------------------------------------------------|
18-
| 0.30.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.30.0` |
19-
| 0.30.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.30.0_cpu` |
18+
| 0.31.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.31.0` |
19+
| 0.31.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.31.0_cpu` |
2020
<!-- END_COMPOSER_BUILD_MATRIX -->
2121

2222
**Note**: For a lightweight installation, we recommend using a [MosaicML PyTorch Image](#pytorch-images) and manually

docker/build_matrix.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -183,35 +183,35 @@
183183
TORCHVISION_VERSION: 0.21.0
184184
UBUNTU_VERSION: '20.04'
185185
- BASE_IMAGE: nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
186-
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.30.0
186+
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.31.0
187187
CUDA_VERSION: 12.6.3
188188
EFA_INSTALLER_VERSION: ''
189-
IMAGE_NAME: composer-0-30-0
189+
IMAGE_NAME: composer-0-31-0
190190
MOFED_VERSION: latest-23.10
191191
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
192192
PYTHON_VERSION: '3.12'
193193
PYTORCH_NIGHTLY_URL: ''
194194
PYTORCH_NIGHTLY_VERSION: ''
195195
PYTORCH_VERSION: 2.7.0
196196
TAGS:
197-
- mosaicml/composer:0.30.0
197+
- mosaicml/composer:0.31.0
198198
- mosaicml/composer:latest
199199
TARGET: composer_stage
200200
TORCHVISION_VERSION: 0.22.0
201201
UBUNTU_VERSION: '22.04'
202202
- BASE_IMAGE: ubuntu:22.04
203-
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.30.0
203+
COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.31.0
204204
CUDA_VERSION: ''
205205
EFA_INSTALLER_VERSION: ''
206-
IMAGE_NAME: composer-0-30-0-cpu
206+
IMAGE_NAME: composer-0-31-0-cpu
207207
MOFED_VERSION: latest-23.10
208208
NVIDIA_REQUIRE_CUDA_OVERRIDE: ''
209209
PYTHON_VERSION: '3.12'
210210
PYTORCH_NIGHTLY_URL: ''
211211
PYTORCH_NIGHTLY_VERSION: ''
212212
PYTORCH_VERSION: 2.7.0
213213
TAGS:
214-
- mosaicml/composer:0.30.0_cpu
214+
- mosaicml/composer:0.31.0_cpu
215215
- mosaicml/composer:latest_cpu
216216
TARGET: composer_stage
217217
TORCHVISION_VERSION: 0.22.0

docker/generate_build_matrix.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ def _main():
294294
composer_entries = []
295295

296296
# The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images
297-
composer_versions = ['0.30.0'] # Only build images for the latest composer version
297+
composer_versions = ['0.31.0'] # Only build images for the latest composer version
298298
composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest
299299

300300
for product in itertools.product(composer_python_versions, composer_versions, cuda_options):

tests/fixtures/fixtures.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,14 @@ def s3_read_only_prefix():
131131
return 'read_only'
132132

133133

134+
@pytest.fixture
135+
def uc_volume_path(request: pytest.FixtureRequest):
136+
if request.node.get_closest_marker('remote') is None:
137+
return 'my-volume'
138+
else:
139+
return os.environ.get('UC_VOLUME_PATH', 'Volumes/main/regression_testing/composer_artifacts/')
140+
141+
134142
## MODEL HELPERS ##
135143
def causal_lm_model_helper(config): # type: ignore
136144
transformers = pytest.importorskip('transformers')

tests/trainer/test_fsdp_checkpoint.py

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from packaging import version
2121
from torch.distributed._shard.sharded_tensor import ShardedTensor
2222
from torch.distributed._tensor import DTensor
23+
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
2324
from torch.utils.data import DataLoader
2425
from torchmetrics import Metric, MetricCollection
2526
from torchmetrics.classification import MulticlassAccuracy
@@ -33,7 +34,7 @@
3334
from composer.utils import FSDPConfig, TPConfig, dist, parse_uri
3435
from composer.utils.checkpoint import dist_cp_load
3536
from composer.utils.file_helpers import get_file
36-
from composer.utils.object_store import S3ObjectStore
37+
from composer.utils.object_store import UCObjectStore
3738
from composer.utils.reproducibility import get_rng_state
3839
from tests.common import RandomClassificationDataset, deep_compare
3940
from tests.common.markers import world_size
@@ -517,6 +518,7 @@ def test_fsdp_mixed_with_sync(
517518
'0.28.0',
518519
'0.29.0',
519520
'0.30.0',
521+
'0.31.0',
520522
],
521523
)
522524
@pytest.mark.filterwarnings(r'ignore:.*metrics are not saved with sharded state dict.*:UserWarning')
@@ -529,8 +531,7 @@ def test_fsdp_load_old_checkpoint(
529531
precision: str,
530532
sharding_strategy: str,
531533
state_dict_type: str,
532-
s3_bucket: str,
533-
s3_read_only_prefix: str,
534+
uc_volume_path: str,
534535
composer_version: str,
535536
):
536537
if composer_version == '0.18.1' and state_dict_type == 'full' and precision == 'amp_bf16' and sharding_strategy == 'FULL_SHARD':
@@ -540,25 +541,27 @@ def test_fsdp_load_old_checkpoint(
540541
if state_dict_type == 'sharded':
541542
pytest.skip('Loading legacy sharded checkpoints are not supported after v0.25.0.')
542543

543-
load_path_dir = (
544-
f's3://{s3_bucket}/{s3_read_only_prefix}/backwards_compatibility/'
545-
f'{composer_version}/{sharding_strategy.lower()}_{state_dict_type}_'
546-
f'{precision}/'
544+
load_path_dir = os.path.join(
545+
f'dbfs:/{uc_volume_path}',
546+
'backwards_compatibility',
547+
composer_version,
548+
f'{sharding_strategy.lower()}_{state_dict_type}_{precision}',
547549
)
548550
if ((version.parse(composer_version) > version.parse('0.15.0')) and state_dict_type != 'full'):
549-
load_path_dir = (load_path_dir + 'ep0-ba2/')
551+
load_path_dir = os.path.join(load_path_dir, 'ep0-ba2')
550552

551-
load_path = load_path_dir + f'ba2_rank0.pt'
553+
load_path = os.path.join(load_path_dir, f'ba2_rank0.pt')
552554
else:
553-
load_path = (
554-
f's3://{s3_bucket}/{s3_read_only_prefix}/backwards_compatibility/'
555-
f'{composer_version}/{sharding_strategy.lower()}_{state_dict_type}_'
556-
f'{precision}/'
555+
load_path = os.path.join(
556+
f'dbfs:/{uc_volume_path}',
557+
'backwards_compatibility',
558+
composer_version,
559+
f'{sharding_strategy.lower()}_{state_dict_type}_{precision}',
557560
)
558561
if state_dict_type == 'full':
559-
load_path += 'ba2_rank0.pt'
562+
load_path = os.path.join(load_path, 'ba2_rank0.pt')
560563
else:
561-
load_path += 'ep0-ba2/'
564+
load_path = os.path.join(load_path, 'ep0-ba2')
562565

563566
if composer_version == '0.15.1':
564567
num_classes = 8 # This parameter setting is very important. Don't change or the test will fail.
@@ -619,7 +622,7 @@ def test_fsdp_load_old_checkpoint(
619622
'rng': get_rng_state(),
620623
}
621624

622-
object_store = S3ObjectStore(bucket=f'{s3_bucket}')
625+
object_store = UCObjectStore(path=uc_volume_path)
623626
storage_reader = DistCPObjectStoreReader(
624627
source_path=parsed_load_path,
625628
destination_path=destination,

0 commit comments

Comments
 (0)