Skip to content

Commit 3f3ba30

Browse files
authored
Defaulting streaming dataset version to 1 and add a deprecation warning (#1532)
1 parent 8e2639e commit 3f3ba30

File tree

6 files changed

+42
-26
lines changed

6 files changed

+42
-26
lines changed

composer/datasets/ade20k_hparams.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from composer.datasets.synthetic import SyntheticBatchPairDataset
2424
from composer.datasets.synthetic_hparams import SyntheticHparamsMixin
2525
from composer.datasets.utils import NormalizationFn, pil_image_collate
26-
from composer.utils import dist
26+
from composer.utils import dist, warn_streaming_dataset_deprecation
2727
from composer.utils.import_helpers import MissingConditionalImportError
2828

2929
__all__ = ['ADE20kDatasetHparams', 'StreamingADE20kHparams']
@@ -152,9 +152,9 @@ class StreamingADE20kHparams(DatasetHparams):
152152
"""DatasetHparams for creating an instance of StreamingADE20k.
153153
154154
Args:
155-
version (int): Which version of streaming to use. Default: ``2``.
155+
version (int): Which version of streaming to use. Default: ``1``.
156156
remote (str): Remote directory (S3 or local filesystem) where dataset is stored.
157-
Default: ``'s3://mosaicml-internal-dataset-ade20k/mds/2/```
157+
Default: ``'s3://mosaicml-internal-dataset-ade20k/mds/1/'``
158158
local (str): Local filesystem directory where dataset is cached during operation.
159159
Default: ``'/tmp/mds-cache/mds-ade20k/'``
160160
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``.
@@ -166,9 +166,9 @@ class StreamingADE20kHparams(DatasetHparams):
166166
Default: ``true``.
167167
"""
168168

169-
version: int = hp.optional('Version of streaming (1 or 2)', default=2)
169+
version: int = hp.optional('Version of streaming (1 or 2)', default=1)
170170
remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
171-
default='s3://mosaicml-internal-dataset-ade20k/mds/2/')
171+
default='s3://mosaicml-internal-dataset-ade20k/mds/1/')
172172
local: str = hp.optional('Local filesystem directory where dataset is cached during operation',
173173
default='/tmp/mds-cache/mds-ade20k/')
174174
split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train')
@@ -180,6 +180,7 @@ class StreamingADE20kHparams(DatasetHparams):
180180

181181
def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataSpec:
182182
if self.version == 1:
183+
warn_streaming_dataset_deprecation(old_version=self.version, new_version=2)
183184
dataset = StreamingADE20k(remote=self.remote,
184185
local=self.local,
185186
split=self.split,

composer/datasets/c4_hparams.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from composer.core.data_spec import DataSpec
1313
from composer.datasets.c4 import C4Dataset, StreamingC4
1414
from composer.datasets.dataset_hparams import DataLoaderHparams, DatasetHparams
15+
from composer.utils import warn_streaming_dataset_deprecation
1516
from composer.utils.import_helpers import MissingConditionalImportError
1617

1718
log = logging.getLogger(__name__)
@@ -24,9 +25,9 @@ class StreamingC4Hparams(DatasetHparams):
2425
"""Builds a :class:`.DataSpec` for the StreamingC4 (Colossal Cleaned Common Crawl) dataset.
2526
2627
Args:
27-
version (int): Which version of streaming to use. Default: ``2``.
28+
version (int): Which version of streaming to use. Default: ``1``.
2829
remote (str): Remote directory (S3 or local filesystem) where dataset is stored.
29-
Default: ``'s3://mosaicml-internal-dataset-c4/mds/2/'``
30+
Default: ``'s3://mosaicml-internal-dataset-c4/mds/1/'``
3031
local (str): Local filesystem directory where dataset is cached during operation.
3132
Default: ``'/tmp/mds-cache/mds-c4/'``
3233
split (str): What split of the dataset to use. Either ``'train'`` or ``'val'``. Default: ``'train'``.
@@ -40,9 +41,9 @@ class StreamingC4Hparams(DatasetHparams):
4041
timeout (float): How long to wait for shard to download before raising an exception. Default: 120 sec.
4142
"""
4243

43-
version: int = hp.optional('Version of streaming (1 or 2)', default=2)
44+
version: int = hp.optional('Version of streaming (1 or 2)', default=1)
4445
remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
45-
default='s3://mosaicml-internal-dataset-c4/mds/2/')
46+
default='s3://mosaicml-internal-dataset-c4/mds/1/')
4647
local: str = hp.optional('Local filesystem directory where dataset is cached during operation',
4748
default='/tmp/mds-cache/mds-c4/')
4849
split: str = hp.optional('What split of the dataset to use. Either `train` or `val`.', default='train')
@@ -76,6 +77,7 @@ def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHpara
7677

7778
# Get StreamingC4 dataset
7879
if self.version == 1:
80+
warn_streaming_dataset_deprecation(old_version=self.version, new_version=2)
7981
dataset = StreamingC4(remote=self.remote,
8082
local=self.local,
8183
split=self.split,

composer/datasets/cifar_hparams.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from composer.datasets.ffcv_utils import write_ffcv_dataset
2525
from composer.datasets.synthetic import SyntheticBatchPairDataset
2626
from composer.datasets.synthetic_hparams import SyntheticHparamsMixin
27-
from composer.utils import dist
27+
from composer.utils import dist, warn_streaming_dataset_deprecation
2828
from composer.utils.import_helpers import MissingConditionalImportError
2929

3030
__all__ = ['CIFAR10DatasetHparams', 'StreamingCIFAR10Hparams']
@@ -185,23 +185,24 @@ class StreamingCIFAR10Hparams(DatasetHparams):
185185
"""Streaming CIFAR10 hyperparameters.
186186
187187
Args:
188-
version (int): Which version of streaming to use. Default: ``2``.
188+
version (int): Which version of streaming to use. Default: ``1``.
189189
remote (str): Remote directory (S3 or local filesystem) where dataset is stored.
190-
Default: ``'s3://mosaicml-internal-dataset-cifar10/mds/2/'``
190+
Default: ``'s3://mosaicml-internal-dataset-cifar10/mds/1/'``
191191
local (str): Local filesystem directory where dataset is cached during operation.
192192
Default: ``'/tmp/mds-cache/mds-cifar10/'``
193193
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``.
194194
"""
195195

196-
version: int = hp.optional('Version of streaming (1 or 2)', default=2)
196+
version: int = hp.optional('Version of streaming (1 or 2)', default=1)
197197
remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
198-
default='s3://mosaicml-internal-dataset-cifar10/mds/2/')
198+
default='s3://mosaicml-internal-dataset-cifar10/mds/1/')
199199
local: str = hp.optional('Local filesystem directory where dataset is cached during operation',
200200
default='/tmp/mds-cache/mds-cifar10/')
201201
split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train')
202202

203203
def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataLoader:
204204
if self.version == 1:
205+
warn_streaming_dataset_deprecation(old_version=self.version, new_version=2)
205206
dataset = StreamingCIFAR10(remote=self.remote,
206207
local=self.local,
207208
split=self.split,

composer/datasets/coco_hparams.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from composer.datasets.coco import COCODetection, StreamingCOCO, split_dict_fn
1313
from composer.datasets.dataset_hparams import DataLoaderHparams, DatasetHparams
1414
from composer.models.ssd.utils import SSDTransformer, dboxes300_coco
15-
from composer.utils import dist
15+
from composer.utils import dist, warn_streaming_dataset_deprecation
1616
from composer.utils.import_helpers import MissingConditionalImportError
1717

1818
__all__ = ['COCODatasetHparams', 'StreamingCOCOHparams']
@@ -75,23 +75,24 @@ class StreamingCOCOHparams(DatasetHparams):
7575
"""DatasetHparams for creating an instance of StreamingCOCO.
7676
7777
Args:
78-
version (int): Which version of streaming to use. Default: ``2``.
78+
version (int): Which version of streaming to use. Default: ``1``.
7979
remote (str): Remote directory (S3 or local filesystem) where dataset is stored.
80-
Default: ``'s3://mosaicml-internal-dataset-coco/mds/2/```
80+
Default: ``'s3://mosaicml-internal-dataset-coco/mds/1/'``
8181
local (str): Local filesystem directory where dataset is cached during operation.
8282
Default: ``'/tmp/mds-cache/mds-coco/'``
8383
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``.
8484
"""
8585

86-
version: int = hp.optional('Version of streaming (1 or 2)', default=2)
86+
version: int = hp.optional('Version of streaming (1 or 2)', default=1)
8787
remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
88-
default='s3://mosaicml-internal-dataset-coco/mds/2/')
88+
default='s3://mosaicml-internal-dataset-coco/mds/1/')
8989
local: str = hp.optional('Local filesystem directory where dataset is cached during operation',
9090
default='/tmp/mds-cache/mds-coco/')
9191
split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train')
9292

9393
def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams):
9494
if self.version == 1:
95+
warn_streaming_dataset_deprecation(old_version=self.version, new_version=2)
9596
dataset = StreamingCOCO(remote=self.remote,
9697
local=self.local,
9798
split=self.split,

composer/datasets/imagenet_hparams.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from composer.datasets.synthetic import SyntheticBatchPairDataset
2727
from composer.datasets.synthetic_hparams import SyntheticHparamsMixin
2828
from composer.datasets.utils import NormalizationFn, pil_image_collate
29-
from composer.utils import dist
29+
from composer.utils import dist, warn_streaming_dataset_deprecation
3030
from composer.utils.import_helpers import MissingConditionalImportError
3131

3232
# ImageNet normalization values from torchvision: https://pytorch.org/vision/stable/models.html
@@ -212,19 +212,19 @@ class StreamingImageNet1kHparams(DatasetHparams):
212212
"""DatasetHparams for creating an instance of StreamingImageNet1k.
213213
214214
Args:
215-
version (int): Which version of streaming to use. Default: ``2``.
215+
version (int): Which version of streaming to use. Default: ``1``.
216216
remote (str): Remote directory (S3 or local filesystem) where dataset is stored.
217-
Default: ``'s3://mosaicml-internal-dataset-imagenet1k/mds/2/```
217+
Default: ``'s3://mosaicml-internal-dataset-imagenet1k/mds/1/'``
218218
local (str): Local filesystem directory where dataset is cached during operation.
219219
Default: ``'/tmp/mds-cache/mds-imagenet1k/'``
220220
split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``.
221221
resize_size (int, optional): The resize size to use. Use -1 to not resize. Default: ``-1``.
222222
crop_size (int): The crop size to use. Default: ``224``.
223223
"""
224224

225-
version: int = hp.optional('Version of streaming (1 or 2)', default=2)
225+
version: int = hp.optional('Version of streaming (1 or 2)', default=1)
226226
remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored',
227-
default='s3://mosaicml-internal-dataset-imagenet1k/mds/2/')
227+
default='s3://mosaicml-internal-dataset-imagenet1k/mds/1/')
228228
local: str = hp.optional('Local filesystem directory where dataset is cached during operation',
229229
default='/tmp/mds-cache/mds-imagenet1k/')
230230
split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train')
@@ -233,6 +233,7 @@ class StreamingImageNet1kHparams(DatasetHparams):
233233

234234
def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataSpec:
235235
if self.version == 1:
236+
warn_streaming_dataset_deprecation(old_version=self.version, new_version=2)
236237
dataset = StreamingImageNet1k(remote=self.remote,
237238
local=self.local,
238239
split=self.split,

composer/utils/__init__.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,21 @@
2020
from composer.utils.string_enum import StringEnum
2121

2222

23-
def warn_yahp_deprecation():
23+
def warn_yahp_deprecation() -> None:
2424
warnings.warn(
2525
'yahp-based workflows are deprecated and will be removed in a future release. Please'
2626
'migrate to using other configuration managers and create the Trainer objects directly.'
27-
'v0.10 will be the last release to support yahp.', DeprecationWarning)
27+
'v0.10 will be the last release to support yahp.',
28+
DeprecationWarning,
29+
stacklevel=2)
30+
31+
32+
def warn_streaming_dataset_deprecation(old_version: int, new_version: int) -> None:
33+
warnings.warn(
34+
f'streaming dataset version {old_version} is deprecated and will be removed in the future. '
35+
f'Please migrate to using streaming dataset version {new_version}',
36+
DeprecationWarning,
37+
stacklevel=2)
2838

2939

3040
__all__ = [

0 commit comments

Comments
 (0)