diff --git a/composer/datasets/ade20k_hparams.py b/composer/datasets/ade20k_hparams.py index 5da422f3ab..9274787212 100644 --- a/composer/datasets/ade20k_hparams.py +++ b/composer/datasets/ade20k_hparams.py @@ -23,7 +23,7 @@ from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.datasets.synthetic_hparams import SyntheticHparamsMixin from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.utils import dist +from composer.utils import dist, warn_streaming_dataset_deprecation from composer.utils.import_helpers import MissingConditionalImportError __all__ = ['ADE20kDatasetHparams', 'StreamingADE20kHparams'] @@ -152,9 +152,9 @@ class StreamingADE20kHparams(DatasetHparams): """DatasetHparams for creating an instance of StreamingADE20k. Args: - version (int): Which version of streaming to use. Default: ``2``. + version (int): Which version of streaming to use. Default: ``1``. remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - Default: ``'s3://mosaicml-internal-dataset-ade20k/mds/2/``` + Default: ``'s3://mosaicml-internal-dataset-ade20k/mds/1/``` local (str): Local filesystem directory where dataset is cached during operation. Default: ``'/tmp/mds-cache/mds-ade20k/``` split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train```. @@ -166,9 +166,9 @@ class StreamingADE20kHparams(DatasetHparams): Default: ``true``. """ - version: int = hp.optional('Version of streaming (1 or 2)', default=2) + version: int = hp.optional('Version of streaming (1 or 2)', default=1) remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored', - default='s3://mosaicml-internal-dataset-ade20k/mds/2/') + default='s3://mosaicml-internal-dataset-ade20k/mds/1/') local: str = hp.optional('Local filesystem directory where dataset is cached during operation', default='/tmp/mds-cache/mds-ade20k/') split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train') @@ -180,6 +180,7 @@ class StreamingADE20kHparams(DatasetHparams): def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataSpec: if self.version == 1: + warn_streaming_dataset_deprecation(old_version=self.version, new_version=2) dataset = StreamingADE20k(remote=self.remote, local=self.local, split=self.split, diff --git a/composer/datasets/c4_hparams.py b/composer/datasets/c4_hparams.py index 44412338a2..416f8139f1 100644 --- a/composer/datasets/c4_hparams.py +++ b/composer/datasets/c4_hparams.py @@ -12,6 +12,7 @@ from composer.core.data_spec import DataSpec from composer.datasets.c4 import C4Dataset, StreamingC4 from composer.datasets.dataset_hparams import DataLoaderHparams, DatasetHparams +from composer.utils import warn_streaming_dataset_deprecation from composer.utils.import_helpers import MissingConditionalImportError log = logging.getLogger(__name__) @@ -24,9 +25,9 @@ class StreamingC4Hparams(DatasetHparams): """Builds a :class:`.DataSpec` for the StreamingC4 (Colossal Cleaned Common Crawl) dataset. Args: - version (int): Which version of streaming to use. Default: ``2``. + version (int): Which version of streaming to use. Default: ``1``. remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - Default: ``'s3://mosaicml-internal-dataset-c4/mds/2/'`` + Default: ``'s3://mosaicml-internal-dataset-c4/mds/1/'`` local (str): Local filesystem directory where dataset is cached during operation. Default: ``'/tmp/mds-cache/mds-c4/'`` split (str): What split of the dataset to use. Either ``'train'`` or ``'val'``. Default: ``'train'``. @@ -40,9 +41,9 @@ class StreamingC4Hparams(DatasetHparams): timeout (float): How long to wait for shard to download before raising an exception. Default: 120 sec. """ - version: int = hp.optional('Version of streaming (1 or 2)', default=2) + version: int = hp.optional('Version of streaming (1 or 2)', default=1) remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored', - default='s3://mosaicml-internal-dataset-c4/mds/2/') + default='s3://mosaicml-internal-dataset-c4/mds/1/') local: str = hp.optional('Local filesystem directory where dataset is cached during operation', default='/tmp/mds-cache/mds-c4/') split: str = hp.optional('What split of the dataset to use. Either `train` or `val`.', default='train') @@ -76,6 +77,7 @@ def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHpara # Get StreamingC4 dataset if self.version == 1: + warn_streaming_dataset_deprecation(old_version=self.version, new_version=2) dataset = StreamingC4(remote=self.remote, local=self.local, split=self.split, diff --git a/composer/datasets/cifar_hparams.py b/composer/datasets/cifar_hparams.py index bba29f40d9..9be9c2f73b 100644 --- a/composer/datasets/cifar_hparams.py +++ b/composer/datasets/cifar_hparams.py @@ -24,7 +24,7 @@ from composer.datasets.ffcv_utils import write_ffcv_dataset from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.datasets.synthetic_hparams import SyntheticHparamsMixin -from composer.utils import dist +from composer.utils import dist, warn_streaming_dataset_deprecation from composer.utils.import_helpers import MissingConditionalImportError __all__ = ['CIFAR10DatasetHparams', 'StreamingCIFAR10Hparams'] @@ -185,23 +185,24 @@ class StreamingCIFAR10Hparams(DatasetHparams): """Streaming CIFAR10 hyperparameters. Args: - version (int): Which version of streaming to use. Default: ``2``. + version (int): Which version of streaming to use. Default: ``1``. remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - Default: ``'s3://mosaicml-internal-dataset-cifar10/mds/2/'`` + Default: ``'s3://mosaicml-internal-dataset-cifar10/mds/1/'`` local (str): Local filesystem directory where dataset is cached during operation. Default: ``'/tmp/mds-cache/mds-cifar10/'`` split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train'``. """ - version: int = hp.optional('Version of streaming (1 or 2)', default=2) + version: int = hp.optional('Version of streaming (1 or 2)', default=1) remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored', - default='s3://mosaicml-internal-dataset-cifar10/mds/2/') + default='s3://mosaicml-internal-dataset-cifar10/mds/1/') local: str = hp.optional('Local filesystem directory where dataset is cached during operation', default='/tmp/mds-cache/mds-cifar10/') split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train') def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataLoader: if self.version == 1: + warn_streaming_dataset_deprecation(old_version=self.version, new_version=2) dataset = StreamingCIFAR10(remote=self.remote, local=self.local, split=self.split, diff --git a/composer/datasets/coco_hparams.py b/composer/datasets/coco_hparams.py index e1719314f5..5cef0d88f3 100644 --- a/composer/datasets/coco_hparams.py +++ b/composer/datasets/coco_hparams.py @@ -12,7 +12,7 @@ from composer.datasets.coco import COCODetection, StreamingCOCO, split_dict_fn from composer.datasets.dataset_hparams import DataLoaderHparams, DatasetHparams from composer.models.ssd.utils import SSDTransformer, dboxes300_coco -from composer.utils import dist +from composer.utils import dist, warn_streaming_dataset_deprecation from composer.utils.import_helpers import MissingConditionalImportError __all__ = ['COCODatasetHparams', 'StreamingCOCOHparams'] @@ -75,23 +75,24 @@ class StreamingCOCOHparams(DatasetHparams): """DatasetHparams for creating an instance of StreamingCOCO. Args: - version (int): Which version of streaming to use. Default: ``2``. + version (int): Which version of streaming to use. Default: ``1``. remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - Default: ``'s3://mosaicml-internal-dataset-coco/mds/2/``` + Default: ``'s3://mosaicml-internal-dataset-coco/mds/1/``` local (str): Local filesystem directory where dataset is cached during operation. Default: ``'/tmp/mds-cache/mds-coco/``` split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train```. """ - version: int = hp.optional('Version of streaming (1 or 2)', default=2) + version: int = hp.optional('Version of streaming (1 or 2)', default=1) remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored', - default='s3://mosaicml-internal-dataset-coco/mds/2/') + default='s3://mosaicml-internal-dataset-coco/mds/1/') local: str = hp.optional('Local filesystem directory where dataset is cached during operation', default='/tmp/mds-cache/mds-coco/') split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train') def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams): if self.version == 1: + warn_streaming_dataset_deprecation(old_version=self.version, new_version=2) dataset = StreamingCOCO(remote=self.remote, local=self.local, split=self.split, diff --git a/composer/datasets/imagenet_hparams.py b/composer/datasets/imagenet_hparams.py index baaffc4070..738f54ce9b 100644 --- a/composer/datasets/imagenet_hparams.py +++ b/composer/datasets/imagenet_hparams.py @@ -26,7 +26,7 @@ from composer.datasets.synthetic import SyntheticBatchPairDataset from composer.datasets.synthetic_hparams import SyntheticHparamsMixin from composer.datasets.utils import NormalizationFn, pil_image_collate -from composer.utils import dist +from composer.utils import dist, warn_streaming_dataset_deprecation from composer.utils.import_helpers import MissingConditionalImportError # ImageNet normalization values from torchvision: https://pytorch.org/vision/stable/models.html @@ -212,9 +212,9 @@ class StreamingImageNet1kHparams(DatasetHparams): """DatasetHparams for creating an instance of StreamingImageNet1k. Args: - version (int): Which version of streaming to use. Default: ``2``. + version (int): Which version of streaming to use. Default: ``1``. remote (str): Remote directory (S3 or local filesystem) where dataset is stored. - Default: ``'s3://mosaicml-internal-dataset-imagenet1k/mds/2/``` + Default: ``'s3://mosaicml-internal-dataset-imagenet1k/mds/1/``` local (str): Local filesystem directory where dataset is cached during operation. Default: ``'/tmp/mds-cache/mds-imagenet1k/``` split (str): The dataset split to use, either 'train' or 'val'. Default: ``'train```. @@ -222,9 +222,9 @@ class StreamingImageNet1kHparams(DatasetHparams): crop size (int): The crop size to use. Default: ``224``. """ - version: int = hp.optional('Version of streaming (1 or 2)', default=2) + version: int = hp.optional('Version of streaming (1 or 2)', default=1) remote: str = hp.optional('Remote directory (S3 or local filesystem) where dataset is stored', - default='s3://mosaicml-internal-dataset-imagenet1k/mds/2/') + default='s3://mosaicml-internal-dataset-imagenet1k/mds/1/') local: str = hp.optional('Local filesystem directory where dataset is cached during operation', default='/tmp/mds-cache/mds-imagenet1k/') split: str = hp.optional("Which split of the dataset to use. Either ['train', 'val']", default='train') @@ -233,6 +233,7 @@ class StreamingImageNet1kHparams(DatasetHparams): def initialize_object(self, batch_size: int, dataloader_hparams: DataLoaderHparams) -> DataSpec: if self.version == 1: + warn_streaming_dataset_deprecation(old_version=self.version, new_version=2) dataset = StreamingImageNet1k(remote=self.remote, local=self.local, split=self.split, diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index aa0c49ae37..4a4e6a9dbf 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -20,11 +20,21 @@ from composer.utils.string_enum import StringEnum -def warn_yahp_deprecation(): +def warn_yahp_deprecation() -> None: warnings.warn( 'yahp-based workflows are deprecated and will be removed in a future release. Please' 'migrate to using other configuration managers and create the Trainer objects directly.' - 'v0.10 will be the last release to support yahp.', DeprecationWarning) + 'v0.10 will be the last release to support yahp.', + DeprecationWarning, + stacklevel=2) + + +def warn_streaming_dataset_deprecation(old_version: int, new_version: int) -> None: + warnings.warn( + f'streaming dataset version {old_version} is deprecated and will be removed in the future. ' + f'Please migrate to using streaming dataset version {new_version}', + DeprecationWarning, + stacklevel=2) __all__ = [