# Copyright 2021 MosaicML. All Rights Reserved.

import logging
from dataclasses import dataclass
from functools import partial
from itertools import chain
from typing import List, Optional

import datasets
import torch
import yahp as hp

from composer.core.types import Batch
from composer.datasets.dataloader import DataloaderHparams
from composer.datasets.hparams import DataloaderSpec, DatasetHparams
from composer.utils import dist

log = logging.getLogger(__name__)


def _split_dict_fn(batch: Batch, n_microbatches: int) -> List[Batch]:
    """Split a dict batch of tensors into ``n_microbatches`` smaller dict batches."""
    if isinstance(batch, dict):
        chunked = {k: v.chunk(n_microbatches) for k, v in batch.items()}
        num_chunks = len(list(chunked.values())[0])
        return [{k: v[idx] for k, v in chunked.items()} for idx in range(num_chunks)]
    else:
        raise ValueError(f"Expected the batch from the dataloader to be of type Dict[str, Tensor], but got {type(batch)}")
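
# Illustration of _split_dict_fn (hypothetical shapes, not executed here): a batch such as
# {"input_ids": Tensor[8, 512], "labels": Tensor[8, 512]} split with n_microbatches=2
# yields two dicts whose tensors each have shape [4, 512].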


# Shard counts and per-shard sample counts for known streaming datasets, keyed as
# {dataset_name: {config_name: {split: (num_shards, samples_per_shard)}}}.
# Used to report an approximate dataset length.
CACHED_DATASET_SIZES = {"c4": {"en": {"train": (1024, 356317), "validation": (8, 45576)}}}


@dataclass
class StreamingLMDatasetHparams(DatasetHparams):
    """Defines a generic streaming dataset for autoregressive and masked language models."""

    dataset_name: str = hp.optional("Name of the dataset to load.", default=None)
    dataset_config_name: Optional[str] = hp.optional(
        "If required, the specific configuration of the dataset to use.", default=None)
    split: str = hp.optional("Which split of the dataset to use (e.g. 'train', 'validation', or 'test').", default=None)
    max_shards: int = hp.optional("Max number of shards, used to deterministically reduce the dataset size.", default=-1)
    max_samples: int = hp.optional(
        "Max number of post-processed samples. Note that the chosen subset depends on the seed and world size.",
        default=-1)
    tokenizer_name: str = hp.optional("The name of the tokenizer to preprocess text with.", default=None)
    max_seq_len: int = hp.optional("The max sequence length of each token sample.", default=None)
    group_method: str = hp.optional("How to group text samples into token samples ('truncate' or 'concat').",
                                    default=None)
    use_masked_lm: bool = hp.optional("Whether the dataset should be encoded with masked language modeling or not.",
                                      default=None)
    mlm_probability: float = hp.optional("If using masked language modeling, the probability of masking each token.",
                                         default=0.15)
    seed: int = hp.optional("Which seed to use to generate train and validation splits.", default=5)
    shuffle: bool = hp.optional("Whether to shuffle the dataset for each epoch.", default=True)
    drop_last: bool = hp.optional("Whether to drop the samples that would form the last, incomplete batch.",
                                  default=False)

    def validate(self):
        assert self.group_method in ["truncate", "concat"], f"Unknown group_method: '{self.group_method}'"
        assert self.drop_last, "No support for 'drop_last'=False currently."
        if self.group_method == "concat":
            assert self.max_samples > 0, "Must provide 'max_samples' if 'group_method'='concat'"
        if self.use_masked_lm:
            if self.mlm_probability <= 0.0:
                raise ValueError(
                    "If using masked language modeling, you must replace tokens with a non-zero probability.")

    def _load_dataset(self):
        return datasets.load_dataset(path=self.dataset_name,
                                     name=self.dataset_config_name,
                                     split=self.split,
                                     streaming=True)

    def _get_approx_num_samples(self):
        if self.max_samples > 0:
            return self.max_samples
        try:
            n_shards, samples_per_shard = CACHED_DATASET_SIZES[self.dataset_name][self.dataset_config_name][self.split]
            n_shards = self.max_shards if self.max_shards > 0 else n_shards
            return n_shards * samples_per_shard
        except KeyError:
            raise NotImplementedError(
                "Unknown dataset size. Either provide 'max_samples' or add an entry to CACHED_DATASET_SIZES.")

    def _get_approx_num_tokens(self):
        # Placeholder upper bound; exact token counts are not tracked for streaming datasets.
        return 1e12

    def _subsample(self, device_offset, text_batch):
        # Only return the i-th item out of N sequential items
        for k, v in text_batch.items():
            text_batch[k] = v[device_offset:device_offset + 1]
        return text_batch

    def _shard_dataset(self, dataset):
        # Select a subset of filepaths for sharded DDP training
        world_size = dist.get_world_size()
        rank = dist.get_global_rank()
        filepaths = dataset._ex_iterable.kwargs['filepaths']
        # If subsampling using 'max_shards', deterministically choose shards
        if self.max_shards > 0:
            filepaths = filepaths[:self.max_shards]
        num_shards = len(filepaths)

        devices_per_shard = 1
        if world_size > num_shards:
            log.warning(
                f"Not enough unique shards ({num_shards}) for world size ({world_size}). Splitting shards among devices."
            )
            assert world_size % num_shards == 0, f"Cannot evenly split {num_shards} shards among {world_size} devices"
            devices_per_shard = world_size // num_shards
        shard_offset = rank // devices_per_shard
        device_offset = rank % devices_per_shard

        device_filepaths = filepaths[shard_offset::world_size]
        dataset._ex_iterable.kwargs['filepaths'] = device_filepaths

        # Subsample dataset if shard is being shared among devices
        # NOTE: Mapping is executed in batched mode for better CPU utilization,
        # but the returned dataset is still an iterable over text samples
        if devices_per_shard > 1:
            dataset = dataset.map(
                partial(self._subsample, device_offset),
                batched=True,
                batch_size=devices_per_shard,
            )
        return dataset
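
    # Example of the sharding arithmetic in _shard_dataset above (hypothetical values):
    # with 4 shard files and world_size=8, each shard is shared by devices_per_shard=2
    # ranks; e.g. ranks 0 and 1 both read shard 0 but keep alternating samples
    # (device_offset 0 and 1, respectively).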

    def _tokenize(self, text_batch):
        # Convert a text batch to a token batch
        if self.group_method == "truncate":
            truncation = True
            padding = 'max_length'
            max_length = self.max_seq_len
        else:
            truncation = False
            padding = False
            max_length = None
        return self.tokenizer(text_batch["text"], truncation=truncation, padding=padding, max_length=max_length)

    def _group_tokens(self, token_batch):
        if self.group_method == "concat":
            # Concatenate all tokens.
            concat_tokens = {}
            num_tokens = None
            for k, v in token_batch.items():
                concat_v = list(chain(*v))
                concat_tokens[k] = concat_v
                if num_tokens is None:
                    num_tokens = len(concat_v)
                else:
                    assert num_tokens == len(concat_v), "Not all values in the concat_tokens dict have the same len()"

            # Drop the small remainder of tokens at the end of the batch.
            # In the future we could support padding instead.
            if num_tokens >= self.max_seq_len:
                num_tokens = (num_tokens // self.max_seq_len) * self.max_seq_len

            # Split into token samples of size max_seq_len.
            result = {
                k: [v[i:i + self.max_seq_len] for i in range(0, num_tokens, self.max_seq_len)]
                for k, v in concat_tokens.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result
        else:
            raise ValueError(f"Unknown group_method: '{self.group_method}'")
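
    # Example of the grouping arithmetic in _group_tokens above (hypothetical values):
    # with max_seq_len=1024, a batch that concatenates to 2500 tokens yields two token
    # samples of length 1024, and the trailing 452 tokens are dropped.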

    def initialize_object(self, batch_size: int, dataloader_hparams: DataloaderHparams) -> DataloaderSpec:
        assert dataloader_hparams.num_workers == 1, "LM Streaming Dataloader only supports num_workers=1"

        try:
            import datasets
            import transformers
        except ImportError:
            raise ImportError('huggingface transformers and datasets are not installed. '
                              'Please install with `pip install mosaicml-composer[nlp]`')
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.tokenizer_name)  # type: ignore (thirdparty)
        self.config = transformers.AutoConfig.from_pretrained(self.tokenizer_name)  # type: ignore (thirdparty)

        # Load and shard dataset
        text_dataset = self._load_dataset()
        text_dataset = self._shard_dataset(text_dataset)

        # Shuffle
        if self.shuffle:
            text_dataset = text_dataset.shuffle(buffer_size=10000, seed=self.seed)

        # Map text samples to token samples
        # NOTE: Mapping is executed in batched mode for better CPU utilization,
        # but the returned dataset is still an iterable over tokenized samples
        text_sample_batch_size = 1000
        token_dataset = text_dataset.map(
            self._tokenize,
            batched=True,
            batch_size=text_sample_batch_size,
        )

        if self.group_method != "truncate":
            # Map variable-length token samples to fixed-length token samples
            # NOTE: Mapping is executed in batched mode for better CPU utilization,
            # but the returned dataset is still an iterable over tokenized samples.
            # NOTE: Depending on the 'group_method', this step may alter the number of
            # token samples in the dataset, and may mix neighboring token samples together.
            token_sample_batch_size = 1000
            token_dataset = token_dataset.map(
                self._group_tokens,
                batched=True,
                batch_size=token_sample_batch_size,
            )

        # Maybe limit the number of post-processed samples
        if self.max_samples > 0:
            token_dataset = token_dataset.take(self.max_samples // dist.get_world_size())

        # Attach the approximate num samples by wrapping in a SizedIterableDataset
        sized_iterable_dataset = SizedIterableDataset(token_dataset, self._get_approx_num_samples())

        # Get collate_fn
        if self.tokenizer_name in ["gpt2"]:
            # The GPT2 tokenizer has no padding token, which breaks the language-modeling
            # collator, so fall back to the default collator.
            collate_fn = transformers.default_data_collator
        else:
            collate_fn = transformers.DataCollatorForLanguageModeling(tokenizer=self.tokenizer,
                                                                      mlm=self.use_masked_lm,
                                                                      mlm_probability=self.mlm_probability)
        # Return DataloaderSpec
        return DataloaderSpec(dataloader=dataloader_hparams.initialize_object(
            dataset=sized_iterable_dataset,
            batch_size=batch_size,
            sampler=None,
            drop_last=self.drop_last,
            collate_fn=collate_fn,
        ),
                              split_fn=_split_dict_fn)


class SizedIterableDataset(torch.utils.data.IterableDataset):
    """Wraps a HuggingFace iterable dataset and reports an approximate number of samples."""

    def __init__(self, hf_iterable_dataset, num_samples):
        self.hf_iterable_dataset = hf_iterable_dataset
        self.num_samples = num_samples

    def __iter__(self):
        return iter(self.hf_iterable_dataset)

    def __len__(self):
        return self.num_samples
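

# A minimal usage sketch (illustrative only): in practice these hparams are driven by a
# YAML/CLI config, and the exact DataloaderHparams construction is configuration-specific,
# so it is only hinted at here rather than spelled out.
#
#   hparams = StreamingLMDatasetHparams(
#       dataset_name="c4",
#       dataset_config_name="en",
#       split="validation",
#       tokenizer_name="gpt2",
#       max_seq_len=1024,
#       group_method="truncate",
#       use_masked_lm=False,
#       drop_last=True,
#   )
#   hparams.validate()
#   spec = hparams.initialize_object(batch_size=8, dataloader_hparams=dataloader_hparams)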