From c29cfef38f99d00091c79b50e2167dcf96e8cbf8 Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Thu, 9 Feb 2023 15:19:06 -0500
Subject: [PATCH 1/2] permit opt tokenizer

---
 .../in_context_learning_evaluation.py         |  8 +-
 .../test_in_context_learning_datasets.py      | 82 +++++++++++++++++++
 2 files changed, 86 insertions(+), 4 deletions(-)

diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py
index 0f4cd525cc..847a906c3d 100644
--- a/composer/datasets/in_context_learning_evaluation.py
+++ b/composer/datasets/in_context_learning_evaluation.py
@@ -153,10 +153,10 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
 
             cont = f'{continuation_delimiter}{cont}'
 
-            encoded_example['context'] = self.tokenizer(ctxt)
-            encoded_example['continuation'] = self.tokenizer(cont)
             encoded_example['preamble'] = self.tokenizer(
                 preamble)  # if the preamble is empty then these will be 0-length lists
+            encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=len(preamble) == 0)
+            encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)
 
             examples.append(encoded_example)
 
@@ -300,11 +300,11 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
                 query = f'{example_delimiter}{query}'
             choices = [f'{continuation_delimiter}{choice}' for choice in choices]
 
-            encoded_example['query'] = self.tokenizer(query)
-            encoded_example['choices'] = [self.tokenizer(choice) for choice in choices]
             encoded_example['preamble'] = self.tokenizer(
                 preamble)  # if the preamble is empty then these will be 0-length lists
             encoded_example['gold_idx'] = gold_idx
+            encoded_example['query'] = self.tokenizer(query, add_special_tokens=len(preamble) == 0)
+            encoded_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices]
 
             examples.append(encoded_example)
 
diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py
index 1af596e559..4f2f716cca 100644
--- a/tests/datasets/test_in_context_learning_datasets.py
+++ b/tests/datasets/test_in_context_learning_datasets.py
@@ -5,6 +5,7 @@
 
 import pytest
 from torch.utils.data import DataLoader
+from transformers import AutoTokenizer
 
 from composer.core import Evaluator
 from composer.datasets.in_context_learning_evaluation import (_get_fewshot_sample_idxs, _make_padded_input,
@@ -72,6 +73,87 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
     assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
 
 
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+def test_lm_task_dataloader_opt_tokenizer(dataset_uri):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 2048
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri,
+                                 tokenizer,
+                                 batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='')
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+
+
+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
+def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
+
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 2048
+    dl = get_icl_task_dataloader('multiple_choice',
+                                 dataset_uri,
+                                 tokenizer,
+                                 batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=1,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=': ')
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ': Pour it onto a plate'
+    assert tokenizer.decode(
+        batch['input_ids'][0][0:min_idx]
+    ) == "how do you open a capri-sun: open the straw attached to the juice, and then stick it in the small hole at the front of the pouch.\nWhen boiling butter, when it's ready, you can"
+
+
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')

From 6c44713b6a50f2ebedd59bcb3c69cda44f7769fa Mon Sep 17 00:00:00 2001
From: bmosaicml
Date: Thu, 9 Feb 2023 16:52:28 -0500
Subject: [PATCH 2/2] fix tokenizer to work w/ 0 shot

---
 .../in_context_learning_evaluation.py         | 11 ++++++-----
 .../test_in_context_learning_datasets.py      | 19 +++++++++++--------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py
index 847a906c3d..424c4e3f71 100644
--- a/composer/datasets/in_context_learning_evaluation.py
+++ b/composer/datasets/in_context_learning_evaluation.py
@@ -154,8 +154,9 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
             cont = f'{continuation_delimiter}{cont}'
 
             encoded_example['preamble'] = self.tokenizer(
-                preamble)  # if the preamble is empty then these will be 0-length lists
-            encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=len(preamble) == 0)
+                preamble
+            )  # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer)
+            encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False)
             encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)
 
             examples.append(encoded_example)
@@ -298,12 +299,12 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
                 'choices'], self.samples[sample_idx]['gold'],
             if len(preamble) > 0:
                 query = f'{example_delimiter}{query}'
-            choices = [f'{continuation_delimiter}{choice}' for choice in choices]
 
             encoded_example['preamble'] = self.tokenizer(
-                preamble)  # if the preamble is empty then these will be 0-length lists
+                preamble
+            )  # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer)
             encoded_example['gold_idx'] = gold_idx
-            encoded_example['query'] = self.tokenizer(query, add_special_tokens=len(preamble) == 0)
+            encoded_example['query'] = self.tokenizer(query, add_special_tokens=False)
             encoded_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices]
 
             examples.append(encoded_example)
diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py
index 4f2f716cca..f7df77d01f 100644
--- a/tests/datasets/test_in_context_learning_datasets.py
+++ b/tests/datasets/test_in_context_learning_datasets.py
@@ -52,7 +52,7 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
                                  batch_size,
                                  max_seq_len=seqlen,
                                  pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=1,
+                                 num_fewshot=0,
                                  prompt_string='',
                                  example_delimiter='\n',
                                  continuation_delimiter='')
@@ -74,7 +74,8 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
 
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
-def test_lm_task_dataloader_opt_tokenizer(dataset_uri):
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
@@ -87,7 +88,7 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri):
                                  batch_size,
                                  max_seq_len=seqlen,
                                  pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=1,
+                                 num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
                                  continuation_delimiter='')
@@ -106,10 +107,13 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri):
     min_idx = min(batch['continuation_indices'][0]).item()
    max_idx = max(batch['continuation_indices'][0]).item()
     assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
-def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
@@ -123,7 +127,7 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
                                  batch_size,
                                  max_seq_len=seqlen,
                                  pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=1,
+                                 num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
                                  continuation_delimiter=': ')
@@ -149,9 +153,8 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
     assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ': Pour it onto a plate'
-    assert tokenizer.decode(
-        batch['input_ids'][0][0:min_idx]
-    ) == "how do you open a capri-sun: open the straw attached to the juice, and then stick it in the small hole at the front of the pouch.\nWhen boiling butter, when it's ready, you can"
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
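
Note (illustrative, not part of either patch): the behavior both commits account for
is that GPT-2's tokenizer encodes an empty string to a 0-length list, while the OPT
tokenizer prepends its BOS token '</s>' (id 2) to every encoding, including an empty
preamble. Tokenizing only the preamble with special tokens enabled, and the
context/query/continuation with add_special_tokens=False, leaves exactly one '</s>'
at the start of the assembled sequence, which is what the startswith('</s>') and
count('</s>') == 1 assertions verify. A minimal sketch, using the same checkpoints
as the tests:

    from transformers import AutoTokenizer

    opt = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
    gpt2 = AutoTokenizer.from_pretrained('gpt2')

    # GPT-2 encodes '' to a 0-length list; OPT prepends BOS ('</s>', id 2)
    # even to the empty string.
    print(gpt2('')['input_ids'])  # []
    print(opt('')['input_ids'])   # [2]

    # Assembling preamble + context + continuation the way prep_examples does:
    # only the preamble keeps special tokens, so exactly one BOS survives.
    preamble = opt('')['input_ids']
    context = opt('When it rains it', add_special_tokens=False)['input_ids']
    continuation = opt(' pours', add_special_tokens=False)['input_ids']
    input_ids = preamble + context + continuation
    assert opt.decode(input_ids).startswith('</s>')
    assert opt.decode(input_ids).count('</s>') == 1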