Skip to content

Commit 20240ce

Browse files
committed
Fix tokenizer to work with 0-shot
1 parent c29cfef commit 20240ce

File tree

2 files changed

+8
-8
lines changed

2 files changed

+8
-8
lines changed

composer/datasets/in_context_learning_evaluation.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
155155

156156
encoded_example['preamble'] = self.tokenizer(
157157
preamble) # if the preamble is empty then these will be 0-length lists
158-
encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=len(preamble) == 0)
158+
encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False)
159159
encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)
160160

161161
examples.append(encoded_example)
@@ -298,12 +298,11 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
298298
'choices'], self.samples[sample_idx]['gold'],
299299
if len(preamble) > 0:
300300
query = f'{example_delimiter}{query}'
301-
302301
choices = [f'{continuation_delimiter}{choice}' for choice in choices]
303302
encoded_example['preamble'] = self.tokenizer(
304303
preamble) # if the preamble is empty then these will be 0-length lists
305304
encoded_example['gold_idx'] = gold_idx
306-
encoded_example['query'] = self.tokenizer(query, add_special_tokens=len(preamble) == 0)
305+
encoded_example['query'] = self.tokenizer(query, add_special_tokens=False)
307306
encoded_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices]
308307

309308
examples.append(encoded_example)

tests/datasets/test_in_context_learning_datasets.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
5252
batch_size,
5353
max_seq_len=seqlen,
5454
pad_tok_id=tokenizer.eos_token_id,
55-
num_fewshot=1,
55+
num_fewshot=0,
5656
prompt_string='',
5757
example_delimiter='\n',
5858
continuation_delimiter='')
@@ -106,6 +106,9 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri):
106106
min_idx = min(batch['continuation_indices'][0]).item()
107107
max_idx = max(batch['continuation_indices'][0]).item()
108108
assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
109+
assert tokenizer.decode(
110+
batch['input_ids'][0][0:min_idx]
111+
) == "</s>My eyes fly open, and I feel they're engulfed in the Ult L-E as I recite a poem as if someone else is controlling me,\n“Though the clouds darken the sun,\nand the rain becomes tainted,\nalways know there will be\na love that will not die.\nThough hope seems a distant memory,\nand human machines walk the land,\nknow no one can destroy\na love that will not die.”\n\n“What are you babbling about?” the Rogue asks.\nI surface from my unconscious state, and I sit up, stand, walk to the PPK, pick up the gun, and aim it for the Rogue\nWith Tristran's next step he was standing beside a lake, and the candlelight shone brightly on the water; and then he was walking through the mountains, through lonely crags, where the candlelight was reflected in the eyes of the creatures of the high snows; and then he was walking through the clouds, which, while not entirely substantial, still supported his weight in comfort; and then, holding tightly to his candle, he was underground, and the candlelight glinted back at him from the wet cave walls; now he was in the mountains once more; and then he was on a road through wild forest, and he glimpsed a chariot being pulled by two goats, being driven by a woman in a red dress who looked, for the glimpse he got of her, the way Boadicea was drawn in his history books; and another step and he was in a leafy glen, and he could hear the chuckle of water as it splashed and sang its way into a small brook.\n\nHe took another step, but he was still in the"
109112

110113

111114
@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
@@ -123,7 +126,7 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
123126
batch_size,
124127
max_seq_len=seqlen,
125128
pad_tok_id=tokenizer.eos_token_id,
126-
num_fewshot=1,
129+
num_fewshot=0,
127130
prompt_string='',
128131
example_delimiter='\n',
129132
continuation_delimiter=': ')
@@ -149,9 +152,7 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri):
149152
min_idx = min(batch['continuation_indices'][0]).item()
150153
max_idx = max(batch['continuation_indices'][0]).item()
151154
assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ': Pour it onto a plate'
152-
assert tokenizer.decode(
153-
batch['input_ids'][0][0:min_idx]
154-
) == "</s>how do you open a capri-sun: open the straw attached to the juice, and then stick it in the small hole at the front of the pouch.\nWhen boiling butter, when it's ready, you can"
155+
assert tokenizer.decode(batch['input_ids'][0][0:min_idx]) == "</s>When boiling butter, when it's ready, you can"
155156

156157

157158
@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])

0 commit comments

Comments (0)