
Commit 60e24b0

Update transformers to 4.51 (#1790)
1 parent 0e5e5a0 commit 60e24b0


52 files changed: +579 −366 lines (only a subset of the changed files is shown below)

llmfoundry/callbacks/curriculum_learning_callback.py

Lines changed: 1 addition & 1 deletion
@@ -237,7 +237,7 @@ def _build_train_loader(
         try:
             return build_dataloader(
                 train_loader_config,
-                self._tokenizer,
+                self._tokenizer,  # type: ignore
                 self._device_train_batch_size,
             )
         except BaseContextualError as e:

llmfoundry/callbacks/hf_checkpointer.py

Lines changed: 48 additions & 22 deletions
@@ -13,7 +13,7 @@
 import warnings
 from multiprocessing.context import SpawnProcess
 from pathlib import Path
-from typing import Any, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Union
 
 import numpy as np
 import torch

@@ -48,6 +48,9 @@
 from llmfoundry.utils.huggingface_hub_utils import \
     edit_files_for_hf_compatibility
 
+if TYPE_CHECKING:
+    from peft import PeftModel
+
 try:
     import transformer_engine.pytorch as te
     is_te_imported = True

@@ -487,9 +490,9 @@ def _any_register_processes_error(self, device: Device) -> bool:
 
     def transform_model_and_tokenizer(
         self,
-        model: PreTrainedModel,
+        model: Union[PreTrainedModel, 'PeftModel'],
         tokenizer: PreTrainedTokenizerBase,
-    ) -> tuple[PreTrainedModel, PreTrainedTokenizerBase]:
+    ) -> tuple[Union[PreTrainedModel, 'PeftModel'], PreTrainedTokenizerBase]:
         """Transform the model and tokenizer before saving.
 
         This allows a subclass to modify the model and tokenizer before saving. The base class implementation will

@@ -537,8 +540,8 @@ def pre_register_edit(self, local_save_path: str):
 
     def transform_model_pre_registration(
         self,
-        model: PreTrainedModel,
-    ) -> PreTrainedModel:
+        model: Union[PreTrainedModel, 'PeftModel'],
+    ) -> Union[PreTrainedModel, 'PeftModel']:
         """Transform the model before registering with MLflow.
 
         This allows a subclass to modify the model before registering with MLflow. The base class implementation will

@@ -565,17 +568,23 @@ def _get_hf_model(self, state: State):
         log.debug('Gathering state dict')
 
         if state.is_model_ddp:
-            original_model: PreTrainedModel = state.model.module.model  # type: ignore
+            original_model: Union[
+                PreTrainedModel,
+                'PeftModel'] = state.model.module.model  # type: ignore
             state_dict_model = state.model.module.model  # type: ignore
-            original_tokenizer = state.model.module.tokenizer  # type: ignore
+            original_tokenizer: PreTrainedTokenizerBase = state.model.module.tokenizer  # type: ignore
         elif isinstance(state.model.model, FSDP):
-            original_model: PreTrainedModel = state.model.model.module  # type: ignore
+            original_model: Union[
+                PreTrainedModel,
+                'PeftModel'] = state.model.model.module  # type: ignore
             state_dict_model = state.model.model  # type: ignore
-            original_tokenizer = state.model.tokenizer  # type: ignore
+            original_tokenizer: PreTrainedTokenizerBase = state.model.tokenizer  # type: ignore
         else:
-            original_model: PreTrainedModel = state.model.model  # type: ignore
+            original_model: Union[
+                PreTrainedModel,
+                'PeftModel'] = state.model.model  # type: ignore
             state_dict_model = state.model.model  # type: ignore
-            original_tokenizer = state.model.tokenizer  # type: ignore
+            original_tokenizer: PreTrainedTokenizerBase = state.model.tokenizer  # type: ignore
 
         cpu_offload = True
 

@@ -631,7 +640,7 @@ def tensor_hook(
 
         # Transform HF config before building 2nd model copy
         new_config = self.transform_config(
-            original_config=original_model.config,
+            original_config=original_model.config,  # type: ignore
         )
 
         log.debug(f'Creating new model instance')

@@ -640,25 +649,33 @@ def tensor_hook(
         # initialization cost.
         with init_empty_weights():
             if self.using_peft:
-                active_adapter = original_model.active_adapter
-                base_model = original_model.get_base_model()
+                from peft import PeftModel
+                assert isinstance(original_model, PeftModel)
+                active_adapter = original_model.active_adapter  # type: ignore
+                base_model: PreTrainedModel = original_model.get_base_model(  # type: ignore
+                )
                 new_base_model_instance = type(base_model)(new_config)
 
                 new_model_instance = type(original_model)(
-                    new_base_model_instance,
-                    original_model.peft_config[active_adapter],
+                    new_base_model_instance,  # type: ignore
+                    original_model.
+                    peft_config[active_adapter],  # type: ignore
                 )
                 del new_base_model_instance
             else:
+                assert isinstance(original_model, PreTrainedModel)
                 new_model_instance = type(original_model)(new_config)
                 if new_model_instance.generation_config is not None:
+                    assert original_model.generation_config is not None
                     new_model_instance.generation_config.update(
                         **original_model.generation_config.to_dict(),
                     )
 
         # Then load the state dict in with "assign" so that the state dict
         # is loaded properly even though the model is initially on meta device.
-        new_model_instance.load_state_dict(state_dict, assign=True)
+        new_model_instance.load_state_dict(  # type: ignore
+            state_dict, assign=True,
+        )
         del state_dict
 
         # Transform the model and tokenizer before saving

@@ -671,11 +688,14 @@ def tensor_hook(
         if self.pretrained_model_name is not None:
             new_model_instance.name_or_path = self.pretrained_model_name
             if self.using_peft:
+                from peft import PeftModel
+                assert isinstance(new_model_instance, PeftModel)
                 new_model_instance.base_model.name_or_path = self.pretrained_model_name
-                for k in new_model_instance.peft_config.keys():
-                    new_model_instance.peft_config[
+                for k in new_model_instance.peft_config.keys(  # type: ignore
+                ):
+                    new_model_instance.peft_config[  # type: ignore
                         k
-                    ].base_model_name_or_path = self.pretrained_model_name
+                    ].base_model_name_or_path = self.pretrained_model_name  # type: ignore
 
         log.debug('Saving Hugging Face checkpoint to disk')
 

@@ -686,7 +706,7 @@ def _register_hf_model(
         temp_save_dir: str,
         original_tokenizer: PreTrainedTokenizerBase,
         use_temp_dir: bool,
-        new_model_instance: PreTrainedModel,
+        new_model_instance: Union[PreTrainedModel, 'PeftModel'],
     ):
         assert new_model_instance is not None
         new_model_instance = self.transform_model_pre_registration(

@@ -802,7 +822,7 @@ def _save_checkpoint(
         )
 
         # Only need to edit files for MPT because it has custom code
-        if new_model_instance.config.model_type == 'mpt':
+        if new_model_instance.config.model_type == 'mpt':  # type: ignore
             log.debug('Editing MPT files for HuggingFace compatibility')
             edit_files_for_hf_compatibility(
                 temp_save_dir,

@@ -837,6 +857,12 @@ def _save_checkpoint(
                     None,
                 )
                 if model_name is not None:
+                    from peft import PeftModel
+                    assert isinstance(new_model_instance, PeftModel)
+                    assert isinstance(
+                        new_model_instance.model,
+                        PreTrainedModel,
+                    )
                     new_model_instance.name_or_path = model_name
                     new_model_instance.model.name_or_path = model_name
                     new_model_instance.base_model.name_or_path = model_name
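
Note: the recurring pattern in this file is to keep peft out of the module's import-time dependencies. PeftModel is imported only under TYPE_CHECKING so it can appear in annotations as a string, and re-imported locally wherever a runtime isinstance check is actually needed. A minimal sketch of that idea, with a hypothetical save_model helper standing in for the checkpointer methods (not part of this diff):

from typing import TYPE_CHECKING, Union

from transformers import PreTrainedModel

if TYPE_CHECKING:
    # Only evaluated by type checkers; peft is not imported at runtime here.
    from peft import PeftModel


def save_model(model: Union[PreTrainedModel, 'PeftModel']) -> None:
    # Hypothetical helper illustrating the pattern used by HuggingFaceCheckpointer.
    try:
        from peft import PeftModel  # runtime import only where it is needed
        is_peft = isinstance(model, PeftModel)
    except ImportError:
        is_peft = False

    if is_peft:
        # PEFT wrappers expose the underlying transformer via get_base_model().
        base = model.get_base_model()
    else:
        base = model
    base.save_pretrained('/tmp/checkpoint')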

llmfoundry/command_utils/data_prep/convert_dataset_hf.py

Lines changed: 5 additions & 4 deletions
@@ -169,7 +169,7 @@ def build_hf_dataset(
     bos_text: str = '',
     eos_text: str = '',
     no_wrap: bool = False,
-    tokenizer: PreTrainedTokenizerBase = None,
+    tokenizer: Optional[PreTrainedTokenizerBase] = None,
     data_subset: Union[str, None] = None,
 ) -> IterableDataset:
     """Build an IterableDataset over the HF C4 or pile source data.

@@ -206,9 +206,10 @@
         raise ValueError(f'max_length must be set.')
     if bos_text + eos_text == '':
         test_tokens = tokenizer('test')
-        if test_tokens['input_ids'][
-                0] != tokenizer.bos_token_id and test_tokens['input_ids'][
-                    -1] != tokenizer.eos_token_id:
+        if test_tokens['input_ids'][  # type: ignore
+                0] != tokenizer.bos_token_id and test_tokens[
+                    'input_ids'][  # type: ignore
+                        -1] != tokenizer.eos_token_id:
             tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. '
             tok_error_msg += 'Concatenating with this tokenizer will result in sequences being '
             tok_error_msg += 'attached without a separating token. Please use another tokenizer, '

llmfoundry/command_utils/data_prep/convert_dataset_json.py

Lines changed: 6 additions & 4 deletions
@@ -29,7 +29,7 @@ def build_hf_dataset(
     bos_text: str = '',
     eos_text: str = '',
     no_wrap: bool = False,
-    tokenizer: PreTrainedTokenizerBase = None,
+    tokenizer: Optional[PreTrainedTokenizerBase] = None,
 ) -> IterableDataset:
     """Build an IterableDataset over the HF C4 or pile source data.
 

@@ -70,9 +70,10 @@
         raise ValueError(f'max_length must be set.')
     if bos_text + eos_text == '':
         test_tokens = tokenizer('test')
-        if test_tokens['input_ids'][
-                0] != tokenizer.bos_token_id and test_tokens['input_ids'][
-                    -1] != tokenizer.eos_token_id:
+        if test_tokens['input_ids'][  # type: ignore
+                0] != tokenizer.bos_token_id and test_tokens[
+                    'input_ids'][  # type: ignore
+                        -1] != tokenizer.eos_token_id:
             tok_error_msg = 'This tokenizer does not insert an EOS nor BOS token. '
             tok_error_msg += 'Concatenating with this tokenizer will result in sequences being '
             tok_error_msg += 'attached without a separating token. Please use another tokenizer, '

@@ -118,6 +119,7 @@ def convert_dataset_json(
     """
     if concat_tokens is not None:
         mode = ConcatMode.CONCAT_TOKENS
+        assert tokenizer is not None
        built_tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        # we will enforce length, so suppress warnings about sequences too long for the model
        built_tokenizer.model_max_length = int(1e30)
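
Note: the added # type: ignore comments in both conversion scripts exist because the tokenizer call returns a BatchEncoding whose item type is declared loosely in newer transformers releases, so indexing test_tokens['input_ids'][0] no longer type-checks cleanly. A hedged, standalone sketch of the same BOS/EOS probe using an explicit narrowing assert instead (the gpt2 tokenizer is just an example choice):

from transformers import AutoTokenizer

# Any HF tokenizer works here; gpt2 is only an example.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

test_tokens = tokenizer('test')
input_ids = test_tokens['input_ids']
assert isinstance(input_ids, list)  # narrow the BatchEncoding item for the type checker

inserts_bos = input_ids[0] == tokenizer.bos_token_id
inserts_eos = input_ids[-1] == tokenizer.eos_token_id
if not inserts_bos and not inserts_eos:
    print('This tokenizer inserts neither a BOS nor an EOS token.')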

llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py

Lines changed: 6 additions & 5 deletions
@@ -167,8 +167,9 @@ def convert_finetuning_dataset(
 
     tokenizer_kwargs = tokenizer_kwargs
     tokenizer_kwargs.update({'model_max_length': max_seq_len})
+    built_tokenizer = None
     if tokenizer:
-        tokenizer = build_tokenizer(tokenizer, tokenizer_kwargs)
+        built_tokenizer = build_tokenizer(tokenizer, tokenizer_kwargs)
 
     for i, split_name in enumerate(splits):
         data_file = None

@@ -184,7 +185,7 @@
         # Determine the output columns
         columns, example_type = get_columns_and_format(
             dataset=loaded_dataset,
-            tokenizing=tokenizer is not None,
+            tokenizing=built_tokenizer is not None,
             preprocessing_fn=preprocessing_fn,
         )
         # Prepare the iterables

@@ -226,10 +227,10 @@
                         'Encountered an error when checking example for proper formatting. ' +\
                         f'example={formatted_sample}',
                     ) from e
-                if tokenizer is not None:
+                if built_tokenizer is not None:
                     sample = tokenize_formatted_example(
                         formatted_sample,
-                        tokenizer=tokenizer,
+                        tokenizer=built_tokenizer,
                     )
                     if not is_valid_ift_example(
                         max_seq_len,

@@ -259,7 +260,7 @@
                 else:
                     out.write(formatted_sample)
 
-        if tokenizer is not None and examples_removed > 0:
+        if built_tokenizer is not None and examples_removed > 0:
             warnings.warn(
                 f'Dropped {examples_removed} examples where the prompt was longer than {max_seq_len}, '
                 +
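
Note: the rename from tokenizer to built_tokenizer avoids reassigning a parameter that holds the tokenizer name (a string) with a PreTrainedTokenizerBase instance, which stricter type checking flags. A small sketch of the pattern, assuming a hypothetical process function (not from this diff):

from typing import Optional

from transformers import AutoTokenizer, PreTrainedTokenizerBase


def process(tokenizer: Optional[str] = None) -> None:
    # Keep the string argument and the built object in separate names so each
    # variable keeps a single, consistent type.
    built_tokenizer: Optional[PreTrainedTokenizerBase] = None
    if tokenizer:
        built_tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    if built_tokenizer is not None:
        print(f'Tokenizing with {built_tokenizer.name_or_path}')
    else:
        print('No tokenizer configured; writing raw text.')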

llmfoundry/command_utils/data_prep/convert_text_to_mds.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 from concurrent.futures import ProcessPoolExecutor
 from functools import partial
 from glob import glob
-from typing import Iterable, Optional, cast
+from typing import Any, Iterable, Optional, cast
 
 import numpy as np
 from composer.utils import (

@@ -81,7 +81,7 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
                 truncation=False,
                 padding=False,
             )
-            iids = encoded['input_ids']
+            iids = cast(Any, encoded['input_ids'])
 
             # If this is not the first chunk, remove the BOS token
             if not first_chunk:
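
Note: cast(Any, ...) here only silences the stricter BatchEncoding item type so the downstream list slicing keeps type-checking; it has no runtime effect. A minimal illustration (tokenizer choice is arbitrary):

from typing import Any, cast

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
encoded = tokenizer('some text', truncation=False, padding=False)

# cast() is purely for the type checker; at runtime iids is the same list object.
iids = cast(Any, encoded['input_ids'])
print(iids[:5])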

llmfoundry/command_utils/eval.py

Lines changed: 3 additions & 3 deletions
@@ -80,13 +80,13 @@ def evaluate_model(
     tokenizer_cfg = tokenizer
     tokenizer_name = tokenizer_cfg['name']
     tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
-    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+    built_tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
 
     evaluators, logger_keys, eval_gauntlet_callback = build_evaluators(
         eval_loader_config,
         icl_tasks,
         eval_gauntlet_config,
-        tokenizer=tokenizer,
+        tokenizer=built_tokenizer,
         device_eval_batch_size=device_eval_batch_size,
         icl_seq_len=max_seq_len,
         icl_subset_num_batches=icl_subset_num_batches,

@@ -124,7 +124,7 @@
     name = model.pop('name')
     composer_model = build_composer_model(
         name=name,
-        tokenizer=tokenizer,
+        tokenizer=built_tokenizer,
         init_context=init_context,
         cfg=model,
     )

llmfoundry/data/contrastive_pairs/dataloader.py

Lines changed: 2 additions & 2 deletions
@@ -175,8 +175,8 @@ def _tokenize(
         text_samples_negatives = text_samples['negative']
         assert isinstance(text_samples_negatives, list)  # pyright type check
         text_samples_list.extend(text_samples_negatives)
-        return self.tokenizer(
-            text_samples_list,
+        return self.tokenizer(  # type: ignore
+            text_samples_list,  # type: ignore
             truncation=True,
             padding='max_length',
             max_length=self.max_seq_len,

llmfoundry/data/data.py

Lines changed: 12 additions & 4 deletions
@@ -72,24 +72,29 @@ def __init__(
         self.eos_text = eos_text
         self.should_wrap = not no_wrap
 
-        self.bos_tokens = self.tokenizer(
+        bos_ids = self.tokenizer(
             self.bos_text,
             truncation=False,
             padding=False,
             add_special_tokens=False,
         )['input_ids']
+        assert isinstance(bos_ids, list)
+
+        self.bos_tokens: list[int] = bos_ids
         if len(self.bos_tokens) > 1:
             warnings.warn(
                 f'You specified --concat_tokens with --bos_text, but your BOS text is not tokenizing to one token\
                 , instead we got {self.bos_tokens}. Quit if this was in error.',
             )
 
-        self.eos_tokens = self.tokenizer(
+        eos_ids = self.tokenizer(
             self.eos_text,
             truncation=False,
             padding=False,
             add_special_tokens=False,
         )['input_ids']
+        assert isinstance(eos_ids, list)
+        self.eos_tokens: list[int] = eos_ids
         if len(self.eos_tokens) > 1:
             warnings.warn(
                 f'You specified --concat_tokens with --eos_text, but your EOS text is not tokenizing to one token\

@@ -99,8 +104,10 @@ def __init__(
         eos_text_provided = self.eos_text != ''
         bos_text_provided = self.bos_text != ''
         test_text = self.tokenizer('')
+        test_text_iids = test_text['input_ids']
+        assert isinstance(test_text_iids, list)
         if len(
-            test_text['input_ids'],
+            test_text_iids,
         ) > 0 and (eos_text_provided or bos_text_provided):
             message = 'both eos and bos' if eos_text_provided and bos_text_provided else (
                 'eos_text' if eos_text_provided else 'bos_text'

@@ -155,11 +162,12 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
         buffer = []
         for sample in self.hf_dataset:
             encoded = self.tokenizer(
-                sample['text'],
+                sample['text'],  # type: ignore
                 truncation=False,
                 padding=False,
             )
             iids = encoded['input_ids']
+            assert isinstance(iids, list)
             buffer = buffer + self.bos_tokens + iids + self.eos_tokens
             while len(buffer) >= self.max_length:
                 concat_sample = buffer[:self.max_length]
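
Note: the added assert isinstance(..., list) lines narrow the tokenizer output for the type checker before it is concatenated into the token buffer. A compressed, self-contained sketch of the concatenation loop under that assumption (synthetic text samples, max_length chosen arbitrarily, gpt2 as an example tokenizer):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')
max_length = 8
bos_tokens: list[int] = []
eos_tokens = [tokenizer.eos_token_id]

buffer: list[int] = []
for text in ['hello world', 'another sample']:
    encoded = tokenizer(text, truncation=False, padding=False)
    iids = encoded['input_ids']
    assert isinstance(iids, list)  # narrow BatchEncoding item to a list of token ids
    buffer = buffer + bos_tokens + iids + eos_tokens
    while len(buffer) >= max_length:
        # Emit one fixed-length concatenated sample and keep the remainder.
        concat_sample, buffer = buffer[:max_length], buffer[max_length:]
        print(concat_sample)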
