
Commit 070f1d7

Upgrade transformers to 4.43.2 and support llama3.1 (#3157)
1 parent 34fab8e commit 070f1d7

File tree

34 files changed, with 471 additions and 415 deletions

csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp

Lines changed: 2 additions & 11 deletions
@@ -1313,15 +1313,6 @@ first_token_masked_mha(
   auto key_lenght = key.size(1);
   auto kv_head_num = key.size(2);
   auto head_size = key.size(3);
-  if (add_casual_mask) {
-    auto casual_mask = at::full(
-        {query_length, key_lenght},
-        origin_type == at::kHalf ? -6e4 : -1e6,
-        query.options());
-    casual_mask = at::triu(casual_mask, 1);
-    casual_mask = casual_mask.unsqueeze(0).unsqueeze(0);
-    attention_mask = attention_mask + casual_mask;
-  }
   if (key.scalar_type() != at::kBFloat16 && key.scalar_type() != at::kFloat &&
       key.scalar_type() != at::kHalf) {
     TORCH_CHECK(
@@ -1358,7 +1349,7 @@ first_token_masked_mha(
         key,
         value,
         /* dropout */ 0.0,
-        /* is_causal*/ false,
+        add_casual_mask,
         attention_mask,
         1. / scale_attn));
   } else {
@@ -1572,7 +1563,7 @@ at::Tensor prepare_4d_causal_attention_mask_kernel_impl(
   int64_t src_length = attention_mask.size(-1);
   int64_t past_key_value_length = past_kv_len.item<int64_t>();
   int64_t length = seq_length + past_key_value_length;
-  int64_t diagonal = past_key_value_length - sliding_window + 1;
+  int64_t diagonal = past_key_value_length - sliding_window;

   at::Tensor causal_4d_mask = torch::empty(
       {batch_size, 1, seq_length, length}, inputs_embeds.options());
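
In short, the kernel no longer builds the additive causal mask itself: the `add_casual_mask` flag is forwarded to the fused attention call as its is-causal argument, and the sliding-window `diagonal` offset drops the `+ 1` to line up with the mask layout expected by transformers 4.43. A minimal PyTorch sketch (illustrative only, not part of this commit) showing why an explicit triu mask and the fused causal flag are interchangeable:

# Hedged sketch: the removed C++ block added a triu(-1e6) mask to the
# attention scores by hand; the fused SDPA path applies the same masking
# when its causal flag is set.
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64)  # (batch, heads, seq_len, head_size)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Explicit additive mask, mirroring the deleted at::triu logic.
explicit_mask = torch.triu(torch.full((16, 16), -1e6), diagonal=1)
out_explicit = F.scaled_dot_product_attention(q, k, v, attn_mask=explicit_mask)

# Fused causal path, mirroring the new is-causal argument.
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(torch.allclose(out_explicit, out_causal, atol=1e-5))  # True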

dependency_version.yml

Lines changed: 1 addition & 1 deletion
@@ -38,4 +38,4 @@ torchaudio:
 torchvision:
   version: 0.19.0+cpu
 transformers:
-  version: 4.38.1
+  version: 4.43.2
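
For reference, a small sketch of how this pin can be checked against the installed package; it assumes PyYAML is available and that `dependency_version.yml` sits in the current directory (neither assumption comes from this commit):

# Hedged sketch: compare the installed transformers version with the
# "transformers: version:" entry in dependency_version.yml.
import yaml
from importlib.metadata import version

with open("dependency_version.yml") as f:
    deps = yaml.safe_load(f)

pinned = deps["transformers"]["version"]  # "4.43.2" after this commit
installed = version("transformers")
if installed != pinned:
    print(f"Warning: transformers {installed} installed, {pinned} expected")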

docs/tutorials/features/fast_bert.md

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc
 
 ### Prerequisite
 
-- Transformers 4.6.0 ~ 4.38.1
+- Transformers 4.6.0 ~ 4.43.2
 
 ### Usage Example

examples/cpu/features/fast_bert/README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 Currently `ipex.fast_bert` API is only well optimized for training. For inference, it ensures functionality, while to get peak perf, please use `ipex.optimize` API + torchscript.
 
 # Prerequisite:
-Transformers 4.6.0 ~ 4.38.1
+Transformers 4.6.0 ~ 4.43.2
 
 # Usage Example:
 Training:
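
As a reminder of what the usage example covers, here is a rough training sketch of the `ipex.fast_bert` path; the keyword arguments (`dtype`, `optimizer`) and the `bert-base-uncased` checkpoint are assumptions drawn from the public IPEX examples, not from this diff:

# Rough sketch, assuming ipex.fast_bert accepts dtype= and optimizer=
# as in the public IPEX examples; verify against the installed version.
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")  # any BERT with transformers 4.6.0 ~ 4.43.2
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# fast_bert swaps the BERT encoder for IPEX's optimized implementation.
model, optimizer = ipex.fast_bert(model, dtype=torch.bfloat16, optimizer=optimizer)

input_ids = torch.randint(0, model.config.vocab_size, (4, 128))
with torch.cpu.amp.autocast(dtype=torch.bfloat16):
    outputs = model(input_ids)
loss = outputs.last_hidden_state.mean()  # placeholder loss for illustration
loss.backward()
optimizer.step()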

examples/cpu/llm/fine-tuning/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -6,6 +6,6 @@ black[jupyter]
 datasets
 fire
 peft
-transformers==4.38.1
+transformers==4.43.2
 gradio
 sentencepiece

examples/cpu/llm/inference/README.md

Lines changed: 0 additions & 2 deletions
@@ -420,8 +420,6 @@ There are some model-specific requirements to be aware of, as follows:
 
 - For Llava models from remote hub, additional setup is required, i.e., `bash ./tools/prepare_llava.sh`.
 
-- For mistralai/Mistral-7B-v0.1 and mistralai/Mixtral-8x7B-Instruct-v0.1, we use a fixed model version because the latest version is not compatible with transformers 4.38.1 and tokenizers 0.15.2.
-
 ## 2.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series
 
 Intel® Xeon® CPU Max Series are equipped with high bandwidth memory (HBM), which further accelerates LLM inference. For the common case that HBM and DDR are both installed in a Xeon® CPU Max Series server, the memory mode can be configured to Flat Mode or Cache Mode. Details about memory modes can be found at Section 3.1 in [the Xeon® CPU Max Series Configuration Guide](https://cdrdv2-public.intel.com/769060/354227-intel-xeon-cpu-max-series-configuration-and-tuning-guide.pdf).

examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py

Lines changed: 1 addition & 10 deletions
@@ -91,11 +91,6 @@ def decorator(func):
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", nargs="?", default="EleutherAI/gpt-j-6b")
 parser.add_argument("--output_dir", nargs="?", default="./saved_results")
@@ -279,9 +274,7 @@ def __init__(
         model_class = MODEL_CLASSES[model_type]
 
         self.tokenizer = model_class[1].from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
+            model_id, trust_remote_code=True
         )
         if model_type == "chatglm":
             # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided
@@ -296,7 +289,6 @@ def __init__(
             model_id if config is None else config,
             torchscript=with_jit,
             trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
         )
         if re.search("gptbigcode", self.config.architectures[0], re.IGNORECASE):
             model_type = "gptbigcode"
@@ -335,7 +327,6 @@ def __init__(
             config=self.config,
             torch_dtype=load_dtype,
             trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
         )
 
         self.model = self.model.eval()
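
With transformers 4.43.2 the previously pinned Mistral/Mixtral revisions load cleanly, so the script drops `pin_model_revision` and falls back to plain `from_pretrained` calls; the same simplification appears in the two generation scripts below. A standalone sketch of the unpinned loading path (the model id and dtype are illustrative, not taken from this commit):

# Hedged sketch: with transformers 4.43.2 the latest Mistral/Mixtral
# checkpoints load directly, so no revision= pin is required.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_id, torchscript=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model.eval()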

examples/cpu/llm/inference/distributed/run_generation_tp.py

Lines changed: 2 additions & 15 deletions
@@ -50,11 +50,6 @@
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 try:
     from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
     from llava.model.builder import load_pretrained_model
@@ -232,10 +227,7 @@
     model_class = MODEL_CLASSES[model_type]
     if args.config_file is None:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=args.deployment_mode,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=args.deployment_mode, trust_remote_code=True
         )
     else:
         config = AutoConfig.from_pretrained(
@@ -258,13 +250,8 @@
         config=config,
         low_cpu_mem_usage=True,
         trust_remote_code=True,
-        revision=pin_model_revision.get(args.model_id, None),
-    )
-    tokenizer = model_class[1].from_pretrained(
-        args.model_id,
-        trust_remote_code=True,
-        revision=pin_model_revision.get(args.model_id, None),
     )
+    tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True)
 else:
     tokenizer, model, image_processor, context_len = load_pretrained_model(
         args.model_id

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 3 additions & 18 deletions
@@ -61,11 +61,6 @@
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 try:
     from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
     from llava.model.builder import load_pretrained_model
@@ -312,19 +307,12 @@ def get_checkpoint_files(model_name_or_path):
 print_rank0(f"*** Loading the model {model_name}")
 model_type = next((x for x in MODEL_CLASSES.keys() if x in model_name.lower()), "auto")
 model_class = MODEL_CLASSES[model_type]
-tokenizer = model_class[1].from_pretrained(
-    model_name,
-    trust_remote_code=True,
-    revision=pin_model_revision.get(model_name, None),
-)
+tokenizer = model_class[1].from_pretrained(model_name, trust_remote_code=True)
 
 if model_type == "auto":
     if args.config_file is None:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=True,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=True, trust_remote_code=True
         )
     else:
         config = AutoConfig.from_pretrained(
@@ -353,10 +341,7 @@ def get_checkpoint_files(model_name_or_path):
         )
     else:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=True,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=True, trust_remote_code=True
         )
 else:
     config = AutoConfig.from_pretrained(

examples/cpu/llm/inference/run.py

Lines changed: 0 additions & 109 deletions
@@ -353,115 +353,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
             print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
             quit()
         print("LLM RUNTIME INFO: Finished successfully.")
-    elif re.search("t5", str(args.model_name_or_path), re.IGNORECASE):
-        qpath = Path(parent_path, "single_instance/run_quantization.py")
-        infer_cmd = ["python", qpath]
-        infer_cmd.extend(["-m", str(args.model_name_or_path)])
-        infer_cmd.extend(["--input-tokens", str(args.input_tokens)])
-        infer_cmd.extend(["--max-new-tokens", str(args.max_new_tokens)])
-        infer_cmd.extend(["--num-iter", str(args.num_iter)])
-        infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
-        infer_cmd.extend(["--batch-size", str(args.batch_size)])
-        infer_cmd.extend(["--output-dir", str(args.output_dir)])
-        if args.ipex_weight_only_quantization:
-            infer_cmd.extend(["--ipex-weight-only-quantization"])
-            infer_cmd.extend(["--weight-dtype", str(args.weight_dtype)])
-            infer_cmd.extend(["--lowp-mode", str(args.lowp_mode)])
-            infer_cmd.extend(["--act-quant-mode", str(args.act_quant_mode)])
-            if args.gptq:
-                print(
-                    "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified"
-                    " and `--weight-dtype` is ignored."
-                )
-                if args.low_precision_checkpoint == "":
-                    gptq_cmd = [
-                        "python",
-                        Path(parent_path, "utils/run_gptq.py"),
-                    ]
-                    gptq_cmd.extend(["--model", str(args.model_name_or_path)])
-                    gptq_cmd.extend(["--output-dir", str(args.output_dir)])
-                    print(
-                        "LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format(
-                            group_size
-                        )
-                    )
-                    result = subprocess.run(gptq_cmd)
-                    if result.returncode != 0:
-                        print(
-                            "LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit."
-                        )
-                        quit()
-                    print("LLM RUNTIME INFO: Running GPTQ calibration finished.")
-                    infer_cmd.extend(
-                        [
-                            "--low-precision-checkpoint",
-                            str(args.output_dir) + "/gptq_checkpoint.pt",
-                        ]
-                    )
-                else:
-                    infer_cmd.extend(
-                        [
-                            "--low-precision-checkpoint",
-                            str(args.low_precision_checkpoint),
-                        ]
-                    )
-            else:
-                # No need to set group size if args.gptq is true
-                # Group size is read from the checkpoint
-                infer_cmd.extend(["--group-size", str(group_size)])
-        else:
-            infer_cmd.extend(["--ipex-smooth-quant"])
-            infer_cmd.extend(["--calib-len", str(args.calib_len)])
-            infer_cmd.extend(["--calib-iters", str(args.calib_iters)])
-            if args.calib_shuffle:
-                infer_cmd.extend(["--calib-shuffle"])
-            if args.calib_padding:
-                infer_cmd.extend(["--calib-padding"])
-                infer_cmd.extend(["--calib-pad-val", str(args.calib_pad_val)])
-            if args.fallback_add:
-                infer_cmd.extend(["--fallback-add"])
-            infer_cmd.extend(["--alpha", str(args.alpha)])
-            if args.folding:
-                infer_cmd.extend(["--folding"])
-            infer_cmd.extend(["--init-alpha", str(args.init_alpha)])
-            infer_cmd.extend(["--alpha-min", str(args.alpha_min)])
-            infer_cmd.extend(["--alpha-max", str(args.alpha_max)])
-            infer_cmd.extend(["--alpha-step", str(args.alpha_step)])
-            infer_cmd.extend(["--shared-criterion", str(args.shared_criterion)])
-            if args.enable_blockwise_loss:
-                infer_cmd.extend(["--enable-blockwise-loss"])
-            infer_cmd.extend(["--dataset", str(args.dataset)])
-        if args.quant_with_amp:
-            infer_cmd.extend(["--quant-with-amp"])
-        if args.greedy:
-            infer_cmd.extend(["--greedy"])
-        if args.streaming:
-            infer_cmd.extend(["--streaming"])
-        if args.profile:
-            infer_cmd.extend(["--profile"])
-        if args.benchmark:
-            infer_cmd.extend(["--benchmark"])
-        if args.token_latency:
-            infer_cmd.extend(["--token-latency"])
-        if args.lm_head_generation:
-            infer_cmd.extend(["--lm-head-generation"])
-
-        if args.prompt is not None:
-            infer_cmd.extend(["--prompt", str(args.prompt)])
-
-        if args.cache_weight_for_large_batch:
-            infer_cmd.extend(["--cache-weight-for-large-batch"])
-
-        print("LLM RUNTIME INFO: quantizing model ...")
-        result = subprocess.run(infer_cmd)
-        if result.returncode != 0:
-            print("LLM RUNTIME ERROR: Quantizing model failed. Quit.")
-            quit()
-        print(
-            "LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format(
-                str(args.output_dir) + "/best_model.pt"
-            )
-        )
     else:
         qpath = Path(parent_path, "single_instance/run_quantization.py")
