
Commit 070f1d7

Upgrade transformers to 4.43.2 and support llama3.1 (#3157)
1 parent 34fab8e commit 070f1d7

File tree

34 files changed, with 471 additions and 415 deletions

csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp

Lines changed: 2 additions & 11 deletions
@@ -1313,15 +1313,6 @@ first_token_masked_mha(
   auto key_lenght = key.size(1);
   auto kv_head_num = key.size(2);
   auto head_size = key.size(3);
-  if (add_casual_mask) {
-    auto casual_mask = at::full(
-        {query_length, key_lenght},
-        origin_type == at::kHalf ? -6e4 : -1e6,
-        query.options());
-    casual_mask = at::triu(casual_mask, 1);
-    casual_mask = casual_mask.unsqueeze(0).unsqueeze(0);
-    attention_mask = attention_mask + casual_mask;
-  }
   if (key.scalar_type() != at::kBFloat16 && key.scalar_type() != at::kFloat &&
       key.scalar_type() != at::kHalf) {
     TORCH_CHECK(
@@ -1358,7 +1349,7 @@ first_token_masked_mha(
         key,
         value,
         /* dropout */ 0.0,
-        /* is_causal*/ false,
+        add_casual_mask,
         attention_mask,
         1. / scale_attn));
   } else {
@@ -1572,7 +1563,7 @@ at::Tensor prepare_4d_causal_attention_mask_kernel_impl(
   int64_t src_length = attention_mask.size(-1);
   int64_t past_key_value_length = past_kv_len.item<int64_t>();
   int64_t length = seq_length + past_key_value_length;
-  int64_t diagonal = past_key_value_length - sliding_window + 1;
+  int64_t diagonal = past_key_value_length - sliding_window;

   at::Tensor causal_4d_mask = torch::empty(
       {batch_size, 1, seq_length, length}, inputs_embeds.options());
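
In short, the kernel no longer builds the additive causal mask itself: the `add_casual_mask` flag is forwarded to the fused attention call as its is-causal argument, and the sliding-window `diagonal` offset drops the `+ 1` to line up with the mask layout expected by transformers 4.43. A minimal PyTorch sketch (illustrative only, not part of this commit) showing why an explicit triu mask and the fused causal flag are interchangeable:

# Hedged sketch: the removed C++ block added a triu(-1e6) mask to the
# attention scores by hand; the fused SDPA path applies the same masking
# when its causal flag is set.
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 16, 64)  # (batch, heads, seq_len, head_size)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Explicit additive mask, mirroring the deleted at::triu logic.
explicit_mask = torch.triu(torch.full((16, 16), -1e6), diagonal=1)
out_explicit = F.scaled_dot_product_attention(q, k, v, attn_mask=explicit_mask)

# Fused causal path, mirroring the new is-causal argument.
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(torch.allclose(out_explicit, out_causal, atol=1e-5))  # True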

dependency_version.yml

Lines changed: 1 addition & 1 deletion
@@ -38,4 +38,4 @@ torchaudio:
 torchvision:
   version: 0.19.0+cpu
 transformers:
-  version: 4.38.1
+  version: 4.43.2
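
For reference, a small sketch of how this pin can be checked against the installed package; it assumes PyYAML is available and that `dependency_version.yml` sits in the current directory (neither assumption comes from this commit):

# Hedged sketch: compare the installed transformers version with the
# "transformers: version:" entry in dependency_version.yml.
import yaml
from importlib.metadata import version

with open("dependency_version.yml") as f:
    deps = yaml.safe_load(f)

pinned = deps["transformers"]["version"]  # "4.43.2" after this commit
installed = version("transformers")
if installed != pinned:
    print(f"Warning: transformers {installed} installed, {pinned} expected")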

docs/tutorials/features/fast_bert.md

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc
 
 ### Prerequisite
 
-- Transformers 4.6.0 ~ 4.38.1
+- Transformers 4.6.0 ~ 4.43.2
 
 ### Usage Example

examples/cpu/features/fast_bert/README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 Currently `ipex.fast_bert` API is only well optimized for training. For inference, it ensures functionality, while to get peak perf, please use `ipex.optimize` API + torchscript.
 
 # Prerequisite:
-Transformers 4.6.0 ~ 4.38.1
+Transformers 4.6.0 ~ 4.43.2
 
 # Usage Example:
 Training:
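
As a reminder of what the usage example covers, here is a rough training sketch of the `ipex.fast_bert` path; the keyword arguments (`dtype`, `optimizer`) and the `bert-base-uncased` checkpoint are assumptions drawn from the public IPEX examples, not from this diff:

# Rough sketch, assuming ipex.fast_bert accepts dtype= and optimizer=
# as in the public IPEX examples; verify against the installed version.
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")  # any BERT with transformers 4.6.0 ~ 4.43.2
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# fast_bert swaps the BERT encoder for IPEX's optimized implementation.
model, optimizer = ipex.fast_bert(model, dtype=torch.bfloat16, optimizer=optimizer)

input_ids = torch.randint(0, model.config.vocab_size, (4, 128))
with torch.cpu.amp.autocast(dtype=torch.bfloat16):
    outputs = model(input_ids)
loss = outputs.last_hidden_state.mean()  # placeholder loss for illustration
loss.backward()
optimizer.step()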

examples/cpu/llm/fine-tuning/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -6,6 +6,6 @@ black[jupyter]
 datasets
 fire
 peft
-transformers==4.38.1
+transformers==4.43.2
 gradio
 sentencepiece

examples/cpu/llm/inference/README.md

Lines changed: 0 additions & 2 deletions
@@ -420,8 +420,6 @@ There are some model-specific requirements to be aware of, as follows:
 
 - For Llava models from remote hub, additional setup is required, i.e., `bash ./tools/prepare_llava.sh`.
 
-- For mistralai/Mistral-7B-v0.1 and mistralai/Mixtral-8x7B-Instruct-v0.1, we use a fixed model version because the latest version is not compatible with transformers 4.38.1 and tokenizers 0.15.2.
-
 ## 2.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series
 
 Intel® Xeon® CPU Max Series are equipped with high bandwidth memory (HBM), which further accelerates LLM inference. For the common case that HBM and DDR are both installed in a Xeon® CPU Max Series server, the memory mode can be configured to Flat Mode or Cache Mode. Details about memory modes can be found at Section 3.1 in [the Xeon® CPU Max Series Configuration Guide](https://cdrdv2-public.intel.com/769060/354227-intel-xeon-cpu-max-series-configuration-and-tuning-guide.pdf).

examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py

Lines changed: 1 addition & 10 deletions
@@ -91,11 +91,6 @@ def decorator(func):
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", nargs="?", default="EleutherAI/gpt-j-6b")
 parser.add_argument("--output_dir", nargs="?", default="./saved_results")
@@ -279,9 +274,7 @@ def __init__(
         model_class = MODEL_CLASSES[model_type]
 
         self.tokenizer = model_class[1].from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
+            model_id, trust_remote_code=True
         )
         if model_type == "chatglm":
             # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided
@@ -296,7 +289,6 @@ def __init__(
             model_id if config is None else config,
             torchscript=with_jit,
             trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
         )
         if re.search("gptbigcode", self.config.architectures[0], re.IGNORECASE):
             model_type = "gptbigcode"
@@ -335,7 +327,6 @@ def __init__(
             config=self.config,
             torch_dtype=load_dtype,
             trust_remote_code=True,
-            revision=pin_model_revision.get(model_id, None),
         )
 
         self.model = self.model.eval()
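
With transformers 4.43.2 the previously pinned Mistral/Mixtral revisions load cleanly, so the script drops `pin_model_revision` and falls back to plain `from_pretrained` calls; the same simplification appears in the two generation scripts below. A standalone sketch of the unpinned loading path (the model id and dtype are illustrative, not taken from this commit):

# Hedged sketch: with transformers 4.43.2 the latest Mistral/Mixtral
# checkpoints load directly, so no revision= pin is required.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_id, torchscript=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model.eval()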

examples/cpu/llm/inference/distributed/run_generation_tp.py

Lines changed: 2 additions & 15 deletions
@@ -50,11 +50,6 @@
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 try:
     from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
     from llava.model.builder import load_pretrained_model
@@ -232,10 +227,7 @@
     model_class = MODEL_CLASSES[model_type]
     if args.config_file is None:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=args.deployment_mode,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=args.deployment_mode, trust_remote_code=True
         )
     else:
         config = AutoConfig.from_pretrained(
@@ -258,13 +250,8 @@
         config=config,
         low_cpu_mem_usage=True,
         trust_remote_code=True,
-        revision=pin_model_revision.get(args.model_id, None),
-    )
-    tokenizer = model_class[1].from_pretrained(
-        args.model_id,
-        trust_remote_code=True,
-        revision=pin_model_revision.get(args.model_id, None),
     )
+    tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True)
 else:
     tokenizer, model, image_processor, context_len = load_pretrained_model(
         args.model_id

examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py

Lines changed: 3 additions & 18 deletions
@@ -61,11 +61,6 @@
     "auto": (AutoModelForCausalLM, AutoTokenizer),
 }
 
-# The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model
-pin_model_revision = {
-    "mistralai/Mistral-7B-v0.1": "26bca36bde8333b5d7f72e9ed20ccda6a618af24",
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": "a60832cb6c88d5cb6e507680d0e9996fbad77050",
-}
 try:
     from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM
     from llava.model.builder import load_pretrained_model
@@ -312,19 +307,12 @@ def get_checkpoint_files(model_name_or_path):
 print_rank0(f"*** Loading the model {model_name}")
 model_type = next((x for x in MODEL_CLASSES.keys() if x in model_name.lower()), "auto")
 model_class = MODEL_CLASSES[model_type]
-tokenizer = model_class[1].from_pretrained(
-    model_name,
-    trust_remote_code=True,
-    revision=pin_model_revision.get(model_name, None),
-)
+tokenizer = model_class[1].from_pretrained(model_name, trust_remote_code=True)
 
 if model_type == "auto":
     if args.config_file is None:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=True,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=True, trust_remote_code=True
         )
     else:
         config = AutoConfig.from_pretrained(
@@ -353,10 +341,7 @@ def get_checkpoint_files(model_name_or_path):
         )
     else:
         config = AutoConfig.from_pretrained(
-            args.model_id,
-            torchscript=True,
-            trust_remote_code=True,
-            revision=pin_model_revision.get(args.model_id, None),
+            args.model_id, torchscript=True, trust_remote_code=True
         )
 else:
     config = AutoConfig.from_pretrained(

examples/cpu/llm/inference/run.py

Lines changed: 0 additions & 109 deletions
@@ -353,115 +353,6 @@ def main(args_in: Optional[List[str]] = None) -> None:
             print("LLM RUNTIME ERROR: Running generation task failed. Quit.")
             quit()
         print("LLM RUNTIME INFO: Finished successfully.")
-    elif re.search("t5", str(args.model_name_or_path), re.IGNORECASE):
-        qpath = Path(parent_path, "single_instance/run_quantization.py")
-        infer_cmd = ["python", qpath]
-        infer_cmd.extend(["-m", str(args.model_name_or_path)])
-        infer_cmd.extend(["--input-tokens", str(args.input_tokens)])
-        infer_cmd.extend(["--max-new-tokens", str(args.max_new_tokens)])
-        infer_cmd.extend(["--num-iter", str(args.num_iter)])
-        infer_cmd.extend(["--num-warmup", str(args.num_warmup)])
-        infer_cmd.extend(["--batch-size", str(args.batch_size)])
-        infer_cmd.extend(["--output-dir", str(args.output_dir)])
-        if args.ipex_weight_only_quantization:
-            infer_cmd.extend(["--ipex-weight-only-quantization"])
-            infer_cmd.extend(["--weight-dtype", str(args.weight_dtype)])
-            infer_cmd.extend(["--lowp-mode", str(args.lowp_mode)])
-            infer_cmd.extend(["--act-quant-mode", str(args.act_quant_mode)])
-            if args.gptq:
-                print(
-                    "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified"
-                    " and `--weight-dtype` is ignored."
-                )
-                if args.low_precision_checkpoint == "":
-                    gptq_cmd = [
-                        "python",
-                        Path(parent_path, "utils/run_gptq.py"),
-                    ]
-                    gptq_cmd.extend(["--model", str(args.model_name_or_path)])
-                    gptq_cmd.extend(["--output-dir", str(args.output_dir)])
-                    print(
-                        "LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format(
-                            group_size
-                        )
-                    )
-                    result = subprocess.run(gptq_cmd)
-                    if result.returncode != 0:
-                        print(
-                            "LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit."
-                        )
-                        quit()
-                    print("LLM RUNTIME INFO: Running GPTQ calibration finished.")
-                    infer_cmd.extend(
-                        [
-                            "--low-precision-checkpoint",
-                            str(args.output_dir) + "/gptq_checkpoint.pt",
-                        ]
-                    )
-                else:
-                    infer_cmd.extend(
-                        [
-                            "--low-precision-checkpoint",
-                            str(args.low_precision_checkpoint),
-                        ]
-                    )
-            else:
-                # No need to set group size if args.gptq is true
-                # Group size is read from the checkpoint
-                infer_cmd.extend(["--group-size", str(group_size)])
-        else:
-            infer_cmd.extend(["--ipex-smooth-quant"])
-            infer_cmd.extend(["--calib-len", str(args.calib_len)])
-            infer_cmd.extend(["--calib-iters", str(args.calib_iters)])
-            if args.calib_shuffle:
-                infer_cmd.extend(["--calib-shuffle"])
-            if args.calib_padding:
-                infer_cmd.extend(["--calib-padding"])
-                infer_cmd.extend(["--calib-pad-val", str(args.calib_pad_val)])
-            if args.fallback_add:
-                infer_cmd.extend(["--fallback-add"])
-            infer_cmd.extend(["--alpha", str(args.alpha)])
-            if args.folding:
-                infer_cmd.extend(["--folding"])
-            infer_cmd.extend(["--init-alpha", str(args.init_alpha)])
-            infer_cmd.extend(["--alpha-min", str(args.alpha_min)])
-            infer_cmd.extend(["--alpha-max", str(args.alpha_max)])
-            infer_cmd.extend(["--alpha-step", str(args.alpha_step)])
-            infer_cmd.extend(["--shared-criterion", str(args.shared_criterion)])
-            if args.enable_blockwise_loss:
-                infer_cmd.extend(["--enable-blockwise-loss"])
-            infer_cmd.extend(["--dataset", str(args.dataset)])
-        if args.quant_with_amp:
-            infer_cmd.extend(["--quant-with-amp"])
-        if args.greedy:
-            infer_cmd.extend(["--greedy"])
-        if args.streaming:
-            infer_cmd.extend(["--streaming"])
-        if args.profile:
-            infer_cmd.extend(["--profile"])
-        if args.benchmark:
-            infer_cmd.extend(["--benchmark"])
-        if args.token_latency:
-            infer_cmd.extend(["--token-latency"])
-        if args.lm_head_generation:
-            infer_cmd.extend(["--lm-head-generation"])
-
-        if args.prompt is not None:
-            infer_cmd.extend(["--prompt", str(args.prompt)])
-
-        if args.cache_weight_for_large_batch:
-            infer_cmd.extend(["--cache-weight-for-large-batch"])
-
-        print("LLM RUNTIME INFO: quantizing model ...")
-        result = subprocess.run(infer_cmd)
-        if result.returncode != 0:
-            print("LLM RUNTIME ERROR: Quantizing model failed. Quit.")
-            quit()
-        print(
-            "LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format(
-                str(args.output_dir) + "/best_model.pt"
-            )
-        )
     else:
         qpath = Path(parent_path, "single_instance/run_quantization.py")
