Avoid setting ATTENTION_BACKEND for optimum

sbalandi · sbalandi · commit 7df802294662 · 2025-05-26T20:59:50.000+01:00
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
@@ -152,7 +152,7 @@ def get_argprser():
         "--use_cb",
         action="store_true",
         help='Deprecated, will be removed soon! Continues batching mode is used by default. '
-        'To switch to SPDA mode, please, create .json file, set up ATTENTION_BACKEND="SDPA" in file and put it to --load_config.'
+        'To switch to SDPA mode, please, set up {"ATTENTION_BACKEND": "SDPA"} in --load_config.'
     )
     parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict")
     parser.add_argument("--draft_model", required=False, default=None,
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -10,8 +10,7 @@
     USE_CASES,
     OV_MODEL_CLASSES_MAPPING,
     PT_MODEL_CLASSES_MAPPING,
-    PA_ATTENTION_BACKEND,
-    SDPA_ATTENTION_BACKEND
+    PA_ATTENTION_BACKEND
 )
 import librosa
 
@@ -181,19 +180,14 @@ def analyze_args(args):
             model_args['config'] = config
     if model_framework == 'ov':
         set_default_param_for_ov_config(model_args['config'])
-        if 'ATTENTION_BACKEND' not in model_args['config'] and use_case in ['text_gen', 'vlm'] and args.device != "NPU":
+        if 'ATTENTION_BACKEND' not in model_args['config'] and use_case in ['text_gen', 'vlm'] and args.device != "NPU" and not optimum:
             model_args['config']['ATTENTION_BACKEND'] = PA_ATTENTION_BACKEND
-        if model_args['config'].get('ATTENTION_BACKEND', '') == PA_ATTENTION_BACKEND and args.device == "NPU":
-            model_args['config']['ATTENTION_BACKEND'] = SDPA_ATTENTION_BACKEND
-            log.warning("Continuous Batching, Speculative decoding and Prompt Lookup decoding is not supported for NPU device")
         log.info(f"OV Config={model_args['config']}")
     elif model_framework == 'pt':
         log.info(f"PT Config={model_args['config']}")
     model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
     model_args['model_name'] = model_name
 
-    if model_args['config'].get('ATTENTION_BACKEND', '') == PA_ATTENTION_BACKEND and optimum:
-        raise RuntimeError("Continuous batching mode supported only via OpenVINO GenAI")
     cb_config = None
     if args.cb_config:
         cb_config = get_config(args.cb_config)

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ def get_argprser():`
`152`	`152`	`"--use_cb",`
`153`	`153`	`action="store_true",`
`154`	`154`	`help='Deprecated, will be removed soon! Continues batching mode is used by default. '`
`155`		`- 'To switch to SPDA mode, please, create .json file, set up ATTENTION_BACKEND="SDPA" in file and put it to --load_config.'`
	`155`	`+ 'To switch to SDPA mode, please, set up {"ATTENTION_BACKEND": "SDPA"} in --load_config.'`
`156`	`156`	`)`
`157`	`157`	`parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict")`
`158`	`158`	`parser.add_argument("--draft_model", required=False, default=None,`