From fb0ab2b843d913f4c674f51fbd64646b2227372f Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 25 Jul 2025 10:42:26 +0200 Subject: [PATCH 01/10] WIP --- src/cpp/include/openvino/genai/tokenizer.hpp | 3 +++ .../src/tokenizer/make_tokenizer_stateful.cpp | 20 ++++++++++++++++--- .../src/tokenizer/make_tokenizer_stateful.hpp | 1 + src/cpp/src/tokenizer/tokenizer.cpp | 6 ++++++ 4 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 47c57352c4..5e5f225ee0 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -165,6 +165,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param add_special_tokens whether to add special tokens * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. + * @param padding_side side to pad, either "left" or "right". Default is "right". * @return pair of [input_ids, attention_mask] */ template @@ -178,6 +179,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param add_special_tokens whether to add special tokens * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. + * @param padding_side side to pad, either "left" or "right". Default is "right". * @return pair of [input_ids, attention_mask] */ template @@ -313,6 +315,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { static constexpr ov::Property add_special_tokens{"add_special_tokens"}; static constexpr ov::Property skip_special_tokens{"skip_special_tokens"}; static constexpr ov::Property pad_to_max_length{"pad_to_max_length"}; +static constexpr ov::Property padding_side{"padding_side"}; } // namespace genai } // namespace ov diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp index 0b7bbcdd15..c43dea190a 100644 --- a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp +++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp @@ -14,6 +14,7 @@ #include "openvino/op/assign.hpp" #include "openvino/op/constant.hpp" #include +#include using namespace ov; using namespace ov::op; @@ -246,15 +247,28 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptradd_sinks({std::make_shared(pad_to_max_length_rv, pad_to_max_length_var)}); model->add_variables({pad_to_max_length_var}); + // Add padding side variable. + auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{1}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID}); + auto pad_right_const = std::make_shared(ov::element::boolean, ov::Shape{1}, std::vector{true}); + auto pad_right_rv = std::make_shared(pad_right_const, pad_right_var); + model->add_sinks({std::make_shared(pad_right_rv, pad_right_var)}); + model->add_variables({pad_right_var}); + auto select_node = std::make_shared(pad_to_max_length_rv, max_length_rv, zero_constant); for (auto ragged_to_dense_node : ragged_to_dense_nodes) { if (!ragged_to_dense_node) { - return true; // true since at this point we already have modified the graph.s + return true; // true since at this point we already have modified the graph. 
} + + auto new_inputs = ragged_to_dense_node->input_values(); + new_inputs.emplace_back(pad_right_rv->output(0)); + auto new_ragged_to_dense = ragged_to_dense_node->clone_with_new_inputs(new_inputs); - auto max_op = std::make_shared(ragged_to_dense_node->input_value(3), select_node); - ragged_to_dense_node->input(3).replace_source_output(max_op->output(0)); + auto max_op = std::make_shared(new_ragged_to_dense->input_value(3), select_node); + new_ragged_to_dense->input(3).replace_source_output(max_op->output(0)); + + ov::replace_node(ragged_to_dense_node, new_ragged_to_dense); } return true; diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.hpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.hpp index fba9cf07fb..f77f761fef 100644 --- a/src/cpp/src/tokenizer/make_tokenizer_stateful.hpp +++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.hpp @@ -90,6 +90,7 @@ inline const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens"; inline const std::string MAX_LENGTH_VAR_ID = "max_length"; inline const std::string IS_MAX_LENGTH_SET = "is_max_length_set"; inline const std::string PAD_TO_MAX_LENGTH_VAR_ID = "pad_to_max_length"; +inline const std::string PAD_RIGHT_VAR_ID = "pad_right"; } // namespace genai } // namespace ov diff --git a/src/cpp/src/tokenizer/tokenizer.cpp b/src/cpp/src/tokenizer/tokenizer.cpp index ec651b4356..92efda640c 100644 --- a/src/cpp/src/tokenizer/tokenizer.cpp +++ b/src/cpp/src/tokenizer/tokenizer.cpp @@ -260,11 +260,15 @@ class Tokenizer::TokenizerImpl { std::optional skip_special_tokens_flag = true; std::optional max_length_val; std::optional pad_to_max_length_val = false; + std::optional padding_side_val = "right"; ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag); ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag); ov::genai::utils::read_anymap_param(params, pad_to_max_length.name(), pad_to_max_length_val); ov::genai::utils::read_anymap_param(params, max_length.name(), max_length_val); + ov::genai::utils::read_anymap_param(params, padding_side.name(), padding_side_val); + std::optional pad_right = (padding_side_val.has_value() && *padding_side_val == "right") ? true : false; + std::optional is_max_length_set_val = max_length_val.has_value(); ov::AnyMap& state_flags = m_request_to_state_flags[&infer_request_guard.get()]; @@ -282,6 +286,8 @@ class Tokenizer::TokenizerImpl { set_state_value(state, pad_to_max_length_val, state_flags); } else if (name == IS_MAX_LENGTH_SET) { set_state_value(state, is_max_length_set_val, state_flags); + } else if (name == PAD_RIGHT_VAR_ID) { + set_state_value(state, pad_right, state_flags); } } } From 0a9114d4f7d6ffd2d34ee94deaae23ce03e4140a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 30 Jul 2025 14:02:22 +0200 Subject: [PATCH 02/10] add padding side to genai --- .../src/tokenizer/make_tokenizer_stateful.cpp | 4 +-- src/cpp/src/tokenizer/tokenizer.cpp | 25 +++++++++++++++---- src/python/py_tokenizer.cpp | 24 +++++++++++++++--- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp index c43dea190a..c03afcfeca 100644 --- a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp +++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp @@ -248,8 +248,8 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptradd_variables({pad_to_max_length_var}); // Add padding side variable. 
- auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{1}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID}); - auto pad_right_const = std::make_shared(ov::element::boolean, ov::Shape{1}, std::vector{true}); + auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID}); + auto pad_right_const = std::make_shared(ov::element::boolean, ov::Shape{}, std::vector{true}); auto pad_right_rv = std::make_shared(pad_right_const, pad_right_var); model->add_sinks({std::make_shared(pad_right_rv, pad_right_var)}); model->add_variables({pad_right_var}); diff --git a/src/cpp/src/tokenizer/tokenizer.cpp b/src/cpp/src/tokenizer/tokenizer.cpp index 92efda640c..19a864714e 100644 --- a/src/cpp/src/tokenizer/tokenizer.cpp +++ b/src/cpp/src/tokenizer/tokenizer.cpp @@ -851,27 +851,42 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c } TokenizedInputs Tokenizer::encode(const std::string& prompt, const ov::AnyMap& tokenization_params) { - check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()}); + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), + ov::genai::max_length.name(), + ov::genai::pad_to_max_length.name(), + ov::genai::padding_side.name()}); return m_pimpl->encode(std::move(prompt), tokenization_params); } TokenizedInputs Tokenizer::encode(const std::vector>& prompts, const ov::AnyMap& tokenization_params) { - check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()}); + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), + ov::genai::max_length.name(), + ov::genai::pad_to_max_length.name(), + ov::genai::padding_side.name()}); return m_pimpl->encode(prompts, tokenization_params); } TokenizedInputs Tokenizer::encode(const std::vector& prompts_1, const std::vector& prompts_2, const ov::AnyMap& tokenization_params) { - check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()}); + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), + ov::genai::max_length.name(), + ov::genai::pad_to_max_length.name(), + ov::genai::padding_side.name()}); return m_pimpl->encode(prompts_1, prompts_2, tokenization_params); } TokenizedInputs Tokenizer::encode(const std::vector& prompts, const ov::AnyMap& tokenization_params) { - check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()}); + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), + ov::genai::max_length.name(), + ov::genai::pad_to_max_length.name(), + ov::genai::padding_side.name()}); return m_pimpl->encode(prompts, tokenization_params); } TokenizedInputs Tokenizer::encode(const std::initializer_list& text, const ov::AnyMap& tokenization_params) { - check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()}); + check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), + ov::genai::max_length.name(), + ov::genai::pad_to_max_length.name(), + ov::genai::padding_side.name()}); return encode(std::vector(text.begin(), text.end()), tokenization_params); } diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp 
index d98ead7e20..b5f34cbd75 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -75,10 +75,13 @@ void init_tokenizer(py::module_& m) { .def("encode", [](Tokenizer& tok, std::vector& prompts, bool add_special_tokens, bool pad_to_max_length, - std::optional max_length) { + std::optional max_length, + std::optional padding_side) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; + tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); + if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } @@ -88,15 +91,19 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, + py::arg("padding_side") = "right", R"(Encodes a list of prompts into tokenized inputs.)") .def("encode", [](Tokenizer& tok, const std::string prompt, bool add_special_tokens, bool pad_to_max_length, - std::optional max_length) { + std::optional max_length, + std::optional padding_side + ) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; + tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } @@ -106,6 +113,7 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, + py::arg("padding_side") = "right", R"(Encodes a single prompt into tokenized input.)") .def("encode", [](Tokenizer& tok, @@ -113,10 +121,13 @@ void init_tokenizer(py::module_& m) { std::vector& prompts_2, bool add_special_tokens, bool pad_to_max_length, - std::optional max_length) { + std::optional max_length, + std::optional padding_side) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; + tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); + if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } @@ -127,16 +138,20 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, + py::arg("padding_side") = "right", R"(Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string. 
In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)") .def("encode", [](Tokenizer& tok, py::list& prompts, bool add_special_tokens, bool pad_to_max_length, - std::optional max_length) { + std::optional max_length, + std::optional padding_side) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; + tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); + if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } @@ -156,6 +171,7 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, + py::arg("padding_side") = "right", R"(Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].)") .def( From 658d83b2d47df9d31aa034ea58119e3c2410c221 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 8 Aug 2025 13:47:15 +0200 Subject: [PATCH 03/10] add tests & make them green --- .../src/tokenizer/make_tokenizer_stateful.cpp | 8 ++++-- src/cpp/src/tokenizer/tokenizer.cpp | 18 ++++++++++--- src/python/py_tokenizer.cpp | 26 ++++++++++++------- tests/python_tests/test_tokenizer.py | 8 ++++++ 4 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp index c03afcfeca..32760bd466 100644 --- a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp +++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp @@ -248,8 +248,12 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptradd_variables({pad_to_max_length_var}); // Add padding side variable. - auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID}); - auto pad_right_const = std::make_shared(ov::element::boolean, ov::Shape{}, std::vector{true}); + auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{}, ov::element::i32, ov::genai::PAD_RIGHT_VAR_ID}); + // If user called encode without explicitly stating padding side, then we should pad it to the default side + // which was defined during model conversion, but we don't know default side value during this transformation, + // therefore we should indicate that padding side should be taken from the operation attribute. + // We decided that to be number 2. 
+    auto pad_right_const = std::make_shared(ov::element::i32, ov::Shape{}, std::vector{2});
     auto pad_right_rv = std::make_shared(pad_right_const, pad_right_var);
     model->add_sinks({std::make_shared(pad_right_rv, pad_right_var)});
     model->add_variables({pad_right_var});
diff --git a/src/cpp/src/tokenizer/tokenizer.cpp b/src/cpp/src/tokenizer/tokenizer.cpp
index 19a864714e..1764fb806e 100644
--- a/src/cpp/src/tokenizer/tokenizer.cpp
+++ b/src/cpp/src/tokenizer/tokenizer.cpp
@@ -260,14 +260,24 @@ class Tokenizer::TokenizerImpl {
     std::optional skip_special_tokens_flag = true;
     std::optional max_length_val;
     std::optional pad_to_max_length_val = false;
-    std::optional padding_side_val = "right";
-
+    std::optional padding_side_val = std::nullopt;
+
     ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
     ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
     ov::genai::utils::read_anymap_param(params, pad_to_max_length.name(), pad_to_max_length_val);
     ov::genai::utils::read_anymap_param(params, max_length.name(), max_length_val);
     ov::genai::utils::read_anymap_param(params, padding_side.name(), padding_side_val);
-    std::optional pad_right = (padding_side_val.has_value() && *padding_side_val == "right") ? true : false;
+    std::optional pad_right;
+
+    // If padding_side is not set, we leave pad_right as nullopt; this indicates that the default value from the RaggedToDense attribute will be used.
+    if (padding_side_val.has_value()) {
+        OPENVINO_ASSERT(
+            padding_side_val == "left" || padding_side_val == "right",
+            "padding_side should be either 'left' or 'right', but got: ",
+            *padding_side_val
+        );
+        pad_right = (*padding_side_val == "right");
+    }
 
     std::optional is_max_length_set_val = max_length_val.has_value();
 
     ov::AnyMap& state_flags = m_request_to_state_flags[&infer_request_guard.get()];
@@ -286,7 +296,7 @@ class Tokenizer::TokenizerImpl {
             set_state_value(state, pad_to_max_length_val, state_flags);
         } else if (name == IS_MAX_LENGTH_SET) {
             set_state_value(state, is_max_length_set_val, state_flags);
-        } else if (name == PAD_RIGHT_VAR_ID) { 
+        } else if (name == PAD_RIGHT_VAR_ID) {
            set_state_value(state, pad_right, state_flags);
         }
     }
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index b5f34cbd75..4f90e33622 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -80,18 +80,20 @@ void init_tokenizer(py::module_& m) {
         ov::AnyMap tokenization_params;
         tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens;
         tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length;
-
tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } + if (padding_side.has_value()) { + tokenization_params[ov::genai::padding_side.name()] = *padding_side; + } return tok.encode(prompt, tokenization_params); }, py::arg("prompt"), py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, - py::arg("padding_side") = "right", + py::arg("padding_side") = std::nullopt, R"(Encodes a single prompt into tokenized input.)") .def("encode", [](Tokenizer& tok, @@ -126,11 +130,13 @@ void init_tokenizer(py::module_& m) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; - tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } + if (padding_side.has_value()) { + tokenization_params[ov::genai::padding_side.name()] = *padding_side; + } return tok.encode(prompts_1, prompts_2, tokenization_params); }, py::arg("prompts_1"), @@ -138,7 +144,7 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, - py::arg("padding_side") = "right", + py::arg("padding_side") = std::nullopt, R"(Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string. In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)") @@ -150,11 +156,13 @@ void init_tokenizer(py::module_& m) { ov::AnyMap tokenization_params; tokenization_params[ov::genai::add_special_tokens.name()] = add_special_tokens; tokenization_params[ov::genai::pad_to_max_length.name()] = pad_to_max_length; - tokenization_params[ov::genai::padding_side.name()] = padding_side.value_or("right"); - + if (max_length.has_value()) { tokenization_params[ov::genai::max_length.name()] = *max_length; } + if (padding_side.has_value()) { + tokenization_params[ov::genai::padding_side.name()] = *padding_side; + } // Convert py::list to std::vector std::vector> prompts_vector; @@ -171,7 +179,7 @@ void init_tokenizer(py::module_& m) { py::arg("add_special_tokens") = true, py::arg("pad_to_max_length") = false, py::arg("max_length") = std::nullopt, - py::arg("padding_side") = "right", + py::arg("padding_side") = std::nullopt, R"(Encodes a list of paired prompts into tokenized inputs. 
Input format is same as for HF paired input [[prompt_1, prompt_2], ...].)") .def( diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index f48af9bada..43a7f2090f 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -365,6 +365,8 @@ def hf_ov_genai_models(request, tmp_path_factory): @pytest.mark.parametrize("add_special_tokens", [True, False]) @pytest.mark.parametrize("max_length", [None, 16, 103, 512, 1024]) @pytest.mark.parametrize("pad_to_max_length", [None, True, False]) +# regardless of what side was set during conversion we should be able to set it at runtime +@pytest.mark.parametrize("padding_side", [None, "right", "left"]) @pytest.mark.parametrize("prompt", prompts) @pytest.mark.parametrize( "hf_ov_genai_models", @@ -386,6 +388,7 @@ def test_padding( add_special_tokens, max_length, pad_to_max_length, + padding_side, prompt, ): hf_tokenizer, genai_tokenzier = hf_ov_genai_models @@ -418,8 +421,13 @@ def test_padding( hf_params.pop("max_length") ov_params.pop("max_length") + if padding_side is not None: + hf_params["padding_side"] = padding_side + ov_params["padding_side"] = padding_side + ov_res = genai_tokenzier.encode(prompt, **ov_params) hf_res = hf_tokenizer(prompt, return_tensors="np", **hf_params) + assert np.all(ov_res.input_ids.data == hf_res["input_ids"]) assert np.all(ov_res.attention_mask.data == hf_res["attention_mask"]) From 8b53509f7619eec5521f1fd4b7a4d80c2cade561 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 8 Aug 2025 14:01:31 +0200 Subject: [PATCH 04/10] update pyi --- src/python/openvino_genai/py_openvino_genai.pyi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index b8d448600b..cdb1b380c7 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -2971,23 +2971,23 @@ class Tokenizer: Decode a batch of tokens into a list of string prompt. """ @typing.overload - def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs: + def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a list of prompts into tokenized inputs. """ @typing.overload - def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs: + def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a single prompt into tokenized input. 
""" @typing.overload - def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs: + def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string. In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs. """ @typing.overload - def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs: + def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...]. """ From 9117082379598078f20b900ec0db7363e17a3719 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Fri, 8 Aug 2025 14:05:11 +0200 Subject: [PATCH 05/10] update docstring --- src/cpp/include/openvino/genai/tokenizer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 5e5f225ee0..69e5fb4707 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -165,7 +165,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param add_special_tokens whether to add special tokens * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. - * @param padding_side side to pad, either "left" or "right". Default is "right". + * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR. * @return pair of [input_ids, attention_mask] */ template @@ -179,7 +179,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @param add_special_tokens whether to add special tokens * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. - * @param padding_side side to pad, either "left" or "right". Default is "right". + * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR. 
 * @return pair of [input_ids, attention_mask]
  */
 template

From 6bc01682683f7986fe2088760e1f1b13b20a7947 Mon Sep 17 00:00:00 2001
From: Pavel Esir 
Date: Mon, 11 Aug 2025 13:35:31 +0200
Subject: [PATCH 06/10] update python docstring for 'encode'

---
 src/python/py_tokenizer.cpp | 58 +++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 9 deletions(-)

diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index 4f90e33622..aacc063873 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -32,6 +32,46 @@ constexpr char class_docstring[] = R"(
     7. Replace not supported instructions with equivalents.
 )";
 
+constexpr char common_encode_docstring[] = R"(
+    'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
+    'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
+    'max_length' - maximum length of the sequence. If None (default), the value will be taken from the model configuration.
+    'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the model configuration.
+Returns:
+    TokenizedInputs object containing input_ids and attention_mask tensors.
+)";
+
+auto encode_list_docstring = (
+R"(Encodes a list of prompts into tokenized inputs.
+Args:
+    'prompts' - list of prompts to encode)"
++ std::string(common_encode_docstring)
+);
+
+auto encode_single_prompt_docstring = (
+R"(Encodes a single prompt into tokenized input.
+Args:
+    'prompt' - prompt to encode)"
++ std::string(common_encode_docstring)
+);
+
+auto encode_list_of_pairs_docstring = (
+R"(Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string.
+In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)
+Args:
+    'prompts_1' - list of prompts to encode
+    'prompts_2' - list of prompts to encode)"
++ std::string(common_encode_docstring)
+);
+
+auto encode_list_of_lists_docstring =
+(
+R"(Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].
+Args:
+    'prompts' - list of prompts to encode)"
++ std::string(common_encode_docstring)
+);
+
 } // namespace
 
 namespace py = pybind11;
@@ -94,7 +134,7 @@ void init_tokenizer(py::module_& m) {
           py::arg("add_special_tokens") = true,
           py::arg("pad_to_max_length") = false,
           py::arg("max_length") = std::nullopt,
          py::arg("padding_side") = std::nullopt,
-          R"(Encodes a list of prompts into tokenized inputs.)")
+          encode_list_docstring.c_str())
     .def("encode", [](Tokenizer& tok, const std::string prompt,
                       bool add_special_tokens,
                       bool pad_to_max_length,
                       std::optional max_length,
                       std::optional padding_side
                       ) {
@@ -118,8 +158,8 @@ void init_tokenizer(py::module_& m) {
           py::arg("add_special_tokens") = true,
           py::arg("pad_to_max_length") = false,
           py::arg("max_length") = std::nullopt,
           py::arg("padding_side") = std::nullopt,
-          R"(Encodes a single prompt into tokenized input.)")
-
+          encode_single_prompt_docstring.c_str())
+
     .def("encode", [](Tokenizer& tok,
                       std::vector& prompts_1,
                       std::vector& prompts_2,
                       bool add_special_tokens,
                       bool pad_to_max_length,
                       std::optional max_length,
                       std::optional padding_side) {
@@ -145,9 +185,8 @@ void init_tokenizer(py::module_& m) {
           py::arg("add_special_tokens") = true,
           py::arg("pad_to_max_length") = false,
           py::arg("max_length") = std::nullopt,
           py::arg("padding_side") = std::nullopt,
-          R"(Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string.
-          In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)")
-
+          encode_list_of_pairs_docstring.c_str())
+
     .def("encode", [](Tokenizer& tok, py::list& prompts,
                       bool add_special_tokens,
                       bool pad_to_max_length,
                       std::optional max_length,
                       std::optional padding_side) {
@@ -171,7 +219,10 @@ void init_tokenizer(py::module_& m) {
           py::arg("add_special_tokens") = true,
           py::arg("pad_to_max_length") = false,
           py::arg("max_length") = std::nullopt,
           py::arg("padding_side") = std::nullopt,
-          R"(Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].)")
-
-    .def(
+          encode_list_of_lists_docstring.c_str()
+    )
+
+    .def(
         "decode",
         [](Tokenizer& tok, std::vector& tokens, bool skip_special_tokens) -> py::str {
             ov::AnyMap detokenization_params;

From bea238d92aeb54d3f9dbe4783f25069e9b70e2a2 Mon Sep 17 00:00:00 2001
From: Pavel Esir 
Date: Mon, 11 Aug 2025 13:35:41 +0200
Subject: [PATCH 07/10] get default value of pad_right from AttributeVisitor

---
 .../src/tokenizer/make_tokenizer_stateful.cpp | 62 ++++++++++++++-----
 src/python/py_tokenizer.cpp                   |  4 ++--
 2 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp
index 32760bd466..e6e89389e3 100644
--- a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp
+++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp
@@ -109,6 +109,23 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr& model) {
 
+class ReadPadRightAttributes : public ov::AttributeVisitor {
+    bool m_pad_right = true;
+
+public:
+    void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
+        if (name != "pad_right") {
+            return;
+        }
+        if (auto a = ov::as_type<ov::AttributeAdapter<bool>>(&adapter)) {
+            m_pad_right = a->get();
+        }
+    }
+
+    bool get_pad_right() const {
+        return m_pad_right;
+    }
+};
 
 bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr& model) {
     std::shared_ptr combine_seg_node;
@@ -244,22 +261,34 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr& model) {
     auto pad_to_max_length_rv = std::make_shared(default_false_const, pad_to_max_length_var);
-    model->add_sinks({std::make_shared(pad_to_max_length_rv, pad_to_max_length_var)});
-    model->add_variables({pad_to_max_length_var});
+    auto select_node = std::make_shared(pad_to_max_length_rv, max_length_rv, zero_constant);
+
+    // If user called encode without explicitly stating padding side, then we should pad it to the default side.
+    // Here we get that side from the RaggedToDense nodes' attribute.
+    auto pad_right_attr_visitor = ReadPadRightAttributes();
+    bool first_iter = false;
+    bool default_pad_right = true;
+    for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
+        if (!ragged_to_dense_node) {
+            return true; // true since at this point we already have modified the graph.
+        }
+        ragged_to_dense_node->visit_attributes(pad_right_attr_visitor);
+        if (first_iter) {
+            default_pad_right = pad_right_attr_visitor.get_pad_right();
+        } else if (pad_right_attr_visitor.get_pad_right() != default_pad_right) {
+            return true; // true since at this point we already have modified the graph.
+        }
+    }
+
     // Add padding side variable.
-    auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{}, ov::element::i32, ov::genai::PAD_RIGHT_VAR_ID});
-    // If user called encode without explicitly stating padding side, then we should pad it to the default side
-    // which was defined during model conversion, but we don't know default side value during this transformation,
-    // therefore we should indicate that padding side should be taken from the operation attribute.
-    // We decided that to be number 2.
-    auto pad_right_const = std::make_shared(ov::element::i32, ov::Shape{}, std::vector{2});
+    auto pad_right_var = std::make_shared(op::util::VariableInfo{ov::Shape{}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID});
+    auto pad_right_const = std::make_shared(ov::element::boolean, ov::Shape{}, std::vector{default_pad_right});
     auto pad_right_rv = std::make_shared(pad_right_const, pad_right_var);
-    model->add_sinks({std::make_shared(pad_right_rv, pad_right_var)});
-    model->add_variables({pad_right_var});
-
-    auto select_node = std::make_shared(pad_to_max_length_rv, max_length_rv, zero_constant);
-
+
+    // This loop cannot be merged with the loop above: first we need to ensure that all RaggedToDense nodes have the same padding side,
+    // and only then start modifying them. Therefore we iterate over the RaggedToDense nodes twice. In 99% of cases there is only one RaggedToDense node;
+    // in the rest, there are two RaggedToDense nodes with the same padding side when they are created by openvino_tokenizers.
     for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
         if (!ragged_to_dense_node) {
             return true; // true since at this point we already have modified the graph.
@@ -271,9 +300,14 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr& model) {
         auto max_op = std::make_shared(new_ragged_to_dense->input_value(3), select_node);
         new_ragged_to_dense->input(3).replace_source_output(max_op->output(0));
-
+
         ov::replace_node(ragged_to_dense_node, new_ragged_to_dense);
     }
+    model->add_sinks({std::make_shared(pad_right_rv, pad_right_var)});
+    model->add_variables({pad_right_var});
+    model->add_sinks({std::make_shared(pad_to_max_length_rv, pad_to_max_length_var)});
+    model->add_variables({pad_to_max_length_var});
+
     return true;
 }
diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp
index aacc063873..e542e5239d 100644
--- a/src/python/py_tokenizer.cpp
+++ b/src/python/py_tokenizer.cpp
@@ -35,8 +35,8 @@ constexpr char class_docstring[] = R"(
 constexpr char common_encode_docstring[] = R"(
     'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
     'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
-    'max_length' - maximum length of the sequence. If None (default), the value will be taken from the model configuration.
-    'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the model configuration.
+    'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
+    'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
 Returns:
     TokenizedInputs object containing input_ids and attention_mask tensors.
)"; From 5a6d1bb5a088aa33166ed7844594145f5ed9cf46 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 11 Aug 2025 13:48:34 +0200 Subject: [PATCH 08/10] update cpp docstring --- src/cpp/include/openvino/genai/tokenizer.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/include/openvino/genai/tokenizer.hpp b/src/cpp/include/openvino/genai/tokenizer.hpp index 69e5fb4707..7d436db8d2 100644 --- a/src/cpp/include/openvino/genai/tokenizer.hpp +++ b/src/cpp/include/openvino/genai/tokenizer.hpp @@ -163,9 +163,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @brief encode a single prompt * @param prompt std::string with input prompt * @param add_special_tokens whether to add special tokens - * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. + * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored). * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. - * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR. + * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR (where default value from original HF/GGUF model is stored). * @return pair of [input_ids, attention_mask] */ template @@ -177,9 +177,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer { * @brief encode batch of prompts. * @param prompts vector storing batch of prompts * @param add_special_tokens whether to add special tokens - * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR. + * @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored). * @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false. - * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR. + * @param padding_side side to pad, either "left" or "right". If not defined value is taken from IR (where default value from original HF/GGUF model is stored). * @return pair of [input_ids, attention_mask] */ template From b65ada50c6c5e27a4a8b89db6c89aa24eab276d7 Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Mon, 11 Aug 2025 14:08:39 +0200 Subject: [PATCH 09/10] some corrections --- src/cpp/src/tokenizer/make_tokenizer_stateful.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp index e6e89389e3..030c473eb2 100644 --- a/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp +++ b/src/cpp/src/tokenizer/make_tokenizer_stateful.cpp @@ -263,11 +263,10 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr(default_false_const, pad_to_max_length_var); auto select_node = std::make_shared(pad_to_max_length_rv, max_length_rv, zero_constant); - // If user called encode without explicitly stating padding side, then we should pad it to the default side. // Here we get that side from the RaggedToDense nodes attribute. 
auto pad_right_attr_visitor = ReadPadRightAttributes(); - bool first_iter = false; + bool first_iter = true; bool default_pad_right = true; for (auto ragged_to_dense_node : ragged_to_dense_nodes) { if (!ragged_to_dense_node) { @@ -279,6 +278,7 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr Date: Mon, 11 Aug 2025 14:44:32 +0200 Subject: [PATCH 10/10] update pyi --- .../openvino_genai/py_openvino_genai.pyi | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index cdb1b380c7..991aa63313 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -2974,22 +2974,55 @@ class Tokenizer: def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a list of prompts into tokenized inputs. + Args: + 'prompts' - list of prompts to encode + 'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True. + 'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False. + 'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored). + 'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored). + Returns: + TokenizedInputs object containing input_ids and attention_mask tensors. """ @typing.overload def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a single prompt into tokenized input. + Args: + 'prompt' - prompt to encode + 'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True. + 'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False. + 'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored). + 'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored). + Returns: + TokenizedInputs object containing input_ids and attention_mask tensors. """ @typing.overload def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs: """ Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string. - In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs. + In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.) 
+        Args:
+            'prompts_1' - list of prompts to encode
+            'prompts_2' - list of prompts to encode
+            'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
+            'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
+            'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
+            'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
+        Returns:
+            TokenizedInputs object containing input_ids and attention_mask tensors.
         """
    @typing.overload
    def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
        """
        Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].
+        Args:
+            'prompts' - list of prompts to encode
+            'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
+            'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
+            'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
+            'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
+        Returns:
+            TokenizedInputs object containing input_ids and attention_mask tensors.
         """
    def get_bos_token(self) -> str:
        ...
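
Example: selecting the padding side at runtime with the API added by this series. This is a minimal sketch under stated assumptions, not part of the patches themselves: the tokenizer directory and prompts are placeholder values, and it assumes a tokenizer IR exported by openvino_tokenizers, which stores the default padding side that is used whenever padding_side is omitted.

import numpy as np
import openvino_genai

# Placeholder path: a directory containing an exported openvino_tokenizer.xml.
tokenizer = openvino_genai.Tokenizer("./tokenizer_dir")

prompts = ["hi", "a noticeably longer prompt that forces padding"]

# Explicit left padding, mirroring HF's tokenizer.padding_side = "left".
left = tokenizer.encode(prompts, pad_to_max_length=True, max_length=16, padding_side="left")

# Explicit right padding.
right = tokenizer.encode(prompts, pad_to_max_length=True, max_length=16, padding_side="right")

# padding_side omitted (None): the default stored in the tokenizer IR is used,
# so existing callers keep the behavior chosen at conversion time.
default = tokenizer.encode(prompts, pad_to_max_length=True, max_length=16)

# For the short prompt, left padding places the zeros of the attention mask first,
# while right padding places them last.
print(np.array(left.attention_mask.data)[0])
print(np.array(right.attention_mask.data)[0])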