7 changes: 5 additions & 2 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -163,8 +163,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode a single prompt
* @param prompt std::string with input prompt
* @param add_special_tokens whether to add special tokens
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR.
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored).
* @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false.
* @param padding_side side to pad, either "left" or "right". If not defined, the value is taken from the IR (where the default value from the original HF/GGUF model is stored).
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -176,8 +177,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode batch of prompts.
* @param prompts vector storing batch of prompts
* @param add_special_tokens whether to add special tokens
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR.
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored).
* @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false.
* @param padding_side side to pad, either "left" or "right". If not defined, the value is taken from the IR (where the default value from the original HF/GGUF model is stored).
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -313,6 +315,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
static constexpr ov::Property<bool> pad_to_max_length{"pad_to_max_length"};
static constexpr ov::Property<std::string> padding_side{"padding_side"};

} // namespace genai
} // namespace ov
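For context, a minimal usage sketch (not part of this diff) of the property-based encode overload documented above. The model directory, prompt, and printed shape are illustrative; ov::genai::max_length is assumed to come from the GenAI property set in generation_config.hpp.

```cpp
#include <iostream>
#include <string>

#include "openvino/genai/generation_config.hpp"  // assumed location of ov::genai::max_length
#include "openvino/genai/tokenizer.hpp"

int main() {
    // Assumed: a directory with a converted openvino_tokenizers IR.
    ov::genai::Tokenizer tokenizer("tokenizer_model_dir");

    // Truncate/pad to 64 tokens and pad on the left instead of the side stored in the IR.
    ov::genai::TokenizedInputs inputs = tokenizer.encode(
        std::string{"Hello, world"},
        ov::genai::add_special_tokens(true),
        ov::genai::max_length(64),
        ov::genai::pad_to_max_length(true),
        ov::genai::padding_side("left"));

    // TokenizedInputs holds the [input_ids, attention_mask] pair as ov::Tensor objects.
    std::cout << inputs.input_ids.get_shape() << std::endl;
    return 0;
}
```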
64 changes: 58 additions & 6 deletions src/cpp/src/tokenizer/make_tokenizer_stateful.cpp
@@ -14,6 +14,7 @@
#include "openvino/op/assign.hpp"
#include "openvino/op/constant.hpp"
#include <openvino/pass/manager.hpp>
#include <openvino/core/graph_util.hpp>

using namespace ov;
using namespace ov::op;
@@ -108,6 +109,23 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
return true;
}

class ReadPadRightAttributes : public ov::AttributeVisitor {
private:
bool m_pad_right = true;
public:
void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
if (name != "pad_right") {
return;
}
if (auto a = ov::as_type<ov::AttributeAdapter<bool>>(&adapter)) {
m_pad_right = a->get();
}
}

bool get_pad_right() const {
return m_pad_right;
}
};

bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
std::shared_ptr<ov::Node> combine_seg_node;
@@ -243,19 +261,53 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr<ov::Mod
auto pad_to_max_length_var = std::make_shared<op::util::Variable>(op::util::VariableInfo{ov::Shape{1}, ov::element::boolean, ov::genai::PAD_TO_MAX_LENGTH_VAR_ID});
auto default_false_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{1}, std::vector{false});
auto pad_to_max_length_rv = std::make_shared<v6::ReadValue>(default_false_const, pad_to_max_length_var);
model->add_sinks({std::make_shared<v6::Assign>(pad_to_max_length_rv, pad_to_max_length_var)});
model->add_variables({pad_to_max_length_var});

auto select_node = std::make_shared<v1::Select>(pad_to_max_length_rv, max_length_rv, zero_constant);

// If the user called encode without explicitly specifying a padding side, we should pad to the default side.
// Here we get that side from the RaggedToDense node's attribute.
auto pad_right_attr_visitor = ReadPadRightAttributes();
bool first_iter = true;
bool default_pad_right = true;
for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
if (!ragged_to_dense_node) {
return true; // true since at this point we already have modified the graph.
}
ragged_to_dense_node->visit_attributes(pad_right_attr_visitor);
if (first_iter) {
default_pad_right = pad_right_attr_visitor.get_pad_right();
} else if (pad_right_attr_visitor.get_pad_right() != default_pad_right) {
return true; // true since at this point we already have modified the graph.
}
first_iter = false;
}

// Add padding side variable.
auto pad_right_var = std::make_shared<op::util::Variable>(op::util::VariableInfo{ov::Shape{}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID});
auto pad_right_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{}, std::vector{default_pad_right});
auto pad_right_rv = std::make_shared<v6::ReadValue>(pad_right_const, pad_right_var);

// This loop cannot be merged with the loop above: first we need to ensure that all RaggedToDense nodes have the same padding side,
// and only then start modifying the graph. Therefore we iterate over the RaggedToDense nodes twice. In 99% of cases there is only one
// RaggedToDense node; in the remaining cases there are two RaggedToDense nodes with the same padding side, as created by openvino_tokenizers.
for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
if (!ragged_to_dense_node) {
return true; // true since at this point we already have modified the graph.s
return true; // true since at this point we already have modified the graph.
}

auto new_inputs = ragged_to_dense_node->input_values();
new_inputs.emplace_back(pad_right_rv->output(0));
auto new_ragged_to_dense = ragged_to_dense_node->clone_with_new_inputs(new_inputs);

auto max_op = std::make_shared<v1::Maximum>(new_ragged_to_dense->input_value(3), select_node);
new_ragged_to_dense->input(3).replace_source_output(max_op->output(0));

auto max_op = std::make_shared<v1::Maximum>(ragged_to_dense_node->input_value(3), select_node);
ragged_to_dense_node->input(3).replace_source_output(max_op->output(0));
ov::replace_node(ragged_to_dense_node, new_ragged_to_dense);
}

model->add_sinks({std::make_shared<v6::Assign>(pad_right_rv, pad_right_var)});
model->add_variables({pad_right_var});
model->add_sinks({std::make_shared<v6::Assign>(pad_to_max_length_rv, pad_to_max_length_var)});
model->add_variables({pad_to_max_length_var});

return true;
}
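For readers unfamiliar with the mechanism the pass relies on, below is a minimal, self-contained sketch (not part of the PR) of a ReadValue/Assign state with a default initializer, which a caller can later override through VariableState::set_state. The toy model, variable name, and CPU device are illustrative assumptions; MakePaddingSatateful applies the same pattern to the real tokenizer graph.

```cpp
#include <openvino/openvino.hpp>
#include <openvino/op/assign.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/read_value.hpp>
#include <openvino/op/result.hpp>
#include <openvino/op/util/variable.hpp>

int main() {
    using namespace ov;

    // State variable whose default value is baked in as the ReadValue initializer,
    // mirroring how pad_right_rv is constructed in the pass above.
    auto var = std::make_shared<op::util::Variable>(
        op::util::VariableInfo{Shape{}, element::boolean, "pad_right"});
    auto default_true = std::make_shared<op::v0::Constant>(element::boolean, Shape{}, std::vector{true});
    auto read = std::make_shared<op::v6::ReadValue>(default_true, var);
    auto result = std::make_shared<op::v0::Result>(read);

    auto model = std::make_shared<Model>(ResultVector{result}, ParameterVector{});
    model->add_sinks({std::make_shared<op::v6::Assign>(read, var)});
    model->add_variables({var});

    Core core;
    auto compiled = core.compile_model(model, "CPU");
    auto request = compiled.create_infer_request();

    // Without touching the state, the model sees the default (pad right).
    // To pad on the left for this request, overwrite the state value.
    for (auto&& state : request.query_state()) {
        if (state.get_name() == "pad_right") {
            char flag = 0;  // false -> pad left; boolean tensors use byte storage
            state.set_state(Tensor(element::boolean, Shape{}, &flag));
        }
    }
    request.infer();
    return 0;
}
```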
1 change: 1 addition & 0 deletions src/cpp/src/tokenizer/make_tokenizer_stateful.hpp
@@ -90,6 +90,7 @@ inline const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
inline const std::string MAX_LENGTH_VAR_ID = "max_length";
inline const std::string IS_MAX_LENGTH_SET = "is_max_length_set";
inline const std::string PAD_TO_MAX_LENGTH_VAR_ID = "pad_to_max_length";
inline const std::string PAD_RIGHT_VAR_ID = "pad_right";

} // namespace genai
} // namespace ov
43 changes: 37 additions & 6 deletions src/cpp/src/tokenizer/tokenizer.cpp
@@ -260,11 +260,25 @@ class Tokenizer::TokenizerImpl {
std::optional<bool> skip_special_tokens_flag = true;
std::optional<int32_t> max_length_val;
std::optional<bool> pad_to_max_length_val = false;

std::optional<std::string> padding_side_val = std::nullopt;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, pad_to_max_length.name(), pad_to_max_length_val);
ov::genai::utils::read_anymap_param(params, max_length.name(), max_length_val);
ov::genai::utils::read_anymap_param(params, padding_side.name(), padding_side_val);
std::optional<bool> pad_right;

// If padding_side is not set, we leave it as nullopt; this indicates that the default value from the RaggedToDense attribute will be used.
if (padding_side_val.has_value()) {
OPENVINO_ASSERT(
padding_side_val == "left" || padding_side_val == "right",
"padding_side should be either 'left' or 'right', but got: ",
*padding_side_val
);
pad_right = (*padding_side_val == "right") ? true : false;
}

std::optional<bool> is_max_length_set_val = max_length_val.has_value();

ov::AnyMap& state_flags = m_request_to_state_flags[&infer_request_guard.get()];
@@ -282,6 +296,8 @@ class Tokenizer::TokenizerImpl {
set_state_value(state, pad_to_max_length_val, state_flags);
} else if (name == IS_MAX_LENGTH_SET) {
set_state_value(state, is_max_length_set_val, state_flags);
} else if (name == PAD_RIGHT_VAR_ID) {
set_state_value(state, pad_right, state_flags);
}
}
}
@@ -845,27 +861,42 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
}

TokenizedInputs Tokenizer::encode(const std::string& prompt, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(std::move(prompt), tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::pair<std::string, std::string>>& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::string>& prompts_1, const std::vector<std::string>& prompts_2, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts_1, prompts_2, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
}
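
A hedged sketch (not in the diff) of driving the AnyMap-based overloads above directly; the helper name and prompt values are hypothetical. The keys mirror the properties accepted by check_arguments, and padding_side is read as std::optional<std::string>, so the Any should hold a std::string.

```cpp
#include <string>
#include <vector>

#include "openvino/genai/tokenizer.hpp"

// Hypothetical helper: left-pad a batch of prompts using the AnyMap overload of encode.
ov::genai::TokenizedInputs encode_left_padded(ov::genai::Tokenizer& tokenizer,
                                              const std::vector<std::string>& prompts) {
    ov::AnyMap params{
        {ov::genai::add_special_tokens.name(), true},
        {ov::genai::pad_to_max_length.name(), true},
        {ov::genai::padding_side.name(), std::string{"left"}},
    };
    // Without an explicit max_length, the value stored in the IR is used.
    return tokenizer.encode(prompts, params);
}
```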

43 changes: 38 additions & 5 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -2971,25 +2971,58 @@ class Tokenizer:
Decode a batch of tokens into a list of string prompt.
"""
@typing.overload
def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of prompts into tokenized inputs.
Args:
'prompts' - list of prompts to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a single prompt into tokenized input.
Args:
'prompt' - prompt to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string.
In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.
In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)
Args:
'prompts_1' - list of prompts to encode
'prompts_2' - list of prompts to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].
Args:
'prompts' - list of prompts to encode\\n
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
def get_bos_token(self) -> str:
...