7 changes: 5 additions & 2 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -163,8 +163,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode a single prompt
* @param prompt std::string with input prompt
* @param add_special_tokens whether to add special tokens
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR.
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored).
* @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false.
* @param padding_side side to pad, either "left" or "right". If not defined, the value is taken from the IR (where the default value from the original HF/GGUF model is stored).
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -176,8 +177,9 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
* @brief encode batch of prompts.
* @param prompts vector storing batch of prompts
* @param add_special_tokens whether to add special tokens
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR.
* @param max_length optional maximum length to which output will be truncated and/or padded. If not defined, taken from IR (where default value from original HF/GGUF model is stored).
* @param pad_to_max_length either pad to max_length, or pad to the longest sequence in the batch. Default is false.
* @param padding_side side to pad, either "left" or "right". If not defined, the value is taken from the IR (where the default value from the original HF/GGUF model is stored).
* @return pair of [input_ids, attention_mask]
*/
template <typename... Properties>
@@ -313,6 +315,7 @@ class OPENVINO_GENAI_EXPORTS Tokenizer {
static constexpr ov::Property<bool> add_special_tokens{"add_special_tokens"};
static constexpr ov::Property<bool> skip_special_tokens{"skip_special_tokens"};
static constexpr ov::Property<bool> pad_to_max_length{"pad_to_max_length"};
static constexpr ov::Property<std::string> padding_side{"padding_side"};

} // namespace genai
} // namespace ov
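For context, a minimal usage sketch (not part of this diff) of the property-based encode overload documented above. The model directory, prompt, and printed shape are illustrative; ov::genai::max_length is assumed to come from the GenAI property set in generation_config.hpp.

```cpp
#include <iostream>
#include <string>

#include "openvino/genai/generation_config.hpp"  // assumed location of ov::genai::max_length
#include "openvino/genai/tokenizer.hpp"

int main() {
    // Assumed: a directory with a converted openvino_tokenizers IR.
    ov::genai::Tokenizer tokenizer("tokenizer_model_dir");

    // Truncate/pad to 64 tokens and pad on the left instead of the side stored in the IR.
    ov::genai::TokenizedInputs inputs = tokenizer.encode(
        std::string{"Hello, world"},
        ov::genai::add_special_tokens(true),
        ov::genai::max_length(64),
        ov::genai::pad_to_max_length(true),
        ov::genai::padding_side("left"));

    // TokenizedInputs holds the [input_ids, attention_mask] pair as ov::Tensor objects.
    std::cout << inputs.input_ids.get_shape() << std::endl;
    return 0;
}
```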
64 changes: 58 additions & 6 deletions src/cpp/src/tokenizer/make_tokenizer_stateful.cpp
@@ -14,6 +14,7 @@
#include "openvino/op/assign.hpp"
#include "openvino/op/constant.hpp"
#include <openvino/pass/manager.hpp>
#include <openvino/core/graph_util.hpp>

using namespace ov;
using namespace ov::op;
@@ -108,6 +109,23 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr<ov
return true;
}

class ReadPadRightAttributes : public ov::AttributeVisitor {
private:
bool m_pad_right = true;
public:
void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
if (name != "pad_right") {
return;
}
if (auto a = ov::as_type<ov::AttributeAdapter<bool>>(&adapter)) {
m_pad_right = a->get();
}
}

bool get_pad_right() const {
return m_pad_right;
}
};

bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr<ov::Model>& model) {
std::shared_ptr<ov::Node> combine_seg_node;
@@ -243,19 +261,53 @@ bool ov::genai::MakePaddingSatateful::run_on_model(const std::shared_ptr<ov::Mod
auto pad_to_max_length_var = std::make_shared<op::util::Variable>(op::util::VariableInfo{ov::Shape{1}, ov::element::boolean, ov::genai::PAD_TO_MAX_LENGTH_VAR_ID});
auto default_false_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{1}, std::vector{false});
auto pad_to_max_length_rv = std::make_shared<v6::ReadValue>(default_false_const, pad_to_max_length_var);
model->add_sinks({std::make_shared<v6::Assign>(pad_to_max_length_rv, pad_to_max_length_var)});
model->add_variables({pad_to_max_length_var});

auto select_node = std::make_shared<v1::Select>(pad_to_max_length_rv, max_length_rv, zero_constant);

// If the user called encode without explicitly specifying a padding side, we should pad to the default side.
// Here we get that side from the RaggedToDense node's attribute.
auto pad_right_attr_visitor = ReadPadRightAttributes();
bool first_iter = true;
bool default_pad_right = true;
for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
if (!ragged_to_dense_node) {
return true; // true since at this point we already have modified the graph.
}
ragged_to_dense_node->visit_attributes(pad_right_attr_visitor);
if (first_iter) {
default_pad_right = pad_right_attr_visitor.get_pad_right();
} else if (pad_right_attr_visitor.get_pad_right() != default_pad_right) {
return true; // true since at this point we already have modified the graph.
}
first_iter = false;
}

// Add padding side variable.
auto pad_right_var = std::make_shared<op::util::Variable>(op::util::VariableInfo{ov::Shape{}, ov::element::boolean, ov::genai::PAD_RIGHT_VAR_ID});
auto pad_right_const = std::make_shared<v0::Constant>(ov::element::boolean, ov::Shape{}, std::vector{default_pad_right});
auto pad_right_rv = std::make_shared<v6::ReadValue>(pad_right_const, pad_right_var);

// This loop cannot be merged with the loop above: first we need to ensure that all RaggedToDense nodes have the same padding side,
// and only then start modifying the graph. Therefore we iterate over the RaggedToDense nodes twice. In 99% of cases there is only one
// RaggedToDense node; in the remaining cases there are two RaggedToDense nodes with the same padding side, as created by openvino_tokenizers.
for (auto ragged_to_dense_node : ragged_to_dense_nodes) {
if (!ragged_to_dense_node) {
return true; // true since at this point we already have modified the graph.s
return true; // true since at this point we already have modified the graph.
}

auto new_inputs = ragged_to_dense_node->input_values();
new_inputs.emplace_back(pad_right_rv->output(0));
auto new_ragged_to_dense = ragged_to_dense_node->clone_with_new_inputs(new_inputs);

auto max_op = std::make_shared<v1::Maximum>(new_ragged_to_dense->input_value(3), select_node);
new_ragged_to_dense->input(3).replace_source_output(max_op->output(0));

auto max_op = std::make_shared<v1::Maximum>(ragged_to_dense_node->input_value(3), select_node);
ragged_to_dense_node->input(3).replace_source_output(max_op->output(0));
ov::replace_node(ragged_to_dense_node, new_ragged_to_dense);
}

model->add_sinks({std::make_shared<v6::Assign>(pad_right_rv, pad_right_var)});
model->add_variables({pad_right_var});
model->add_sinks({std::make_shared<v6::Assign>(pad_to_max_length_rv, pad_to_max_length_var)});
model->add_variables({pad_to_max_length_var});

return true;
}
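For readers unfamiliar with the mechanism the pass relies on, below is a minimal, self-contained sketch (not part of the PR) of a ReadValue/Assign state with a default initializer, which a caller can later override through VariableState::set_state. The toy model, variable name, and CPU device are illustrative assumptions; MakePaddingSatateful applies the same pattern to the real tokenizer graph.

```cpp
#include <openvino/openvino.hpp>
#include <openvino/op/assign.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/read_value.hpp>
#include <openvino/op/result.hpp>
#include <openvino/op/util/variable.hpp>

int main() {
    using namespace ov;

    // State variable whose default value is baked in as the ReadValue initializer,
    // mirroring how pad_right_rv is constructed in the pass above.
    auto var = std::make_shared<op::util::Variable>(
        op::util::VariableInfo{Shape{}, element::boolean, "pad_right"});
    auto default_true = std::make_shared<op::v0::Constant>(element::boolean, Shape{}, std::vector{true});
    auto read = std::make_shared<op::v6::ReadValue>(default_true, var);
    auto result = std::make_shared<op::v0::Result>(read);

    auto model = std::make_shared<Model>(ResultVector{result}, ParameterVector{});
    model->add_sinks({std::make_shared<op::v6::Assign>(read, var)});
    model->add_variables({var});

    Core core;
    auto compiled = core.compile_model(model, "CPU");
    auto request = compiled.create_infer_request();

    // Without touching the state, the model sees the default (pad right).
    // To pad on the left for this request, overwrite the state value.
    for (auto&& state : request.query_state()) {
        if (state.get_name() == "pad_right") {
            char flag = 0;  // false -> pad left; boolean tensors use byte storage
            state.set_state(Tensor(element::boolean, Shape{}, &flag));
        }
    }
    request.infer();
    return 0;
}
```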
1 change: 1 addition & 0 deletions src/cpp/src/tokenizer/make_tokenizer_stateful.hpp
@@ -90,6 +90,7 @@ inline const std::string SKIP_SPECIAL_TOKENS_VAR_ID = "skip_special_tokens";
inline const std::string MAX_LENGTH_VAR_ID = "max_length";
inline const std::string IS_MAX_LENGTH_SET = "is_max_length_set";
inline const std::string PAD_TO_MAX_LENGTH_VAR_ID = "pad_to_max_length";
inline const std::string PAD_RIGHT_VAR_ID = "pad_right";

} // namespace genai
} // namespace ov
43 changes: 37 additions & 6 deletions src/cpp/src/tokenizer/tokenizer.cpp
@@ -260,11 +260,25 @@ class Tokenizer::TokenizerImpl {
std::optional<bool> skip_special_tokens_flag = true;
std::optional<int32_t> max_length_val;
std::optional<bool> pad_to_max_length_val = false;

std::optional<std::string> padding_side_val = std::nullopt;

ov::genai::utils::read_anymap_param(params, add_special_tokens.name(), add_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, skip_special_tokens.name(), skip_special_tokens_flag);
ov::genai::utils::read_anymap_param(params, pad_to_max_length.name(), pad_to_max_length_val);
ov::genai::utils::read_anymap_param(params, max_length.name(), max_length_val);
ov::genai::utils::read_anymap_param(params, padding_side.name(), padding_side_val);
std::optional<bool> pad_right;

// If padding_side is not set, we leave it as nullopt; this indicates that the default value from the RaggedToDense attribute will be used.
if (padding_side_val.has_value()) {
OPENVINO_ASSERT(
padding_side_val == "left" || padding_side_val == "right",
"padding_side should be either 'left' or 'right', but got: ",
*padding_side_val
);
pad_right = (*padding_side_val == "right") ? true : false;
}

std::optional<bool> is_max_length_set_val = max_length_val.has_value();

ov::AnyMap& state_flags = m_request_to_state_flags[&infer_request_guard.get()];
@@ -282,6 +296,8 @@ class Tokenizer::TokenizerImpl {
set_state_value(state, pad_to_max_length_val, state_flags);
} else if (name == IS_MAX_LENGTH_SET) {
set_state_value(state, is_max_length_set_val, state_flags);
} else if (name == PAD_RIGHT_VAR_ID) {
set_state_value(state, pad_right, state_flags);
}
}
}
@@ -845,27 +861,42 @@ Tokenizer::Tokenizer(const std::string& model_str, ov::Tensor& weights_tensor, c
}

TokenizedInputs Tokenizer::encode(const std::string& prompt, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(std::move(prompt), tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::pair<std::string, std::string>>& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::string>& prompts_1, const std::vector<std::string>& prompts_2, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts_1, prompts_2, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::vector<std::string>& prompts, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return m_pimpl->encode(prompts, tokenization_params);
}

TokenizedInputs Tokenizer::encode(const std::initializer_list<std::string>& text, const ov::AnyMap& tokenization_params) {
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(), ov::genai::max_length.name(), ov::genai::pad_to_max_length.name()});
check_arguments(tokenization_params, {ov::genai::add_special_tokens.name(),
ov::genai::max_length.name(),
ov::genai::pad_to_max_length.name(),
ov::genai::padding_side.name()});
return encode(std::vector<std::string>(text.begin(), text.end()), tokenization_params);
}
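
A hedged sketch (not in the diff) of driving the AnyMap-based overloads above directly; the helper name and prompt values are hypothetical. The keys mirror the properties accepted by check_arguments, and padding_side is read as std::optional<std::string>, so the Any should hold a std::string.

```cpp
#include <string>
#include <vector>

#include "openvino/genai/tokenizer.hpp"

// Hypothetical helper: left-pad a batch of prompts using the AnyMap overload of encode.
ov::genai::TokenizedInputs encode_left_padded(ov::genai::Tokenizer& tokenizer,
                                              const std::vector<std::string>& prompts) {
    ov::AnyMap params{
        {ov::genai::add_special_tokens.name(), true},
        {ov::genai::pad_to_max_length.name(), true},
        {ov::genai::padding_side.name(), std::string{"left"}},
    };
    // Without an explicit max_length, the value stored in the IR is used.
    return tokenizer.encode(prompts, params);
}
```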

43 changes: 38 additions & 5 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -2971,25 +2971,58 @@ class Tokenizer:
Decode a batch of tokens into a list of string prompt.
"""
@typing.overload
def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of prompts into tokenized inputs.
Args:
'prompts' - list of prompts to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompt: str, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a single prompt into tokenized input.
Args:
'prompt' - prompt to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts_1: collections.abc.Sequence[str], prompts_2: collections.abc.Sequence[str], add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of prompts into tokenized inputs. The number of strings must be the same, or one of the inputs can contain one string.
In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.
In the latter case, the single-string input will be broadcast into the shape of the other input, which is more efficient than repeating the string in pairs.)
Args:
'prompts_1' - list of prompts to encode
'prompts_2' - list of prompts to encode
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
@typing.overload
def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None) -> TokenizedInputs:
def encode(self, prompts: list, add_special_tokens: bool = True, pad_to_max_length: bool = False, max_length: typing.SupportsInt | None = None, padding_side: str | None = None) -> TokenizedInputs:
"""
Encodes a list of paired prompts into tokenized inputs. Input format is same as for HF paired input [[prompt_1, prompt_2], ...].
Args:
'prompts' - list of prompts to encode\\n
'add_special_tokens' - whether to add special tokens like BOS, EOS, PAD. Default is True.
'pad_to_max_length' - whether to pad the sequence to the maximum length. Default is False.
'max_length' - maximum length of the sequence. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
'padding_side' - side to pad the sequence, can be 'left' or 'right'. If None (default), the value will be taken from the IR (where default value from original HF/GGUF model is stored).
Returns:
TokenizedInputs object containing input_ids and attention_mask tensors.
"""
def get_bos_token(self) -> str:
...