Merged

Commits (30):
- a2f3528 Add pipeline (as-suvorov, Jul 10, 2025)
- 8da1701 Release gil in python bindings (as-suvorov, Jul 10, 2025)
- 567cea6 apply sigmoid (as-suvorov, Jul 11, 2025)
- 42a4601 Use langchain for reranker tests (as-suvorov, Jul 11, 2025)
- 7504e8c Update cpp samples readme (as-suvorov, Jul 11, 2025)
- 521d926 Add python sample and readme (as-suvorov, Jul 11, 2025)
- 7e488e0 Add samples test (as-suvorov, Jul 11, 2025)
- 3a425db Support 2 classes output models (as-suvorov, Jul 15, 2025)
- 46b654e Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 15, 2025)
- 309c7d1 Fix sample run (as-suvorov, Jul 15, 2025)
- 333d1f9 Fix sample run model id (as-suvorov, Jul 15, 2025)
- 57dcdaf unfix reranker version (as-suvorov, Jul 15, 2025)
- ad4fa6a Use langchain (as-suvorov, Jul 16, 2025)
- 21f52e2 Provide model task (as-suvorov, Jul 16, 2025)
- dc43a10 Convert 2 inputs tokenizer for samples tests (as-suvorov, Jul 17, 2025)
- be22a89 Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 17, 2025)
- cd4422e Unskip tokenizers test (as-suvorov, Jul 17, 2025)
- 35e4929 Fix pybindings docstring (as-suvorov, Jul 17, 2025)
- 8e2c7b6 Address review comments (as-suvorov, Jul 17, 2025)
- ea286ee Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 17, 2025)
- e819507 Add max_length (as-suvorov, Jul 17, 2025)
- 17154f9 Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 18, 2025)
- a5fa86e Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 23, 2025)
- 610052c Update bindings (as-suvorov, Jul 23, 2025)
- 37d1b00 Align pyi (as-suvorov, Jul 24, 2025)
- 4fca17d Merge branch 'master' into as/rag_text_reranker (Wovchena, Jul 24, 2025)
- 244e21d Merge remote-tracking branch 'upstream/master' into as/rag_text_reranker (as-suvorov, Jul 28, 2025)
- 64cef5a Fix pyi (as-suvorov, Jul 28, 2025)
- 6f61fdd Merge remote-tracking branch 'origin/as/rag_text_reranker' into as/ra… (as-suvorov, Jul 28, 2025)
- 0985787 Merge branch 'master' into as/rag_text_reranker (as-suvorov, Jul 29, 2025)
2 changes: 1 addition & 1 deletion .github/labeler.yml
@@ -107,7 +107,7 @@
'category: RAG samples':
- 'samples/cpp/rag/**/*'
- 'samples/python/rag/**/*'
- 'tests/python_tests/samples/test_text_embedding_pipeline.py'
- 'tests/python_tests/samples/test_rag.py'

'category: structured output generation':
- 'src/cpp/src/sampling/structured_output/*'
2 changes: 1 addition & 1 deletion samples/cpp/rag/CMakeLists.txt
@@ -21,7 +21,7 @@ function(add_sample_executable target_name)
EXCLUDE_FROM_ALL)
endfunction()

set(SAMPLE_LIST text_embeddings)
set(SAMPLE_LIST text_embeddings text_rerank)

foreach(sample ${SAMPLE_LIST})
add_sample_executable(${sample})
35 changes: 32 additions & 3 deletions samples/cpp/rag/README.md
@@ -1,6 +1,6 @@
# Retrieval Augmented Generation Sample

This example showcases inference of Text Embedding Models. The application has limited configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::TextEmbeddingPipeline` and uses text as an input source.
This example showcases inference of Text Embedding and Text Rerank Models. The application has limited configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `ov::genai::TextEmbeddingPipeline` and `ov::genai::TextRerankPipeline` and uses text as an input source.

## Download and Convert the Model and Tokenizers

@@ -10,17 +10,37 @@

```sh
pip install --upgrade-strategy eager -r ../../export-requirements.txt
```

Then, run the export with Optimum CLI:

```sh
optimum-cli export openvino --trust-remote-code --model BAAI/bge-small-en-v1.5 BAAI/bge-small-en-v1.5
```
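
A rerank model can be exported the same way. As an illustrative sketch (the model name is an assumption; check the Supported Models page for the rerank models actually covered):

```sh
optimum-cli export openvino --trust-remote-code --model BAAI/bge-reranker-base BAAI/bge-reranker-base
```

Some rerank models may additionally need an explicit `--task text-classification` flag, since cross-encoder rerankers are typically exported as sequence-classification models.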


## Run

Follow [Get Started with Samples](https://docs.openvino.ai/2025/get-started/learn-openvino/openvino-samples/get-started-demos.html) to run the sample.

`text_embeddings BAAI/bge-small-en-v1.5 "Document 1" "Document 2"`

### 1. Text Embedding Sample (`text_embeddings.cpp`)
- **Description:**
Demonstrates inference of text embedding models using OpenVINO GenAI. Converts input text into vector embeddings for downstream tasks such as retrieval or semantic search.
- **Run Command:**
```sh
text_embeddings <MODEL_DIR> "Document 1" "Document 2"
```
Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#text-embeddings-models) for more details.

### 2. Text Rerank Sample (`text_rerank.cpp`)
- **Description:**
Demonstrates inference of text rerank models using OpenVINO GenAI. Reranks a list of candidate documents based on their relevance to a query using a cross-encoder or reranker model.
- **Run Command:**
```sh
text_rerank <MODEL_DIR> '<QUERY>' '<TEXT 1>' ['<TEXT 2>' ...]
```
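
For example, with illustrative arguments (the model directory and texts are placeholders):

```sh
text_rerank ./bge-reranker-base 'What is the capital of France?' 'Paris is the capital of France.' 'The Eiffel Tower is located in Paris.'
```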


# Text Embedding Pipeline Usage

```c++
#include "openvino/genai/rag/text_embedding_pipeline.hpp"

ov::genai::TextEmbeddingPipeline pipeline(models_path, device, config);
std::vector<ov::genai::EmbeddingResult> embeddings = pipeline.embed_documents(documents);
```

# Text Rerank Pipeline Usage

```c++
#include "openvino/genai/rag/text_rerank_pipeline.hpp"

ov::genai::TextRerankPipeline pipeline(models_path, device, config);
std::vector<std::pair<size_t, float>> rerank_result = pipeline.rerank(query, documents);
```
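
The header added in this PR also declares an asynchronous API (`start_rerank_async`/`wait_rerank`); a minimal sketch, assuming `models_path`, `device`, `config`, `query`, and `documents` are set up as above:

```c++
#include "openvino/genai/rag/text_rerank_pipeline.hpp"

ov::genai::TextRerankPipeline pipeline(models_path, device, config);

// Only one async call may be active at a time.
pipeline.start_rerank_async(query, documents);
// ... do other work while reranking runs ...
std::vector<std::pair<size_t, float>> rerank_result = pipeline.wait_rerank();
```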
45 changes: 45 additions & 0 deletions samples/cpp/rag/text_rerank.cpp
@@ -0,0 +1,45 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/rag/text_rerank_pipeline.hpp"

int main(int argc, char* argv[]) try {
if (argc < 4) {
throw std::runtime_error(std::string{"Usage: "} + argv[0] +
" <MODEL_DIR> '<QUERY>' '<TEXT 1>' ['<TEXT 2>' ...]");
}

auto documents = std::vector<std::string>(argv + 3, argv + argc);
Collaborator: I think it would be more convenient to specify document files, which is closer to a practical use case.

Collaborator (author): Let's keep the sample simple for the moment. We could implement file/network loaders and text splitters, but I think that should be a dedicated sample where we can showcase the full RAG flow.

std::string models_path = argv[1];
std::string query = argv[2];

std::string device = "CPU"; // GPU can be used as well

ov::genai::TextRerankPipeline::Config config;
config.top_n = 3;

ov::genai::TextRerankPipeline pipeline(models_path, device, config);

std::vector<std::pair<size_t, float>> rerank_result = pipeline.rerank(query, documents);

// print reranked documents
std::cout << std::fixed << std::setprecision(4);
std::cout << "Reranked documents:\n";
for (const auto& [index, score] : rerank_result) {
std::cout << "Document " << index << " (score: " << score << "): " << documents[index] << '\n';
}
std::cout << std::defaultfloat;

} catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
} catch (const std::ios_base::failure&) {
}
return EXIT_FAILURE;
} catch (...) {
try {
std::cerr << "Non-exception object thrown\n";
} catch (const std::ios_base::failure&) {
}
return EXIT_FAILURE;
}
30 changes: 27 additions & 3 deletions samples/python/rag/README.md
@@ -1,6 +1,6 @@
# Retrieval Augmented Generation Sample

This example showcases inference of Text Embedding Models. The application limited configuration configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.TextEmbeddingPipeline` and uses text as an input source.
This example showcases inference of Text Embedding and Text Rerank Models. The application has limited configuration options to encourage the reader to explore and modify the source code. For example, change the device for inference to GPU. The sample features `openvino_genai.TextEmbeddingPipeline` and `openvino_genai.TextRerankPipeline` and uses text as an input source.

## Download and Convert the Model and Tokenizers

@@ -38,10 +38,24 @@ export_tokenizer(tokenizer, output_dir)

Install [deployment-requirements.txt](../../deployment-requirements.txt) via `pip install -r ../../deployment-requirements.txt` and then, run a sample:

`python text_embeddings.py BAAI/bge-small-en-v1.5 "Document 1" "Document 2"`

### 1. Text Embedding Sample (`text_embeddings.py`)
- **Description:**
Demonstrates inference of text embedding models using OpenVINO GenAI. Converts input text into vector embeddings for downstream tasks such as retrieval or semantic search.
- **Run Command:**
```sh
python text_embeddings.py <MODEL_DIR> "Document 1" "Document 2"
```
Refer to the [Supported Models](https://openvinotoolkit.github.io/openvino.genai/docs/supported-models/#text-embeddings-models) for more details.

### 2. Text Rerank Sample (`text_rerank.py`)
- **Description:**
Demonstrates inference of text rerank models using OpenVINO GenAI. Reranks a list of candidate documents based on their relevance to a query using a cross-encoder or reranker model.
- **Run Command:**
```sh
python text_rerank.py <MODEL_DIR> "<QUERY>" "<TEXT 1>" ["<TEXT 2>" ...]
```


# Text Embedding Pipeline Usage

```python
import openvino_genai

pipeline = openvino_genai.TextEmbeddingPipeline(model_dir, "CPU")

embeddings = pipeline.embed_documents(["document1", "document2"])
```

# Text Rerank Pipeline Usage

```python
import openvino_genai

pipeline = openvino_genai.TextRerankPipeline(model_dir, "CPU")

rerank_result = pipeline.rerank(query, documents)
```

Collaborator: I think we can consider combining TextRerankPipeline and TextEmbeddingPipeline into a TextRagPipeline that has separate methods for embedding retrieval and ranking. We would have just one object in memory that solves the RAG problem.

Collaborator (author): There is a ContextualCompressionRetriever in LangChain which wraps a retriever and a reranker. The base method is invoke(query) -> relevant documents. But we would need to implement a retriever which utilizes TextEmbeddingPipeline first. I will look into different combining scenarios, thanks.
31 changes: 31 additions & 0 deletions samples/python/rag/text_rerank.py
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import openvino_genai


def main():
parser = argparse.ArgumentParser()
parser.add_argument("model_dir")
parser.add_argument("query")
parser.add_argument("texts", nargs="+")
args = parser.parse_args()

device = "CPU" # GPU can be used as well

config = openvino_genai.TextRerankPipeline.Config()
config.top_n = 3

pipeline = openvino_genai.TextRerankPipeline(args.model_dir, device, config)

rerank_result = pipeline.rerank(args.query, args.texts)

print("Reranked documents:")
for index, score in rerank_result:
print(f"Document {index} (score: {score:.4f}): {args.texts[index]}")


if __name__ == "__main__":
main()
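
The sample can be invoked as described in the README; for example (the model directory and texts are placeholders):

```sh
python text_rerank.py ./bge-reranker-base "What is the capital of France?" "Paris is the capital of France." "The Eiffel Tower is located in Paris."
```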
102 changes: 102 additions & 0 deletions src/cpp/include/openvino/genai/rag/text_rerank_pipeline.hpp
@@ -0,0 +1,102 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/genai/tokenizer.hpp"

namespace ov {
namespace genai {

class OPENVINO_GENAI_EXPORTS TextRerankPipeline {
public:
struct OPENVINO_GENAI_EXPORTS Config {
/**
* @brief Number of documents to return sorted by score
*/
size_t top_n = 3;

/**
* @brief Constructs text rerank pipeline configuration
*/
Config() = default;

/**
* @brief Constructs text rerank pipeline configuration
*
* @param properties configuration options
*
* const ov::AnyMap properties{{"top_n", 3}};
* ov::genai::TextRerankPipeline::Config config(properties);
*
* ov::genai::TextRerankPipeline::Config config({{"top_n", 3}});
*/
explicit Config(const ov::AnyMap& properties);
};
Contributor: Add max_length, pad_to_max_length, and batch_size to the config.

Collaborator (author): I would rather do it in a separate PR, after we confirm the approach works for TextEmbeddingPipeline.

Contributor: max_length should be added in any case; some models might crash during inference otherwise. By the way, we can set the upper bound for the input shape if the max_length parameter is set. It might be beneficial for performance.

Collaborator (author): OK, I will add max_length. Could you please specify which models are expected to fail?

Collaborator (author): Are you sure we need max_length in any case? Neither LlamaIndex nor LangChain has a max_length parameter.


/**
* @brief Constructs a pipeline from xml/bin files, tokenizer and configuration in the same dir.
*
* @param models_path Path to the directory containing model xml/bin files and tokenizer
* @param device Device
* @param config Pipeline configuration
* @param properties Optional plugin properties to pass to ov::Core::compile_model().
*/
TextRerankPipeline(const std::filesystem::path& models_path,
const std::string& device,
const Config& config,
const ov::AnyMap& properties = {});

/**
* @brief Constructs a pipeline from xml/bin files, tokenizer and configuration in the same dir.
*
* @param models_path Path to the directory containing model xml/bin files and tokenizer
* @param device Device
* @param properties Optional plugin and/or config properties
*/
TextRerankPipeline(const std::filesystem::path& models_path,
const std::string& device,
const ov::AnyMap& properties = {});

/**
* @brief Constructs a pipeline from xml/bin files, tokenizer and configuration in the same dir.
*
* @param models_path Path to the directory containing model xml/bin files and tokenizer
* @param device Device
* @param properties Plugin and/or config properties
*/
template <typename... Properties,
typename std::enable_if<ov::util::StringAny<Properties...>::value, bool>::type = true>
TextRerankPipeline(const std::filesystem::path& models_path, const std::string& device, Properties&&... properties)
: TextRerankPipeline(models_path, device, ov::AnyMap{std::forward<Properties>(properties)...}) {}

/**
* @brief Reranks a vector of texts based on the query.
*/
std::vector<std::pair<size_t, float>> rerank(const std::string& query, const std::vector<std::string>& texts);

/**
* @brief Asynchronously reranks a vector of texts based on the query. Only one method of the async family
* can be active at a time.
*/
void start_rerank_async(const std::string& query, const std::vector<std::string>& texts);

/**
* @brief Waits for reranked texts.
*/
std::vector<std::pair<size_t, float>> wait_rerank();

~TextRerankPipeline();

private:
class TextRerankPipelineImpl;
std::unique_ptr<TextRerankPipelineImpl> m_impl;
};

/**
* @brief Number of documents to return after reranking sorted by score
*/
static constexpr ov::Property<size_t> top_n{"top_n"};

} // namespace genai
} // namespace ov
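
Given the `top_n` property and the variadic constructor above, the pipeline can also be configured without building a `Config` explicitly; a minimal sketch (paths are placeholders):

```c++
#include "openvino/genai/rag/text_rerank_pipeline.hpp"

// top_n is forwarded into the pipeline configuration as a named property.
ov::genai::TextRerankPipeline pipeline("path/to/model_dir", "CPU", ov::genai::top_n(3));
```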
2 changes: 2 additions & 0 deletions src/cpp/include/openvino/genai/tokenizer.hpp
@@ -7,6 +7,7 @@
#include <vector>
#include <initializer_list>
#include <filesystem>
#include <optional>

#include "openvino/runtime/tensor.hpp"
#include "openvino/genai/visibility.hpp"
@@ -21,6 +22,7 @@ using Vocab = std::unordered_map<std::string, int64_t>; // similar to huggingface
struct TokenizedInputs {
ov::Tensor input_ids;
ov::Tensor attention_mask;
std::optional<ov::Tensor> token_type_ids;
};

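A hedged sketch of how a caller might consume the new optional `token_type_ids` field (the tokenizer setup and `infer_request` are assumptions for illustration):

```c++
#include "openvino/genai/tokenizer.hpp"

ov::genai::Tokenizer tokenizer(models_path);
ov::genai::TokenizedInputs inputs = tokenizer.encode(text);

// token_type_ids is only populated for tokenizers that emit a second
// segment input, e.g. two-input cross-encoder rerank models.
if (inputs.token_type_ids) {
    infer_request.set_tensor("token_type_ids", *inputs.token_type_ids);
}
```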
43 changes: 34 additions & 9 deletions src/cpp/src/debug_utils.hpp
@@ -3,34 +3,59 @@

#pragma once

#include <string>
#include <iostream>
#include <fstream>

#include <iostream>
#include <openvino/runtime/tensor.hpp>
#include <string>

template <typename T>
void print_array(T * array, size_t size) {
void print_array(T* array, size_t size) {
std::cout << " => [ ";
for (size_t i = 0; i < std::min(size, size_t(10)); ++i) {
std::cout << array[i] << " ";
}
std::cout << " ] " << std::endl;
}

template <typename T>
void print_tensor(ov::Tensor tensor) {
const auto shape = tensor.get_shape();
const size_t rank = shape.size();
const auto* data = tensor.data<T>();

if (rank > 2) {
print_array(data, tensor.get_size());
return;
}

const size_t batch_size = shape[0];
const size_t seq_length = shape[1];

std::cout << " => [ \n";
for (size_t batch = 0; batch < batch_size; ++batch) {
std::cout << " [ ";
const size_t batch_offset = batch * seq_length;
for (size_t j = 0; j < seq_length; ++j) {
std::cout << data[batch_offset + j] << " ";
}
std::cout << "]\n";
}
std::cout << " ]" << std::endl;
}

inline void print_tensor(std::string name, ov::Tensor tensor) {
std::cout << name;
std::cout << " " << tensor.get_shape().to_string();
if (tensor.get_element_type() == ov::element::i32) {
print_array(tensor.data<int>(), tensor.get_size());
print_tensor<int>(tensor);
} else if (tensor.get_element_type() == ov::element::i64) {
print_array(tensor.data<int64_t>(), tensor.get_size());
print_tensor<int64_t>(tensor);
} else if (tensor.get_element_type() == ov::element::f32) {
print_array(tensor.data<float>(), tensor.get_size());
print_tensor<float>(tensor);
} else if (tensor.get_element_type() == ov::element::boolean) {
print_array(tensor.data<bool>(), tensor.get_size());
print_tensor<bool>(tensor);
} else if (tensor.get_element_type() == ov::element::f16) {
print_array(tensor.data<ov::float16>(), tensor.get_size());
print_tensor<ov::float16>(tensor);
}
}
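
Usage is unchanged by this refactor; a small illustration (the tensor name and variable are placeholders):

```c++
// Rank-2 tensors now print row by row instead of a flat 10-element prefix.
print_tensor("attention_mask", attention_mask_tensor);
```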
