sgl-project
diff --git a/‎python/sglang/srt/managers/scheduler.py‎
Lines changed: 17 additions & 1 deletion b/‎python/sglang/srt/managers/scheduler.py‎
Lines changed: 17 additions & 1 deletion
diff --git a/‎python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format‎
Lines changed: 1 addition & 0 deletions b/‎python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/sglang/srt/mem_cache/cpp_radix_tree/common.h‎
Lines changed: 29 additions & 0 deletions b/‎python/sglang/srt/mem_cache/cpp_radix_tree/common.h‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py‎
Lines changed: 182 additions & 0 deletions b/‎python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py‎
Lines changed: 182 additions & 0 deletions
diff --git a/‎python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp‎
Lines changed: 143 additions & 0 deletions b/‎python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp‎
Lines changed: 143 additions & 0 deletions
@@ -569,7 +569,23 @@ def init_memory_pool_and_cache(self):
                 page_size=self.page_size,
             )
         else:
-            if self.enable_hierarchical_cache:
+            if os.environ.get("SGLANG_EXPERIMENTAL_CPP_RADIX_TREE") == "1":
+                # lazy import to avoid JIT overhead
+                from sglang.srt.mem_cache.radix_cache_cpp import RadixCacheCpp
+
+                self.tree_cache = RadixCacheCpp(
+                    disable=False,
+                    use_hicache=self.enable_hierarchical_cache,
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool=self.token_to_kv_pool_allocator,
+                    tp_cache_group=self.tp_cpu_group,
+                    page_size=self.page_size,
+                    hicache_ratio=server_args.hicache_ratio,
+                    hicache_size=server_args.hicache_size,
+                    hicache_write_policy=server_args.hicache_write_policy,
+                    enable_kv_cache_events=self.enable_kv_cache_events,
+                )
+            elif self.enable_hierarchical_cache:
                 self.tree_cache = HiRadixCache(
                     req_to_token_pool=self.req_to_token_pool,
                     token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
 
@@ -0,0 +1 @@
+../../../../../sgl-kernel/.clang-format
@@ -0,0 +1,29 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <source_location>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace radix_tree_v2 {
+
+// A single token id.
+using token_t = std::int32_t;
+// Owning sequence of token ids.
+using token_vec_t = std::vector<token_t>;
+// Read-only, non-owning view over a contiguous run of token ids.
+using token_slice = std::span<const token_t>;
+// Integer handle used to refer to a tree node across the API boundary.
+using NodeHandle = std::size_t;
+// Integer ticket used to refer to an IO operation across the API boundary.
+using IOTicket = std::uint32_t;
+
+/// Runtime check that throws instead of aborting.
+/// @param condition  the invariant that is expected to hold
+/// @param message    text placed at the start of the exception message
+/// @param loc        call site; defaults to the location of the caller
+/// @throws std::runtime_error with "<message> at <file>:<line> in <function>"
+///         when `condition` is false
+inline void _assert(
+    bool condition,
+    const char* message = "Assertion failed",
+    std::source_location loc = std::source_location::current()) {
+  if (condition) [[likely]] {
+    return;
+  }
+  // failure path only: assemble the diagnostic piece by piece
+  std::string what = message;
+  what += " at ";
+  what += loc.file_name();
+  what += ":";
+  what += std::to_string(loc.line());
+  what += " in ";
+  what += loc.function_name();
+  throw std::runtime_error(what);
+}
+
+}  // namespace radix_tree_v2
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+from torch.utils.cpp_extension import load
+
+# Directory of this module; the C++ sources listed below are resolved relative to it.
+_abs_path = os.path.dirname(os.path.abspath(__file__))
+# Build and load the C++ extension just-in-time via torch.utils.cpp_extension.load.
+# This runs at import time, so importing this module may trigger a compilation.
+# The sources use C++20 features, hence the explicit -std=c++20 flag.
+radix_tree_cpp = load(
+    name="radix_tree_cpp",
+    sources=[
+        f"{_abs_path}/tree_v2_binding.cpp",
+        f"{_abs_path}/tree_v2_debug.cpp",
+        f"{_abs_path}/tree_v2.cpp",
+    ],
+    extra_cflags=["-O3", "-std=c++20"],
+)
+
+if TYPE_CHECKING:
+
+    class TreeNodeCpp:
+        """
+        Type-checking placeholder for a tree node handle produced by the C++ radix tree
+        (e.g. the node values returned by `RadixTreeCpp.match_prefix`).
+        It cannot be constructed from Python; at runtime the name is aliased to `object`.
+        """
+
+    class IOHandle:
+        """
+        Type-checking placeholder for an IO handle produced by the C++ radix tree
+        (returned by `writing_through` / `loading_onboard` and passed back to the
+        matching `commit_*` method).
+        It cannot be constructed from Python; at runtime the name is aliased to `object`.
+        """
+
+    class RadixTreeCpp:
+        def __init__(
+            self,
+            disabled: bool,
+            host_size: Optional[int],
+            page_size: int,
+            write_through_threshold: int,
+        ):
+            """
+            Initializes the RadixTreeCpp instance.
+            Args:
+                disabled (bool): If True, the radix tree is disabled.
+                host_size (Optional[int]): Size of the radix tree on the CPU. None means no CPU tree.
+                page_size (int): Size of the page for the radix tree.
+                write_through_threshold (int): Threshold for writing through from GPU to CPU.
+            """
+            self.tree = radix_tree_cpp.RadixTree(  # type: ignore
+                disabled, host_size, page_size, write_through_threshold
+            )
+
+        def match_prefix(
+            self, prefix: List[int]
+        ) -> Tuple[List[torch.Tensor], int, TreeNodeCpp, TreeNodeCpp]:
+            """
+            Matches a prefix in the radix tree.
+            Args:
+                prefix (List[int]): The prefix to match.
+            Returns:
+                Tuple[List[torch.Tensor], TreeNodeCpp, TreeNodeCpp]:
+                    0. A list of indices that is matched by the prefix on the GPU.
+                    1. Sum length of the indices matched on the CPU.
+                    2. The last node of the prefix matched on the GPU.
+                    3. The last node of the prefix matched on the CPU.
+            """
+            return self.tree.match_prefix(prefix)
+
+        def evict(self, num_tokens: int) -> List[torch.Tensor]:
+            """
+            Evicts a number of tokens from the radix tree.
+            Args:
+                num_tokens (int): The number of tokens to evict.
+            Returns:
+                List[torch.Tensor]: A list of indices that were evicted.
+            """
+            return self.tree.evict(num_tokens)
+
+        def lock_ref(self, handle: TreeNodeCpp, lock: bool) -> None:
+            """
+            Locks or unlocks a reference to a tree node.
+            After locking, the node will not be evicted from the radix tree.
+            Args:
+                handle (TreeNodeCpp): The tree node to lock or unlock.
+                lock (bool): If True, locks the node; if False, unlocks it.
+            """
+            return self.tree.lock_ref(handle, lock)
+
+        def writing_through(
+            self, key: List[int], indices: torch.Tensor
+        ) -> Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+            """
+            Inserts a key-value pair into the radix tree and perform write-through check.
+            Args:
+                key (List[int]): The key to insert.
+                indices (torch.Tensor): The value associated with the key.
+            Returns:
+                Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+                    0. A list of (IOHandle, device indices, host indices) tuples.
+                       These IOhandles require write-through to the CPU in python side.
+                    1. The number of indices that are matched on device.
+            """
+            return self.tree.writing_through(key, indices)
+
+        def loading_onboard(
+            self,
+            host_node: TreeNodeCpp,
+            new_device_indices: torch.Tensor,
+        ) -> Tuple[IOHandle, List[torch.Tensor]]:
+            """
+            Updates the device indices of tree nodes within a range on the tree.
+            Args:
+                host_node (TreeNodeCpp): The tree node on the host, must be descendant of device_node.
+                new_device_indices (torch.Tensor): The new device indices to set.
+                    The length of this tensor must be exactly host indices length.
+            Returns:
+                Tuple[IOHandle, List[torch.Tensor]]:
+                    0. An IOHandle that requires loading to the CPU in python side.
+                    1. A list of host indices corresponding to the new device indices.
+            """
+            return self.tree.loading_onboard(host_node, new_device_indices)
+
+        def commit_writing_through(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the write-through process for a tree node.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the write-through; if False, just indicates failure.
+            """
+            return self.tree.commit_writing_through(handle, success)
+
+        def commit_loading_onboard(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the load onboard process for tree nodes within a range on the tree.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the load-onboard; if False, just indicates failure.
+            """
+            return self.tree.commit_loading_onboard(handle, success)
+
+        def evictable_size(self) -> int:
+            """
+            Returns the size of the evictable part of the radix tree.
+            This is the size of the part that can be evicted from the GPU (ref_count = 0).
+            Returns:
+                int: The size of the evictable part.
+            """
+            return self.tree.evictable_size()
+
+        def protected_size(self) -> int:
+            """
+            Returns the size of the protected part of the radix tree.
+            This is the size of the part that cannot be evicted from the GPU (ref_count > 0).
+            Returns:
+                int: The size of the protected part.
+            """
+            return self.tree.protected_size()
+
+        def total_size(self) -> int:
+            """
+            Returns the total size of the radix tree (including CPU nodes).
+            Returns:
+                int: The total size of the radix tree.
+            """
+            return self.tree.total_size()
+
+        def reset(self) -> None:
+            """
+            Resets the radix tree, clearing all nodes and indices.
+            """
+            return self.tree.reset()
+
+        def debug_print(self) -> None:
+            """
+            Prints the internal state of the radix tree for debugging purposes.
+            """
+            return self.tree.debug_print()
+
+else:
+    # Runtime bindings: use the class exported by the compiled extension directly,
+    # without any Python wrapper in between.
+    RadixTreeCpp = radix_tree_cpp.RadixTree
+    # The handle types have no dedicated Python class; `object` keeps the names
+    # importable so they can still be used in annotations.
+    TreeNodeCpp = object
+    IOHandle = object
@@ -0,0 +1,143 @@
+#include "tree_v2.h"
+
+#include <ATen/core/TensorBody.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/zeros.h>
+#include <c10/util/irange.h>
+
+#include <cstddef>
+#include <memory>
+#include <queue>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "tree_v2_impl.h"
+#include "tree_v2_node.h"
+
+namespace radix_tree_v2 {
+
+// Converts a node pointer into the integer handle exposed to callers (its `node_id`).
+static NodeHandle node2id(TreeNode* node) {
+  const NodeHandle handle = node->node_id;
+  return handle;
+}
+
+// Ordering for the eviction heap built in RadixTree::evict.
+// Eviction is LRU, i.e. the least recently used node has to leave first.
+// std::priority_queue is a max-heap, so the comparison on time() is reversed
+// to bring that node to the top.
+static constexpr auto cmp = [](TreeNode* a, TreeNode* b) { return a->time() > b->time(); };
+
+// Creates the implementation object behind `m_impl`.
+// `host_size` is forwarded twice: once as a flag (was a value supplied?) and
+// once as the value itself, with 0 standing in when it is absent.
+RadixTree::RadixTree(bool disabled, std::optional<std::size_t> host_size, std::size_t page_size, std::size_t threshold)
+    : m_impl(std::make_unique<Impl>(
+          disabled,
+          host_size.has_value(),  // flag: a host size was supplied
+          page_size,
+          host_size.value_or(0),  // the host size, or 0 when none was supplied
+          threshold)) {}
+
+// Defaulted in this translation unit, which includes tree_v2_impl.h
+// (presumably so that Impl is a complete type where m_impl gets destroyed).
+RadixTree::~RadixTree() = default;
+
+// Looks up the longest prefix of `_key` stored in the tree.
+// Returns (device index tensors of the matched nodes, host hit length,
+// handle of the last device node, handle of the last host node).
+// A disabled tree yields a value-initialized tuple.
+std::tuple<std::vector<at::Tensor>, std::size_t, NodeHandle, NodeHandle>
+RadixTree::match_prefix(const token_vec_t& _key) {
+  if (m_impl->disabled) return {};
+
+  // only the first align(size) tokens of the key take part in the lookup
+  const token_slice key{_key.data(), m_impl->align(_key.size())};
+  const auto [last_node, _] = m_impl->tree_walk(key);
+
+  // no separate host-side match is computed here: the device node is the node
+  // the walk ended on, and the host hit length is always reported as zero
+  const auto host_node = last_node;
+  const auto device_node = last_node;
+  std::size_t host_hit_length = 0;
+
+  // gather the device indices along the path to the root, then flip the order
+  std::vector<at::Tensor> indices;
+  walk_to_root(device_node, [&indices](TreeNode* n) { indices.push_back(n->device_indices()); });
+  std::reverse(indices.begin(), indices.end());
+
+  return {std::move(indices), host_hit_length, node2id(device_node), node2id(host_node)};
+}
+
+// Evicts device leaves in the order given by `cmp` until at least `num_tokens`
+// tokens have been released or no candidate is left.
+// Returns the device index tensors of the evicted nodes; the result is empty
+// when the tree is disabled or `num_tokens` is zero.
+std::vector<at::Tensor> RadixTree::evict(std::size_t num_tokens) {
+  if (m_impl->disabled || num_tokens == 0) return {};
+
+  auto candidates = std::priority_queue{cmp, m_impl->collect_leaves_device()};
+  std::vector<at::Tensor> freed;
+  std::size_t freed_tokens = 0;
+
+  while (!candidates.empty() && freed_tokens < num_tokens) {
+    const auto victim = candidates.top();
+    candidates.pop();
+    // every candidate must be an unreferenced node on the GPU
+    // (with ref_count == 0 it can't be writing through)
+    _assert(victim->on_gpu() && victim->ref_count == 0);
+    if (victim->is_io_free()) {
+      freed.push_back(victim->device_indices());
+      freed_tokens += victim->length();
+      // fetch the parent first, the node is removed right below
+      const auto parent = victim->parent();
+      m_impl->remove_device_node(victim);
+      // the parent becomes a candidate once it is an unreferenced device leaf
+      const bool parent_is_free_leaf = parent->is_leaf_device() && parent->ref_count == 0;
+      if (parent_is_free_leaf) candidates.push(parent);
+    }
+    // nodes that are undergoing IO (i.e. indices protected) are dropped
+    // from the heap without being evicted
+  }
+
+  return freed;
+}
+
+// Inserts `_key` with its device indices `value` into the tree.
+// Returns (write-through requests, length of the prefix that was already in
+// the tree). The request list is always empty here, because the write-through
+// path is not implemented.
+// Throws std::runtime_error if key and value lengths differ, or if hicache is
+// enabled. In the latter case the tree has already been updated by then.
+std::tuple<std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>>, std::size_t>
+RadixTree::writing_through(const token_vec_t& _key, at::Tensor value) {
+  if (m_impl->disabled) return {};
+  _assert(_key.size() == static_cast<std::size_t>(value.size(0)), "Key and value must have the same size");
+
+  // clip the unaligned tail so that the key length is page aligned
+  const token_slice key{_key.data(), m_impl->align(_key.size())};
+
+  // locate the insertion point
+  const auto [matched_node, matched_length] = m_impl->tree_walk(key);
+
+  // whatever part of the key was not matched becomes a new device node
+  const bool has_unmatched_tail = matched_length != key.size();
+  if (has_unmatched_tail) {
+    m_impl->create_device_node(
+        matched_node,
+        {key.begin() + matched_length, key.end()},
+        value.slice(/*dim=*/0, matched_length, key.size()));
+  }
+
+  // bump the hit count of every node visited on the way up from the matched node
+  walk_to_root(matched_node, [](TreeNode* n) { ++n->hit_count; });
+
+  std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>> result;
+
+  // without hicache (no host memory) there is nothing to write through
+  if (m_impl->use_hicache) throw std::runtime_error("Not implemented yet");
+  return {std::move(result), matched_length};
+}
+
+// Placeholder: returns a value-initialized result when the tree is disabled
+// and throws std::runtime_error("Not implemented yet") otherwise.
+std::tuple<IOTicket, std::vector<at::Tensor>> RadixTree::loading_onboard(NodeHandle, at::Tensor) {
+  if (!m_impl->disabled) throw std::runtime_error("Not implemented yet");
+  return {};
+}
+
+// Placeholder: a no-op when the tree is disabled, otherwise throws
+// std::runtime_error("Not implemented yet").
+void RadixTree::commit_writing_through(IOTicket, bool) {
+  if (!m_impl->disabled) throw std::runtime_error("Not implemented yet");
+}
+
+// Placeholder: a no-op when the tree is disabled, otherwise throws
+// std::runtime_error("Not implemented yet").
+void RadixTree::commit_loading_onboard(IOTicket, bool) {
+  if (!m_impl->disabled) throw std::runtime_error("Not implemented yet");
+}
+
+// Forwards to Impl::reset(). Unlike the other mutating entry points, the call
+// is made regardless of the `disabled` flag.
+void RadixTree::reset() { m_impl->reset(); }
+
+// Forwards the lock/unlock request for node `node_id` to the implementation;
+// does nothing when the tree is disabled.
+void RadixTree::lock_ref(NodeHandle node_id, bool increment) {
+  if (!m_impl->disabled) m_impl->lock_ref(node_id, increment);
+}
+
+// Reports the evictable size as tracked by the implementation.
+std::size_t RadixTree::evictable_size() const { return m_impl->evictable_size(); }
+
+// Reports the protected size as tracked by the implementation.
+std::size_t RadixTree::protected_size() const { return m_impl->protected_size(); }
+
+// Reports the total size as tracked by the implementation.
+std::size_t RadixTree::total_size() const { return m_impl->total_size(); }
+
+}  // namespace radix_tree_v2
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+../../../../../sgl-kernel/.clang-format`