Revert "feat: replace Decord with video_reader-rs" (sgl-project#8077)

mickqian · shuaills · commit 5082c473c939 · 2025-07-21T07:57:36.000Z
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -21,7 +21,6 @@ runtime_common = [
     "build",
     "compressed-tensors",
     "datasets",
-    "video-reader-rs",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
diff --git a/python/sglang/check_env.py b/python/sglang/check_env.py
@@ -47,7 +47,7 @@ def is_cuda_v2():
     "tiktoken",
     "anthropic",
     "litellm",
-    "video-reader-rs",
+    "decord",
 ]
 
 
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -206,7 +206,7 @@ def get_estimated_frames_list(self, image_data):
         estimate the total frame count from all visual input
         """
         # Lazy import because decord is not available on some arm platforms.
-        from video_reader import PyVideoReader, cpu
+        from decord import VideoReader, cpu
 
         # Before processing inputs
         if not image_data or len(image_data) == 0:
@@ -216,7 +216,7 @@ def get_estimated_frames_list(self, image_data):
             if isinstance(image, str) and image.startswith("video:"):
                 path = image[len("video:") :]
                 # Estimate frames for the video
-                vr = PyVideoReader(path, threads=0)
+                vr = VideoReader(path, ctx=cpu(0))
                 num_frames = len(vr)
             else:
                 # For images, each contributes one frame
diff --git a/python/sglang/srt/multimodal/processors/internvl.py b/python/sglang/srt/multimodal/processors/internvl.py
@@ -150,15 +150,15 @@ def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
     def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
         vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
         max_frame = len(vr) - 1
-        fps = float(vr.get_fps())
+        fps = float(vr.get_avg_fps())
 
         pixel_values_list, num_patches_list = [], []
         transform = InternVLImageProcessor.build_transform(input_size=input_size)
         frame_indices = InternVLImageProcessor.get_index(
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )
         for frame_index in frame_indices:
-            img = Image.fromarray(vr[frame_index]).convert("RGB")
+            img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB")
             img = InternVLImageProcessor.dynamic_preprocess(
                 img, image_size=input_size, use_thumbnail=True, max_num=max_num
             )
diff --git a/python/sglang/srt/multimodal/processors/qwen_vl.py b/python/sglang/srt/multimodal/processors/qwen_vl.py
@@ -156,10 +156,10 @@ async def preprocess_video(
     # vr: VideoReader, image_factor: int = IMAGE_FACTOR
 ) -> torch.Tensor:
     ele = {}
-    total_frames, video_fps = len(vr), vr.get_fps()
+    total_frames, video_fps = len(vr), vr.get_avg_fps()
     nframes = smart_nframes({}, total_frames=total_frames, video_fps=video_fps)
     idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
-    video = vr.get_batch(idx)
+    video = vr.get_batch(idx).asnumpy()
     video = torch.tensor(video).permute(0, 3, 1, 2)  # Convert to TCHW format
     nframes, _, height, width = video.shape
     min_pixels = ele.get("min_pixels", VIDEO_MIN_PIXELS)
diff --git a/python/sglang/srt/utils.py b/python/sglang/srt/utils.py
@@ -84,7 +84,6 @@
 from torch.profiler import ProfilerActivity, profile, record_function
 from torch.utils._contextlib import _DecoratorContextManager
 from triton.runtime.cache import FileCacheManager
-from video_reader import PyVideoReader
 
 logger = logging.getLogger(__name__)
 
@@ -758,17 +757,24 @@ def load_image(
 
 def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
     # We import decord here to avoid a strange Segmentation fault (core dumped) issue.
-    from video_reader import PyVideoReader
+    from decord import VideoReader, cpu, gpu
+
+    try:
+        from decord.bridge import decord_bridge
+
+        ctx = gpu(0)
+        _ = decord_bridge.get_ctx_device(ctx)
+    except Exception:
+        ctx = cpu(0)
 
-    device = "cuda" if use_gpu and torch.cuda.is_available() else None
     tmp_file = None
     vr = None
     try:
         if isinstance(video_file, bytes):
             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
             tmp_file.write(video_file)
             tmp_file.close()
-            vr = PyVideoReader(tmp_file.name, device=device, threads=0)
+            vr = VideoReader(tmp_file.name, ctx=ctx)
         elif isinstance(video_file, str):
             if video_file.startswith(("http://", "https://")):
                 timeout = int(os.getenv("REQUEST_TIMEOUT", "10"))
@@ -778,22 +784,22 @@ def load_video(video_file: Union[str, bytes], use_gpu: bool = True):
                 for chunk in response.iter_content(chunk_size=8192):
                     tmp_file.write(chunk)
                 tmp_file.close()
-                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
+                vr = VideoReader(tmp_file.name, ctx=ctx)
             elif video_file.startswith("data:"):
                 _, encoded = video_file.split(",", 1)
                 video_bytes = base64.b64decode(encoded)
                 tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                 tmp_file.write(video_bytes)
                 tmp_file.close()
-                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
+                vr = VideoReader(tmp_file.name, ctx=ctx)
             elif os.path.isfile(video_file):
-                vr = PyVideoReader(video_file, device=device, threads=0)
+                vr = VideoReader(video_file, ctx=ctx)
             else:
                 video_bytes = base64.b64decode(video_file)
                 tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
                 tmp_file.write(video_bytes)
                 tmp_file.close()
-                vr = PyVideoReader(tmp_file.name, device=device, threads=0)
+                vr = VideoReader(tmp_file.name, ctx=ctx)
         else:
             raise ValueError(f"Unsupported video input type: {type(video_file)}")
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -47,7 +47,7 @@ def is_cuda_v2():` |
| `47` | `47` | `"tiktoken",` |
| `48` | `48` | `"anthropic",` |
| `49` | `49` | `"litellm",` |
| `50` |  | `- "video-reader-rs",` |
|  | `50` | `+ "decord",` |
| `51` | `51` | `]` |
| `52` | `52` |  |
| `53` | `53` |  |