fix: Support vLLM DP+EP in async engine via Ray-level data parallelism (#1101)

azzhipa · azzhipa · commit 0b1ec1dc57e2 · 2025-11-10T13:24:52.000-05:00
diff --git a/nemo_rl/models/generation/vllm/vllm_generation.py b/nemo_rl/models/generation/vllm/vllm_generation.py
@@ -78,13 +78,6 @@ def __init__(
                 "When EP > 1, EP must be a multiple of TP since vLLM's EP = DP * TP. "
                 "Please update your configuration to set expert_parallel_size to a multiple of tensor_parallel_size."
             )
-            if self.ep_size != self.tp_size:
-                # vLLM's EP = DP * TP, so here we need to use DP inside vLLM.
-                assert not self.cfg["vllm_cfg"]["async_engine"], (
-                    "vLLM async_engine has some issues when using DP inside vLLM. "
-                    "Please update your configuration to set `policy.generation.vllm_cfg.async_engine=false`. "
-                    "See https://github.com/NVIDIA-NeMo/RL/issues/1101 for more details."
-                )
 
         # Validate sampling parameters early to avoid resource allocation with unsupported configs.
         # The vLLM sampler patch only supports temperature scaling and does not handle top_p/top_k correctly.
@@ -176,10 +169,21 @@ def __init__(
                 "[INFO] NCCL_NVLS_ENABLE is set to 0 for non-colocated inference with cross-node model parallelism."
                 "See https://github.com/NVIDIA-NeMo/RL/issues/1352 for more details."
             )
-        # We should use vLLM DP if ep_size > tp_size since EP_SIZE = DP_SIZE * TP_SIZE in vLLM.
-        # See details in https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/data_parallel.py
-        if self.ep_size > self.tp_size:
-            env_vars["VLLM_DP_SIZE"] = str(self.vllm_dp_size)
+        # Use Ray-level DP (multiple independent workers) instead of vLLM internal DP
+        # when async_engine=true with DP>1 and EP>1, to avoid NCCL collective deadlocks.
+        self.use_ray_level_dp = (
+            self.dp_size > 1
+            and self.ep_size > 1
+            and self.cfg["vllm_cfg"]["async_engine"]
+        )
+
+        if self.use_ray_level_dp:
+            print(
+                f"INFO: Using Ray-level DP with {self.dp_size} independent workers (async engine with DP={self.dp_size}, EP={self.ep_size})"
+            )
+            self.vllm_dp_size = 1
+
+        env_vars["VLLM_DP_SIZE"] = str(self.vllm_dp_size)
 
         # Check if we need parallelism-aware worker group creation
         if self.model_parallel_size > 1:
diff --git a/nemo_rl/models/generation/vllm/vllm_worker_async.py b/nemo_rl/models/generation/vllm/vllm_worker_async.py
@@ -14,6 +14,7 @@
 
 import asyncio
 import gc
+import os
 import threading
 import uuid
 from typing import Any, AsyncGenerator, Optional, cast
@@ -125,6 +126,92 @@ def _replace_prefix_tokens(
     runtime_env={**get_nsight_config_if_pattern_matches("vllm_async_generation_worker")}
 )  # pragma: no cover
 class VllmAsyncGenerationWorker(BaseVllmGenerationWorker):
+    def _patch_vllm_device_allocation(self) -> None:
+        """DP+EP fix: fall back when vLLM's get_device_indices fails on a single device ID."""
+        try:
+            import vllm.v1.engine.utils as vllm_utils
+
+            original_fn = vllm_utils.get_device_indices  # kept so the wrapper can try it first
+
+            def patched_get_device_indices(
+                device_control_env_var, local_dp_rank, world_size
+            ):
+                try:
+                    return original_fn(
+                        device_control_env_var, local_dp_rank, world_size
+                    )
+                except Exception:  # any failure of the original triggers the fallback below
+                    import os  # NOTE(review): redundant, os is imported at module scope
+
+                    value = os.environ.get(device_control_env_var, "")
+                    # Lone ID (no comma) is returned unchanged; anything else gets computed indices
+                    if value and "," not in value:
+                        return value  # Return as string, not list
+                    return [local_dp_rank * world_size + i for i in range(world_size)]
+
+            vllm_utils.get_device_indices = patched_get_device_indices  # module-wide replacement
+        except (ImportError, AttributeError) as e:
+            print(f"Warning: Could not patch vLLM device allocation: {e}")
+
+    def _patch_vllm_stats_address(self) -> None:
+        """Patch vLLM's DPLBAsyncMPClient when VLLM_DP_SIZE > 1 (internal DP, EP != TP)."""
+        vllm_dp_size = int(os.environ.get("VLLM_DP_SIZE", "1"))
+        if vllm_dp_size <= 1:  # no vLLM-internal DP: leave vLLM untouched
+            return
+
+        try:
+            import vllm.v1.engine.core_client as core_client_module
+
+            original_ensure = (
+                core_client_module.DPLBAsyncMPClient._ensure_stats_update_task
+            )
+
+            def patched_ensure(self):
+                if (
+                    not hasattr(self, "stats_update_address")
+                    or self.stats_update_address is None
+                ):
+                    import socket
+
+                    sock = socket.socket()
+                    sock.bind(("", 0))  # port 0: the OS picks a free port
+                    port = sock.getsockname()[1]
+                    sock.close()  # NOTE(review): port is released here and could be reused before it is bound again
+                    self.stats_update_address = f"tcp://127.0.0.1:{port}"
+
+                original_ensure(self)  # always delegate once the address is populated
+
+            core_client_module.DPLBAsyncMPClient._ensure_stats_update_task = (
+                patched_ensure
+            )
+
+            original_init = core_client_module.DPLBAsyncMPClient.__init__  # NOTE(review): never called below
+
+            def patched_init(self, *args, **kwargs):
+                self.client_count = kwargs.get("client_count", 1)  # read from kwargs only
+                self.reqs_in_flight = {}
+
+                super(core_client_module.DPLBAsyncMPClient, self).__init__(  # parent class __init__
+                    args[0],  # NOTE(review): assumes at least three positional arguments
+                    args[1],
+                    args[2],
+                    kwargs.get("client_addresses"),
+                    kwargs.get("client_count", 1),
+                    kwargs.get("client_index", 0),
+                )
+
+                if hasattr(self, "core_engines") and len(self.core_engines) > 1:
+                    self.eng_start_index = (
+                        len(self.core_engines) * kwargs.get("client_index", 0)
+                    ) // kwargs.get("client_count", 1)
+                else:
+                    self.eng_start_index = 0
+
+            core_client_module.DPLBAsyncMPClient.__init__ = patched_init
+
+        except (ImportError, AttributeError) as e:
+            print(f"Warning: Could not patch vLLM stats address: {e}")
+
     def _create_engine(self, llm_kwargs: dict[str, Any]) -> None:
         from vllm.config import CompilationConfig
         from vllm.engine.arg_utils import AsyncEngineArgs
@@ -136,6 +223,9 @@ def _create_engine(self, llm_kwargs: dict[str, Any]) -> None:
                 **llm_kwargs["compilation_config"]
             )
 
+        self._patch_vllm_device_allocation()
+        self._patch_vllm_stats_address()
+
         self.llm_async_engine_args = AsyncEngineArgs(**llm_kwargs)
         self.llm = AsyncLLM.from_engine_args(self.llm_async_engine_args)
 
diff --git a/tests/unit/models/generation/test_vllm_async_dp_ep.py b/tests/unit/models/generation/test_vllm_async_dp_ep.py
@@ -0,0 +1,67 @@
+"""Unit tests for vLLM async DP+EP patches."""
+
+import os
+from unittest.mock import MagicMock, patch
+
+
+class TestVllmDeviceAllocationPatch:
+    """Test device allocation patch for DP+EP."""
+
+    def test_single_device(self):
+        """Single device should return string value."""
+        from nemo_rl.models.generation.vllm.vllm_worker_async import (
+            VllmAsyncGenerationWorker,
+        )
+
+        worker = VllmAsyncGenerationWorker.__new__(VllmAsyncGenerationWorker)  # skips __init__
+        with patch("vllm.v1.engine.utils") as mock_utils:
+            mock_utils.get_device_indices = MagicMock()  # NOTE(review): returns normally, never raises
+            worker._patch_vllm_device_allocation()
+            os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # NOTE(review): not restored after the test
+            result = mock_utils.get_device_indices("CUDA_VISIBLE_DEVICES", 0, 1)
+            # Should return string "1" not list [1]; NOTE(review): needs the original to raise, confirm
+            assert result == "1"
+
+    def test_no_env(self):
+        """No env var should use sequential allocation."""
+        from nemo_rl.models.generation.vllm.vllm_worker_async import (
+            VllmAsyncGenerationWorker,
+        )
+
+        worker = VllmAsyncGenerationWorker.__new__(VllmAsyncGenerationWorker)  # skips __init__
+        with patch("vllm.v1.engine.utils") as mock_utils:
+            mock_utils.get_device_indices = MagicMock()  # NOTE(review): returns normally, never raises
+            worker._patch_vllm_device_allocation()
+            os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # NOTE(review): prior value is not restored
+            result = mock_utils.get_device_indices("CUDA_VISIBLE_DEVICES", 0, 2)
+            assert result == [0, 1]  # expected fallback: local_dp_rank * world_size + i
+
+
+class TestVllmStatsAddressPatch:
+    """Test stats address patch conditional behavior."""
+
+    def test_skips_patch_when_dp_size_is_one(self):
+        """Should skip patch when VLLM_DP_SIZE=1."""
+        from nemo_rl.models.generation.vllm.vllm_worker_async import (
+            VllmAsyncGenerationWorker,
+        )
+
+        worker = VllmAsyncGenerationWorker.__new__(VllmAsyncGenerationWorker)  # skips __init__
+        os.environ["VLLM_DP_SIZE"] = "1"  # NOTE(review): not restored after the test
+        with patch("vllm.v1.engine.llm_engine") as mock_engine:
+            worker._patch_vllm_stats_address()
+            # NOTE(review): the method under test patches core_client.DPLBAsyncMPClient, not llm_engine
+            mock_engine.LLMEngine.assert_not_called()
+
+    def test_applies_patch_when_dp_size_greater_than_one(self):
+        """Should apply patch when VLLM_DP_SIZE>1."""
+        from nemo_rl.models.generation.vllm.vllm_worker_async import (
+            VllmAsyncGenerationWorker,
+        )
+
+        worker = VllmAsyncGenerationWorker.__new__(VllmAsyncGenerationWorker)  # skips __init__
+        os.environ["VLLM_DP_SIZE"] = "2"  # NOTE(review): not restored after the test
+        with patch("vllm.v1.engine.llm_engine.LLMEngine") as mock_engine:
+            worker._patch_vllm_stats_address()
+            # NOTE(review): always true for a mock; does not verify DPLBAsyncMPClient was patched
+            assert mock_engine.__init__ is not None