
Commit 51c70b8

fxyfxy777 authored and LittleHeroZZZX committed
[Compat] cuda.ipc_collect (PaddlePaddle#76344)
1 parent 6bbdd5e commit 51c70b8

File tree: 9 files changed, +109 −6 lines


paddle/fluid/pybind/pybind.cc

Lines changed: 16 additions & 0 deletions
@@ -181,6 +181,7 @@ limitations under the License. */
 #endif

 #ifdef PADDLE_WITH_XPU
+#include "paddle/phi/core/memory/allocation/xpu_ipc_allocator.h"
 #include "paddle/phi/core/platform/device/xpu/xpu_info.h"
 #include "paddle/phi/core/platform/device/xpu/xpu_op_list.h"
 #endif
@@ -1704,6 +1705,21 @@ PYBIND11_MODULE(libpaddle, m) {
     }
   });

+  m.def("_ipc_collect", []() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
+#if defined(_WIN32)
+    PADDLE_THROW(common::errors::Unavailable(
+        "ipc_collect is not supported on Windows (CUDA/XPU IPC)."));
+#else
+    paddle::memory::allocation::IpcCollect();
+#endif
+#else
+    PADDLE_THROW(common::errors::Unavailable(
+        "Paddle is not compiled with CUDA/XPU, "
+        "so `ipc_collect` cannot be used."));
+#endif
+  });
+
   class NodePostHookRemoveHelper {
    public:
     NodePostHookRemoveHelper(std::shared_ptr<egr::GradNodeBase> node,

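On builds without CUDA/XPU support (or on Windows), the new `_ipc_collect` binding raises `Unavailable` rather than silently doing nothing. A minimal, hedged sketch of guarding the call from Python (the guard itself is not part of this commit):

    import paddle

    # The raw binding added above; user code would normally go through
    # paddle.device.ipc_collect() / paddle.cuda.ipc_collect() instead.
    if paddle.device.is_compiled_with_cuda():
        paddle.base.libpaddle._ipc_collect()
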
paddle/phi/core/memory/allocation/cuda_ipc_allocator.cc

Lines changed: 21 additions & 0 deletions
@@ -65,6 +65,27 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
   return sp;
 }

+void IpcCollect() {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+  size_t before = ipc_handle_to_baseptr_.size();
+  VLOG(6) << "The number of IPC handles before collection: " << before;
+
+  for (auto it = ipc_handle_to_baseptr_.begin();
+       it != ipc_handle_to_baseptr_.end();) {
+    if (it->second.expired()) {
+      it = ipc_handle_to_baseptr_.erase(it);
+    } else {
+      VLOG(6) << "Valid IPC handle has not expired";
+      ++it;
+    }
+  }
+
+  size_t after = ipc_handle_to_baseptr_.size();
+  size_t collected = before - after;
+  VLOG(1) << "IpcCollect: collected " << collected << " expired IPC handles "
+          << "out of " << before << " total handles";
+}
+
 CudaIpcAllocation::~CudaIpcAllocation() {
   shared_ptr_.reset();
   VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"

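Conceptually, IpcCollect() sweeps a handle-to-weak-pointer registry and drops entries whose owners are gone. A small Python analogue of the same idea (not Paddle code; weakref stands in for std::weak_ptr, and all names are hypothetical):

    import weakref

    class Mapping:           # stand-in for an opened IPC base pointer
        pass

    registry = {}            # handle -> weakref, like ipc_handle_to_baseptr_

    live = Mapping()
    dead = Mapping()
    registry["live"] = weakref.ref(live)
    registry["dead"] = weakref.ref(dead)
    del dead                 # last strong reference gone -> weakref expires

    before = len(registry)
    # Drop entries whose referent has been collected, keep the rest.
    registry = {h: r for h, r in registry.items() if r() is not None}
    print(f"collected {before - len(registry)} expired handles")  # 1
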
paddle/phi/core/memory/allocation/cuda_ipc_allocator.h

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@ namespace allocation {

 std::shared_ptr<void> GetIpcBasePtr(std::string handle);

+void IpcCollect();
+
 class CudaIpcAllocation : public Allocation {
  public:
   explicit CudaIpcAllocation(void *ptr,

paddle/phi/core/memory/allocation/xpu_ipc_allocator.cc

Lines changed: 21 additions & 1 deletion
@@ -77,13 +77,33 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
   return sp;
 }

+void IpcCollect() {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+  size_t before = ipc_handle_to_baseptr_.size();
+  VLOG(6) << "The number of IPC handles before collection: " << before;
+
+  for (auto it = ipc_handle_to_baseptr_.begin();
+       it != ipc_handle_to_baseptr_.end();) {
+    if (it->second.expired()) {
+      it = ipc_handle_to_baseptr_.erase(it);
+    } else {
+      VLOG(6) << "Valid IPC handle has not expired";
+      ++it;
+    }
+  }
+
+  size_t after = ipc_handle_to_baseptr_.size();
+  size_t collected = before - after;
+  VLOG(1) << "IpcCollect: collected " << collected << " expired IPC handles "
+          << "out of " << before << " total handles";
+}
+
 XpuIpcAllocation::~XpuIpcAllocation() {
   // Release the underlying IPC resource.
   shared_ptr_.reset();
   VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
           << "\t" << this->ptr();
 }
-
 }  // namespace paddle::memory::allocation

 #endif  // _WIN32

paddle/phi/core/memory/allocation/xpu_ipc_allocator.h

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ namespace allocation {
 // Returns a shared pointer that holds the IPC base pointer for the given
 // handle.
 std::shared_ptr<void> GetIpcBasePtr(std::string handle);
-
+void IpcCollect();
 class XpuIpcAllocation : public Allocation {
  public:
   explicit XpuIpcAllocation(void *ptr,

python/paddle/cuda/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
     amp,  # noqa: F401
     current_device,
     device,
+    ipc_collect,
     is_available as _device_is_available,
     is_bf16_supported,
     is_current_stream_capturing as _is_current_stream_capturing,
@@ -878,5 +879,6 @@ def get_stream_from_external(
     "max_memory_allocated",
     "reset_peak_memory_stats",
     "Event",
+    "ipc_collect",
     "StreamContext",
 ]

python/paddle/device/__init__.py

Lines changed: 28 additions & 4 deletions
@@ -201,6 +201,8 @@
     'is_bf16_supported',
     'manual_seed',
     'reset_peak_memory_stats',
+    'ipc_collect',
+    'get_stream_from_external',
 ]

 _cudnn_version = None
@@ -972,9 +974,9 @@ def get_device_capability(
         .. code-block:: python

             >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE)
-            >>> # import paddle
-            >>> # cap = paddle.device.get_device_capability()
-            >>> # print(cap)
+            >>> import paddle
+            >>> cap = paddle.device.get_device_capability()
+            >>> print(cap)
     """
     prop = get_device_properties(device)
     return prop.major, prop.minor
@@ -1817,6 +1819,25 @@ def synchronize(device: PlaceLike | None = None) -> None:
     )


+def ipc_collect() -> None:
+    """
+    Forces collection of GPU memory that has been released by CUDA IPC.
+
+    This function checks whether any CUDA tensors sent over IPC can be cleaned
+    from memory, and force-closes the shared memory files used for reference
+    counting when no active counters remain. It is useful when the producer
+    process has stopped actively sending tensors and wants to release unused
+    memory.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> # Force collect expired IPC memory
+            >>> paddle.device.ipc_collect()  # equivalent to paddle.cuda.ipc_collect()
+    """
+    paddle.base.libpaddle._ipc_collect()
+
+
 def get_stream_from_external(
     data_ptr: int, device: PlaceLike | None = None
 ) -> Stream:
@@ -1847,7 +1868,7 @@ def get_stream_from_external(

             >>> import paddle
             >>> # Suppose external_stream_ptr is from another CUDA library
-            >>> s = paddle.device.get_stream_from_external(external_stream_ptr, "gpu:0")
+            >>> # s = paddle.device.get_stream_from_external(external_stream_ptr, "gpu:0")
     '''
     if device is None:
         place = paddle.framework._current_expected_place_()
@@ -1953,6 +1974,7 @@ def range_push(msg: str):
         msg (str): The name of the NVTX range.
     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> # paddle.device.nvtx.range_push("test") is equivalent to paddle.cuda.nvtx.range_push("test")
@@ -1967,6 +1989,7 @@ def range_pop():
     Pop the most recent NVTX range marker.
     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> # paddle.device.nvtx.range_pop("test") is equivalent to paddle.cuda.nvtx.range_pop("test")
@@ -1984,6 +2007,7 @@ def reset_peak_memory_stats(device: PlaceLike | int | None = None) -> None:

     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')  # or '<custom_device>'

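As a usage sketch (assuming a GPU build with this patch), the new wrapper is typically called after shared tensors received over CUDA IPC have been dropped, so that mappings whose handles have expired get closed; the local tensor below is only a stand-in for one received from another process:

    import paddle

    paddle.device.set_device("gpu")
    t = paddle.ones([2, 3])      # stand-in for a tensor received via CUDA IPC
    del t                        # drop the last local reference
    paddle.device.ipc_collect()  # sweep and close expired IPC mappings
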
test/legacy_test/test_cuda_unittest.py

Lines changed: 17 additions & 0 deletions
@@ -37,6 +37,23 @@
 )


+class TestCudaIpcCollect(unittest.TestCase):
+    def test_ipc_collect(self):
+        if (
+            paddle.device.is_compiled_with_cuda() or is_custom_device()
+        ) and paddle.device.is_compiled_with_rocm():
+            reason = "Skip: ipc_collect is not correct on DCU (ROCm)."
+            print(reason)
+            return
+        if platform.system().lower() == "windows":
+            print("Skip: ipc_collect is not supported on Windows.")
+            return
+        device = paddle.device.get_device()
+        if device.startswith("gpu") or device.startswith("xpu"):
+            paddle.device.ipc_collect()
+            paddle.cuda.ipc_collect()
+
+
 class TestCudaCompat(unittest.TestCase):
     # ---------------------
     # _device_to_paddle test

test/legacy_test/test_paddle_multiprocessing.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def check_ipc_tensor(event, ipc_metas):
     shared_ipc_tensor = paddle.to_tensor(
         paddle.base.core.DenseTensor._new_shared_cuda(ipc_metas)
     )
+    paddle.cuda.ipc_collect()


 def tensor_equal(t1, t2):
     return (t1 == t2).all().item()
