
Commit 51c70b8

fxyfxy777 authored and LittleHeroZZZX committed
[Compat] cuda.ipc_collect (PaddlePaddle#76344)
1 parent 6bbdd5e commit 51c70b8

File tree: 9 files changed, +109 −6 lines


paddle/fluid/pybind/pybind.cc

Lines changed: 16 additions & 0 deletions
@@ -181,6 +181,7 @@ limitations under the License. */
 #endif

 #ifdef PADDLE_WITH_XPU
+#include "paddle/phi/core/memory/allocation/xpu_ipc_allocator.h"
 #include "paddle/phi/core/platform/device/xpu/xpu_info.h"
 #include "paddle/phi/core/platform/device/xpu/xpu_op_list.h"
 #endif
@@ -1704,6 +1705,21 @@ PYBIND11_MODULE(libpaddle, m) {
     }
   });

+  m.def("_ipc_collect", []() {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
+#if defined(_WIN32)
+    PADDLE_THROW(common::errors::Unavailable(
+        "ipc_collect is not supported on Windows (CUDA/XPU IPC)."));
+#else
+    paddle::memory::allocation::IpcCollect();
+#endif
+#else
+    PADDLE_THROW(common::errors::Unavailable(
+        "Paddle is not compiled with CUDA/XPU, "
+        "so `ipc_collect` cannot be used."));
+#endif
+  });
+
   class NodePostHookRemoveHelper {
    public:
     NodePostHookRemoveHelper(std::shared_ptr<egr::GradNodeBase> node,

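On builds without CUDA/XPU support (or on Windows), the new `_ipc_collect` binding raises `Unavailable` rather than silently doing nothing. A minimal, hedged sketch of guarding the call from Python (the guard itself is not part of this commit):

    import paddle

    # The raw binding added above; user code would normally go through
    # paddle.device.ipc_collect() / paddle.cuda.ipc_collect() instead.
    if paddle.device.is_compiled_with_cuda():
        paddle.base.libpaddle._ipc_collect()
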
paddle/phi/core/memory/allocation/cuda_ipc_allocator.cc

Lines changed: 21 additions & 0 deletions
@@ -65,6 +65,27 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
   return sp;
 }

+void IpcCollect() {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+  size_t before = ipc_handle_to_baseptr_.size();
+  VLOG(6) << "The number of IPC handles before collection: " << before;
+
+  for (auto it = ipc_handle_to_baseptr_.begin();
+       it != ipc_handle_to_baseptr_.end();) {
+    if (it->second.expired()) {
+      it = ipc_handle_to_baseptr_.erase(it);
+    } else {
+      VLOG(6) << "Valid IPC handle has not expired";
+      ++it;
+    }
+  }
+
+  size_t after = ipc_handle_to_baseptr_.size();
+  size_t collected = before - after;
+  VLOG(1) << "IpcCollect: collected " << collected << " expired IPC handles "
+          << "out of " << before << " total handles";
+}
+
 CudaIpcAllocation::~CudaIpcAllocation() {
   shared_ptr_.reset();
   VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"

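Conceptually, IpcCollect() sweeps a handle-to-weak-pointer registry and drops entries whose owners are gone. A small Python analogue of the same idea (not Paddle code; weakref stands in for std::weak_ptr, and all names are hypothetical):

    import weakref

    class Mapping:           # stand-in for an opened IPC base pointer
        pass

    registry = {}            # handle -> weakref, like ipc_handle_to_baseptr_

    live = Mapping()
    dead = Mapping()
    registry["live"] = weakref.ref(live)
    registry["dead"] = weakref.ref(dead)
    del dead                 # last strong reference gone -> weakref expires

    before = len(registry)
    # Drop entries whose referent has been collected, keep the rest.
    registry = {h: r for h, r in registry.items() if r() is not None}
    print(f"collected {before - len(registry)} expired handles")  # 1
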
paddle/phi/core/memory/allocation/cuda_ipc_allocator.h

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@ namespace allocation {

 std::shared_ptr<void> GetIpcBasePtr(std::string handle);

+void IpcCollect();
+
 class CudaIpcAllocation : public Allocation {
  public:
   explicit CudaIpcAllocation(void *ptr,

paddle/phi/core/memory/allocation/xpu_ipc_allocator.cc

Lines changed: 21 additions & 1 deletion
@@ -77,13 +77,33 @@ std::shared_ptr<void> GetIpcBasePtr(std::string handle) {
   return sp;
 }

+void IpcCollect() {
+  std::lock_guard<std::mutex> lock(ipc_mutex_);
+  size_t before = ipc_handle_to_baseptr_.size();
+  VLOG(6) << "The number of IPC handles before collection: " << before;
+
+  for (auto it = ipc_handle_to_baseptr_.begin();
+       it != ipc_handle_to_baseptr_.end();) {
+    if (it->second.expired()) {
+      it = ipc_handle_to_baseptr_.erase(it);
+    } else {
+      VLOG(6) << "Valid IPC handle has not expired";
+      ++it;
+    }
+  }
+
+  size_t after = ipc_handle_to_baseptr_.size();
+  size_t collected = before - after;
+  VLOG(1) << "IpcCollect: collected " << collected << " expired IPC handles "
+          << "out of " << before << " total handles";
+}
+
 XpuIpcAllocation::~XpuIpcAllocation() {
   // Release the underlying IPC resource.
   shared_ptr_.reset();
   VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:"
           << "\t" << this->ptr();
 }
-
 }  // namespace paddle::memory::allocation

 #endif  // _WIN32

paddle/phi/core/memory/allocation/xpu_ipc_allocator.h

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ namespace allocation {
 // Returns a shared pointer that holds the IPC base pointer for the given
 // handle.
 std::shared_ptr<void> GetIpcBasePtr(std::string handle);
-
+void IpcCollect();
 class XpuIpcAllocation : public Allocation {
  public:
   explicit XpuIpcAllocation(void *ptr,

python/paddle/cuda/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -28,6 +28,7 @@
     amp,  # noqa: F401
     current_device,
     device,
+    ipc_collect,
     is_available as _device_is_available,
     is_bf16_supported,
     is_current_stream_capturing as _is_current_stream_capturing,
@@ -878,5 +879,6 @@ def get_stream_from_external(
     "max_memory_allocated",
     "reset_peak_memory_stats",
     "Event",
+    "ipc_collect",
     "StreamContext",
 ]

python/paddle/device/__init__.py

Lines changed: 28 additions & 4 deletions
@@ -201,6 +201,8 @@
     'is_bf16_supported',
     'manual_seed',
     'reset_peak_memory_stats',
+    'ipc_collect',
+    'get_stream_from_external',
 ]

 _cudnn_version = None
@@ -972,9 +974,9 @@ def get_device_capability(
         .. code-block:: python

             >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE)
-            >>> # import paddle
-            >>> # cap = paddle.device.get_device_capability()
-            >>> # print(cap)
+            >>> import paddle
+            >>> cap = paddle.device.get_device_capability()
+            >>> print(cap)
     """
     prop = get_device_properties(device)
     return prop.major, prop.minor
@@ -1817,6 +1819,25 @@ def synchronize(device: PlaceLike | None = None) -> None:
     )


+def ipc_collect() -> None:
+    """
+    Forces collection of GPU memory that has been released by CUDA IPC.
+
+    This function checks whether any CUDA tensors sent over IPC can be cleaned
+    from memory, and force-closes the shared memory files used for reference
+    counting when no active counters remain. It is useful when the producer
+    process has stopped actively sending tensors and wants to release unused
+    memory.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle
+            >>> # Force collect expired IPC memory
+            >>> paddle.device.ipc_collect()  # equivalent to paddle.cuda.ipc_collect()
+    """
+    paddle.base.libpaddle._ipc_collect()
+
+
 def get_stream_from_external(
     data_ptr: int, device: PlaceLike | None = None
 ) -> Stream:
@@ -1847,7 +1868,7 @@ def get_stream_from_external(

             >>> import paddle
             >>> # Suppose external_stream_ptr is from another CUDA library
-            >>> s = paddle.device.get_stream_from_external(external_stream_ptr, "gpu:0")
+            >>> # s = paddle.device.get_stream_from_external(external_stream_ptr, "gpu:0")
     '''
     if device is None:
         place = paddle.framework._current_expected_place_()
@@ -1953,6 +1974,7 @@ def range_push(msg: str):
         msg (str): The name of the NVTX range.
     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> # paddle.device.nvtx.range_push("test") is equivalent to paddle.cuda.nvtx.range_push("test")
@@ -1967,6 +1989,7 @@ def range_pop():
     Pop the most recent NVTX range marker.
     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> # paddle.device.nvtx.range_pop("test") is equivalent to paddle.cuda.nvtx.range_pop("test")
@@ -1984,6 +2007,7 @@ def reset_peak_memory_stats(device: PlaceLike | int | None = None) -> None:

     Example:
         .. code-block:: python
+
            >>> # doctest: +REQUIRES(env:GPU)
            >>> import paddle
            >>> paddle.device.set_device('gpu')  # or '<custom_device>'

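As a usage sketch (assuming a GPU build with this patch), the new wrapper is typically called after shared tensors received over CUDA IPC have been dropped, so that mappings whose handles have expired get closed; the local tensor below is only a stand-in for one received from another process:

    import paddle

    paddle.device.set_device("gpu")
    t = paddle.ones([2, 3])      # stand-in for a tensor received via CUDA IPC
    del t                        # drop the last local reference
    paddle.device.ipc_collect()  # sweep and close expired IPC mappings
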
test/legacy_test/test_cuda_unittest.py

Lines changed: 17 additions & 0 deletions
@@ -37,6 +37,23 @@
 )


+class TestCudaIpcCollect(unittest.TestCase):
+    def test_ipc_collect(self):
+        if (
+            paddle.device.is_compiled_with_cuda() or is_custom_device()
+        ) and paddle.device.is_compiled_with_rocm():
+            reason = "Skip: ipc_collect is not correct on DCU (ROCm)."
+            print(reason)
+            return
+        if platform.system().lower() == "windows":
+            print("Skip: ipc_collect is not supported on Windows.")
+            return
+        device = paddle.device.get_device()
+        if device.startswith("gpu") or device.startswith("xpu"):
+            paddle.device.ipc_collect()
+            paddle.cuda.ipc_collect()
+
+
 class TestCudaCompat(unittest.TestCase):
     # ---------------------
     # _device_to_paddle test

test/legacy_test/test_paddle_multiprocessing.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ def check_ipc_tensor(event, ipc_metas):
     shared_ipc_tensor = paddle.to_tensor(
         paddle.base.core.DenseTensor._new_shared_cuda(ipc_metas)
     )
+    paddle.cuda.ipc_collect()


 def tensor_equal(t1, t2):
     return (t1 == t2).all().item()
