7 changes: 5 additions & 2 deletions paddle/fluid/pybind/pybind.cc
@@ -3612,7 +3612,6 @@ All parameter, weight, gradient are variables in Paddle.
return platform::GetDeviceProperties(id);
},
py::return_value_policy::copy);

py::class_<gpuDeviceProp>(m, "_gpuDeviceProperties", py::module_local())
.def_property_readonly(
"name", [](const gpuDeviceProp &prop) { return prop.name; })
@@ -3653,7 +3652,11 @@ All parameter, weight, gradient are variables in Paddle.
m.def("nvprof_disable_record_event", platform::NvprofDisableRecordEvent);
#endif
#endif

#if defined(PADDLE_WITH_CUDA)
m.def("vmm_max_free_size", [] {
memory::VmmMaxFreeSize(phi::GPUPlace(platform::GetCurrentDeviceId()), 1);
});
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
m.def(
"get_device_properties",
9 changes: 8 additions & 1 deletion paddle/phi/core/memory/CMakeLists.txt
@@ -1,3 +1,10 @@
add_subdirectory(allocation)

collect_srcs(core_srcs SRCS malloc.cc memcpy.cc stats.cc mem_utils.cc)
collect_srcs(
core_srcs
SRCS
malloc.cc
memcpy.cc
stats.cc
mem_utils.cc
mem_visitor.cc)
3 changes: 3 additions & 0 deletions paddle/phi/core/memory/allocation/allocator.h
@@ -25,6 +25,7 @@
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/memory/allocation/inlined_vector.h"
#include "paddle/phi/core/memory/mem_visitor.h"
#include "paddle/phi/core/platform/device/gpu/gpu_types.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -202,6 +203,8 @@ class PADDLE_API Allocator : public phi::Allocator {
uint64_t Release(const phi::Place& place) { return ReleaseImpl(place); }
size_t Compact(const phi::Place& place) { return CompactImpl(place); }

virtual void Accept(AllocatorVisitor* visitor) { visitor->Visit(this); }

protected:
virtual phi::Allocation* AllocateImpl(size_t size) = 0;
virtual void FreeImpl(phi::Allocation* allocation);
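The new virtual `Accept` is one half of a double-dispatch visitor: the base `Allocator` hands the visitor its own type via `visitor->Visit(this)`, and each wrapper subclass overrides `Accept` so the visitor sees the concrete type and can recurse into the allocator it decorates. A minimal standalone sketch of that mechanism, with illustrative names (`ChainVisitor`, `WrappingAlloc`, `LeafAlloc`) rather than Paddle's:

```cpp
#include <iostream>
#include <memory>

struct LeafAlloc;
struct WrappingAlloc;

struct ChainVisitor {
  virtual ~ChainVisitor() = default;
  virtual void Visit(LeafAlloc* a);      // terminal case
  virtual void Visit(WrappingAlloc* a);  // recursive case
};

struct LeafAlloc {
  virtual ~LeafAlloc() = default;
  virtual void Accept(ChainVisitor* v) { v->Visit(this); }
};

struct WrappingAlloc : LeafAlloc {
  std::shared_ptr<LeafAlloc> underlying;
  explicit WrappingAlloc(std::shared_ptr<LeafAlloc> u)
      : underlying(std::move(u)) {}
  // Overriding Accept is what lets the visitor see the concrete type.
  void Accept(ChainVisitor* v) override { v->Visit(this); }
};

void ChainVisitor::Visit(LeafAlloc*) { std::cout << "reached leaf\n"; }
void ChainVisitor::Visit(WrappingAlloc* a) { a->underlying->Accept(this); }

int main() {
  // Two wrappers around one leaf, analogous to a StatAllocator wrapping a
  // RetryAllocator wrapping a concrete allocator.
  auto chain = std::make_shared<WrappingAlloc>(
      std::make_shared<WrappingAlloc>(std::make_shared<LeafAlloc>()));
  ChainVisitor v;
  chain->Accept(&v);  // unwraps both layers, prints "reached leaf" once
}
```

The same shape recurs below: `RetryAllocator`, `StatAllocator`, and the CUDA allocators each override `Accept`, while `mem_visitor.cc` supplies default `Visit` overloads that keep unwrapping.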
7 changes: 7 additions & 0 deletions paddle/phi/core/memory/allocation/allocator_facade.cc
@@ -1663,6 +1663,13 @@ uint64_t AllocatorFacade::Release(const phi::Place& place) {
->Release(place);
}

void AllocatorFacade::Accept(const phi::Place& place,
AllocatorVisitor* visitor) {
GetPrivate()
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Accept(visitor);
}

size_t AllocatorFacade::Compact(const phi::Place& place) {
return GetPrivate()
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
16 changes: 16 additions & 0 deletions paddle/phi/core/memory/allocation/allocator_facade.h
@@ -32,6 +32,7 @@

namespace paddle {
namespace memory {
class AllocatorVisitor;
namespace allocation {

// Allocator Facade is the interface exposed to other modules.
@@ -73,6 +74,21 @@ class AllocatorFacade {
// Compact memory of free blocks held by the VmmAllocator.
size_t Compact(const phi::Place& place);

/**
* @brief Accepts an AllocatorVisitor and iterates over all nested Allocator
* instances associated with a specific memory location (Place), executing the
* visitor's corresponding Visit method for each one.
*
* This method facilitates the traversal of the Allocator hierarchy for the
* given memory Place, allowing the visitor to collect statistics or perform
* operations on all constituent allocators.
*
* @param place The memory location whose allocator hierarchy is traversed.
* @param visitor A pointer to the AllocatorVisitor whose Visit methods will
* be executed against the nested allocators found at the specified Place.
*/
void Accept(const phi::Place& place, AllocatorVisitor* visitor);

std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream);
5 changes: 5 additions & 0 deletions paddle/phi/core/memory/allocation/retry_allocator.h
@@ -23,6 +23,7 @@

#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/memory/allocation/allocator.h"
#include "paddle/phi/core/memory/mem_visitor.h"

namespace paddle {
namespace memory {
@@ -49,6 +50,10 @@ class PADDLE_API RetryAllocator : public Allocator {
common::errors::PreconditionNotMet(
"Underlying allocator of RetryAllocator is not thread-safe"));
}
std::shared_ptr<Allocator>& GetUnderLyingAllocator() {
return underlying_allocator_;
}
void Accept(AllocatorVisitor* visitor) override { visitor->Visit(this); }

bool IsAllocThreadSafe() const override { return true; }

5 changes: 5 additions & 0 deletions paddle/phi/core/memory/allocation/stat_allocator.h
@@ -15,6 +15,7 @@
#pragma once

#include "paddle/phi/core/memory/allocation/allocator.h"
#include "paddle/phi/core/memory/mem_visitor.h"
#include "paddle/phi/core/memory/stats.h"
#include "paddle/phi/core/platform/profiler/mem_tracing.h"

@@ -28,6 +29,10 @@ class StatAllocator : public Allocator {
: underlying_allocator_(std::move(underlying_allocator)) {}

bool IsAllocThreadSafe() const override { return true; }
void Accept(AllocatorVisitor* visitor) override { visitor->Visit(this); }
std::shared_ptr<Allocator>& GetUnderLyingAllocator() {
return underlying_allocator_;
}

protected:
void FreeImpl(phi::Allocation* allocation) override {
paddle/phi/core/memory/allocation/stream_safe_cuda_allocator.h
@@ -21,6 +21,7 @@
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/memory/allocation/allocator.h"
#include "paddle/phi/core/memory/allocation/spin_lock.h"
#include "paddle/phi/core/memory/mem_visitor.h"

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
@@ -76,9 +77,16 @@ class StreamSafeCUDAAllocator
bool in_cuda_graph_capturing = false);
~StreamSafeCUDAAllocator();

std::shared_ptr<Allocator> &GetUnderLyingAllocator() {
return underlying_allocator_;
}
std::vector<StreamSafeCUDAAllocator *> &GetAllocatorByPlace() {
return allocator_map_[place_];
}
bool IsAllocThreadSafe() const override;
gpuStream_t GetDefaultStream() const;
void SetDefaultStream(gpuStream_t stream);
void Accept(AllocatorVisitor *visitor) override { visitor->Visit(this); }

protected:
phi::Allocation *AllocateImpl(size_t size) override;
paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc
@@ -264,6 +264,23 @@ phi::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks(
return nullptr;
}

std::pair<size_t, size_t>
VirtualMemoryAutoGrowthBestFitAllocator::SumLargestFreeBlockSizes(
int32_t n) const {
if (n <= 0 || free_blocks_.empty()) return std::make_pair(0, 0);

size_t large_size = free_blocks_.rbegin()->first.first;
size_t total_size = 0;
int32_t count = 0;

for (auto it = free_blocks_.rbegin(); it != free_blocks_.rend() && count < n;
++it, ++count) {
total_size += it->first.first;
}

return std::make_pair(large_size, total_size);
}

void VirtualMemoryAutoGrowthBestFitAllocator::DumpInfo(
std::string phase) const {
size_t total = 0, free = 0, used = 0;
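`SumLargestFreeBlockSizes` reads the largest key once, then sums the top `n` sizes in a single reverse walk: with free blocks of 512, 256, 128, and 64 bytes and `n = 2` it returns `{512, 768}`. A self-contained sketch of that arithmetic, assuming `free_blocks_` is an ascending-ordered `std::map` keyed by `{block_size, base_ptr}` (which is what `rbegin()->first.first` implies; the real mapped type differs):

```cpp
#include <cstdio>
#include <map>
#include <utility>

int main() {
  // Stand-in for free_blocks_: key = {block size, base pointer}; the map's
  // ordering puts the largest block at rbegin().
  std::map<std::pair<size_t, void*>, int> free_blocks = {
      {{64, nullptr}, 0}, {{128, nullptr}, 0},
      {{256, nullptr}, 0}, {{512, nullptr}, 0}};

  const int n = 2;
  size_t large_size = free_blocks.rbegin()->first.first;  // 512
  size_t total_size = 0;
  int count = 0;
  // Reverse iteration visits blocks from largest to smallest.
  for (auto it = free_blocks.rbegin();
       it != free_blocks.rend() && count < n; ++it, ++count) {
    total_size += it->first.first;
  }
  std::printf("largest=%zu sum_of_%d_largest=%zu\n",
              large_size, n, total_size);  // largest=512 sum_of_2_largest=768
}
```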
paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h
@@ -22,6 +22,7 @@
#include "paddle/phi/core/memory/allocation/allocator.h"
#include "paddle/phi/core/memory/allocation/spin_lock.h"
#include "paddle/phi/core/memory/mem_utils.h"
#include "paddle/phi/core/memory/mem_visitor.h"

namespace paddle {
namespace memory {
@@ -43,6 +44,12 @@ class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator {
size_t alignment,
const phi::GPUPlace &place);

std::shared_ptr<Allocator> &GetUnderLyingAllocator() {
return underlying_allocator_;
}
std::pair<size_t, size_t> SumLargestFreeBlockSizes(int32_t n) const;
void Accept(AllocatorVisitor *visitor) override { visitor->Visit(this); }

bool IsAllocThreadSafe() const override { return true; }

protected:
12 changes: 12 additions & 0 deletions paddle/phi/core/memory/malloc.cc
@@ -16,6 +16,7 @@ limitations under the License. */

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/memory/mem_visitor.h"
#include "paddle/phi/core/stream.h"

namespace paddle::memory {
@@ -80,6 +81,17 @@ gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation) {

#endif

#if defined(PADDLE_WITH_CUDA)
std::pair<size_t, size_t> VmmMaxFreeSize(const phi::GPUPlace& place,
int32_t n) {
FreeMemoryMetricsVisitor free_memory_metrics_visitor(n);
allocation::AllocatorFacade::Instance().Accept(place,
&free_memory_metrics_visitor);
return std::make_pair(free_memory_metrics_visitor.GetLargeSize(),
free_memory_metrics_visitor.GetSumSize());
}
#endif

#ifdef PADDLE_WITH_XPU
bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream) {
return allocation::AllocatorFacade::Instance().RecordStream(allocation,
6 changes: 6 additions & 0 deletions paddle/phi/core/memory/malloc.h
@@ -69,6 +69,12 @@ void EraseStream(std::shared_ptr<Allocation> allocation, gpuStream_t stream);
PADDLE_API gpuStream_t GetStream(const std::shared_ptr<Allocation>& allocation);
#endif

#if defined(PADDLE_WITH_CUDA)
// Returns a pair of <largest_free_block_size, sum_of_n_largest_free_block_sizes>.
PADDLE_API extern std::pair<size_t, size_t> VmmMaxFreeSize(
const phi::GPUPlace& place, int32_t n);
#endif

#ifdef PADDLE_WITH_XPU
bool RecordStream(std::shared_ptr<Allocation> allocation, XPUStream stream);
#endif
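A hedged usage sketch of the new query from C++; `LogVmmFreeBlocks` and the choice `n = 3` are invented for illustration (the pybind11 binding above hardcodes `n = 1`):

```cpp
#include <iostream>

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/memory/malloc.h"

void LogVmmFreeBlocks(int device_id) {
#if defined(PADDLE_WITH_CUDA)
  // first: size of the largest free block held by the
  // VirtualMemoryAutoGrowthBestFitAllocator; second: sum of the 3 largest.
  auto [largest, sum3] =
      paddle::memory::VmmMaxFreeSize(phi::GPUPlace(device_id), /*n=*/3);
  std::cout << "largest free block: " << largest
            << " bytes, top-3 total: " << sum3 << " bytes\n";
#endif
}
```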
62 changes: 62 additions & 0 deletions paddle/phi/core/memory/mem_visitor.cc
@@ -0,0 +1,62 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/core/memory/mem_visitor.h"
#include "paddle/phi/core/memory/allocation/allocator.h"
#include "paddle/phi/core/memory/allocation/retry_allocator.h"
#include "paddle/phi/core/memory/allocation/spin_lock.h"
#include "paddle/phi/core/memory/allocation/stat_allocator.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/core/memory/allocation/stream_safe_cuda_allocator.h"
#include "paddle/phi/core/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#endif

namespace paddle {
namespace memory {

void AllocatorVisitor::Visit(RetryAllocator* allocator) {
allocator->GetUnderLyingAllocator()->Accept(this);
}

void AllocatorVisitor::Visit(StatAllocator* allocator) {
allocator->GetUnderLyingAllocator()->Accept(this);
}

#ifdef PADDLE_WITH_CUDA
void AllocatorVisitor::Visit(StreamSafeCUDAAllocator* allocator) {
const std::vector<StreamSafeCUDAAllocator*>& allocators =
allocator->GetAllocatorByPlace();
// Fan out over every per-stream allocator registered at this place; use a
// distinct name so the loop variable does not shadow the parameter.
for (StreamSafeCUDAAllocator* peer : allocators) {
peer->GetUnderLyingAllocator()->Accept(this);
}
}

void AllocatorVisitor::Visit(
VirtualMemoryAutoGrowthBestFitAllocator* allocator) {
allocator->GetUnderLyingAllocator()->Accept(this);
}

void FreeMemoryMetricsVisitor::Visit(
VirtualMemoryAutoGrowthBestFitAllocator* allocator) {
auto [large_size, sum_size] =
allocator->SumLargestFreeBlockSizes(nums_blocks_);
large_size_ = std::max(large_size_, large_size);
sum_size_ = std::max(sum_size_, sum_size);
VLOG(1) << "Visit VirtualMemoryAutoGrowthBestFitAllocator large_free_size:"
<< large_size_ << " sum_free_size:" << sum_size_;
}
#endif
} // namespace memory
} // namespace paddle
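`mem_visitor.h` itself is not among the hunks shown. From the definitions above and the call sites in `malloc.cc` (`GetLargeSize`, `GetSumSize`, the `nums_blocks_` constructor argument), the header plausibly declares something like the following; this is a reconstruction, and the real file's namespace nesting and member names may differ:

```cpp
// Inferred sketch of paddle/phi/core/memory/mem_visitor.h -- reconstructed
// from mem_visitor.cc and malloc.cc, not copied from the actual file.
#pragma once

#include <cstddef>
#include <cstdint>

namespace paddle {
namespace memory {

// Forward declarations; in the real tree these allocators live under
// paddle::memory::allocation.
class Allocator;
class RetryAllocator;
class StatAllocator;
class StreamSafeCUDAAllocator;
class VirtualMemoryAutoGrowthBestFitAllocator;

class AllocatorVisitor {
 public:
  virtual ~AllocatorVisitor() = default;
  // Terminal case reached via the default Allocator::Accept.
  virtual void Visit(Allocator* allocator) {}
  // Wrapper allocators: mem_visitor.cc defaults these to recursing into the
  // allocator they decorate.
  virtual void Visit(RetryAllocator* allocator);
  virtual void Visit(StatAllocator* allocator);
#ifdef PADDLE_WITH_CUDA
  virtual void Visit(StreamSafeCUDAAllocator* allocator);
  virtual void Visit(VirtualMemoryAutoGrowthBestFitAllocator* allocator);
#endif
};

// Collects {largest, sum-of-n-largest} free-block sizes during traversal.
class FreeMemoryMetricsVisitor : public AllocatorVisitor {
 public:
  explicit FreeMemoryMetricsVisitor(int32_t nums_blocks)
      : nums_blocks_(nums_blocks) {}
#ifdef PADDLE_WITH_CUDA
  void Visit(VirtualMemoryAutoGrowthBestFitAllocator* allocator) override;
#endif
  size_t GetLargeSize() const { return large_size_; }
  size_t GetSumSize() const { return sum_size_; }

 private:
  int32_t nums_blocks_;
  size_t large_size_ = 0;
  size_t sum_size_ = 0;
};

}  // namespace memory
}  // namespace paddle
```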