Add clang host build and ThreadSanitizer support, race condition fixes #603
```diff
@@ -44,6 +44,48 @@ namespace cuopt::linear_programming::dual_simplex {
 
 auto constexpr use_gpu = true;
 
+// non-template wrappers to work around clang compiler bug
+[[maybe_unused]] static void pairwise_multiply(
+  float* a, float* b, float* out, int size, rmm::cuda_stream_view stream)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream);
+}
+
+[[maybe_unused]] static void pairwise_multiply(
+  double* a, double* b, double* out, int size, rmm::cuda_stream_view stream)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream);
+}
+
+[[maybe_unused]] static void axpy(
+  float alpha, float* x, float beta, float* y, float* out, int size, rmm::cuda_stream_view stream)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(x, y),
+    out,
+    size,
+    [alpha, beta] __host__ __device__(float a, float b) { return alpha * a + beta * b; },
+    stream);
+}
+
+[[maybe_unused]] static void axpy(double alpha,
+                                  double* x,
+                                  double beta,
+                                  double* y,
+                                  double* out,
+                                  int size,
+                                  rmm::cuda_stream_view stream)
+{
+  cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(x, y),
+    out,
+    size,
+    [alpha, beta] __host__ __device__(double a, double b) { return alpha * a + beta * b; },
+    stream);
+}
```
Comment on lines +47 to +87:
**Add CUDA error checking and improve bug documentation.**

The wrapper functions lack CUDA error checking, which violates the coding guideline requiring verification for all CUDA operations. Additionally, the comment about a "clang compiler bug" is vague and provides no context for future maintainers. As per coding guidelines, every CUDA operation must have error checking. Apply this diff to add error checking:

```diff
 [[maybe_unused]] static void pairwise_multiply(
   float* a, float* b, float* out, int size, rmm::cuda_stream_view stream)
 {
-  cub::DeviceTransform::Transform(
-    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream);
+  RAFT_CUDA_TRY(cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream));
+  RAFT_CHECK_CUDA(stream);
 }

 [[maybe_unused]] static void pairwise_multiply(
   double* a, double* b, double* out, int size, rmm::cuda_stream_view stream)
 {
-  cub::DeviceTransform::Transform(
-    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream);
+  RAFT_CUDA_TRY(cub::DeviceTransform::Transform(
+    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream));
+  RAFT_CHECK_CUDA(stream);
 }

 [[maybe_unused]] static void axpy(
   float alpha, float* x, float beta, float* y, float* out, int size, rmm::cuda_stream_view stream)
 {
-  cub::DeviceTransform::Transform(
+  RAFT_CUDA_TRY(cub::DeviceTransform::Transform(
     cuda::std::make_tuple(x, y),
     out,
     size,
     [alpha, beta] __host__ __device__(float a, float b) { return alpha * a + beta * b; },
-    stream);
+    stream));
+  RAFT_CHECK_CUDA(stream);
 }

 [[maybe_unused]] static void axpy(double alpha,
                                   double* x,
                                   double beta,
                                   double* y,
                                   double* out,
                                   int size,
                                   rmm::cuda_stream_view stream)
 {
-  cub::DeviceTransform::Transform(
+  RAFT_CUDA_TRY(cub::DeviceTransform::Transform(
     cuda::std::make_tuple(x, y),
     out,
     size,
     [alpha, beta] __host__ __device__(double a, double b) { return alpha * a + beta * b; },
-    stream);
+    stream));
+  RAFT_CHECK_CUDA(stream);
 }
```

Also improve the comment to document the specific issue:

```diff
-// non-template wrappers to work around clang compiler bug
+// Non-template wrappers to work around a clang compiler issue with thrust::transform template
+// instantiation. These use cub::DeviceTransform directly with explicit float/double overloads
+// instead of templates to ensure proper compilation with clang when ThreadSanitizer is enabled.
```

Consider whether a templated version with explicit instantiation could work once the clang/TSan build is stable:

```cpp
template <typename T>
[[maybe_unused]] static void pairwise_multiply(
  T* a, T* b, T* out, int size, rmm::cuda_stream_view stream)
{
  RAFT_CUDA_TRY(cub::DeviceTransform::Transform(
    cuda::std::make_tuple(a, b), out, size, cuda::std::multiplies<>{}, stream));
  RAFT_CHECK_CUDA(stream);
}

// Explicit instantiations
template void pairwise_multiply<float>(float*, float*, float*, int, rmm::cuda_stream_view);
template void pairwise_multiply<double>(double*, double*, double*, int, rmm::cuda_stream_view);
```
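For orientation, here is a minimal sketch of how the new overloads would be driven from host code. The harness name, buffer names, sizes, and fill step are illustrative, not from the PR; it assumes the float overloads above are in scope:

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

// Hypothetical harness (not part of the PR): exercises the float overloads of
// pairwise_multiply and axpy on the default stream.
void wrapper_usage_example()
{
  rmm::cuda_stream_view stream{};  // default CUDA stream
  int const n = 1024;
  rmm::device_uvector<float> a(n, stream);
  rmm::device_uvector<float> b(n, stream);
  rmm::device_uvector<float> out(n, stream);
  // ... fill a and b with device data ...
  pairwise_multiply(a.data(), b.data(), out.data(), n, stream);   // out[i] = a[i] * b[i]
  axpy(2.0f, a.data(), 1.0f, out.data(), out.data(), n, stream);  // out[i] = 2*a[i] + out[i]
  stream.synchronize();  // wait for the async transforms before reading results back
}
```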
The diff then continues into the class that consumes these helpers:

```diff
 template <typename i_t, typename f_t>
 class iteration_data_t {
  public:
```
```diff
@@ -1404,12 +1446,7 @@ class iteration_data_t {
 
   // diag.pairwise_product(x1, r1);
   // r1 <- D * x_1
-  thrust::transform(handle_ptr->get_thrust_policy(),
-                    d_x1.data(),
-                    d_x1.data() + n,
-                    d_diag_.data(),
-                    d_r1.data(),
-                    thrust::multiplies<f_t>());
+  pairwise_multiply(d_x1.data(), d_diag_.data(), d_r1.data(), n, stream_view_);
 
   // r1 <- Q x1 + D x1
   if (Q.n > 0) {
```
```diff
@@ -1419,12 +1456,7 @@ class iteration_data_t {
 
   // y1 <- - alpha * r1 + beta * y1
   // y1.axpy(-alpha, r1, beta);
-  thrust::transform(handle_ptr->get_thrust_policy(),
-                    d_r1.data(),
-                    d_r1.data() + n,
-                    d_y1.data(),
-                    d_y1.data(),
-                    axpy_op<f_t>{-alpha, beta});
+  axpy(-alpha, d_r1.data(), beta, d_y1.data(), d_y1.data(), n, stream_view_);
 
   // matrix_transpose_vector_multiply(A, alpha, x2, 1.0, y1);
   cusparse_view_.transpose_spmv(alpha, d_x2, 1.0, d_y1);
```
||
**ThreadSanitizer integration is fine, but the TSAN_OPTIONS comment has typos.**

The BUILD_TSAN block cleanly wires in `-fsanitize=thread -fno-omit-frame-pointer -g` and matching link flags behind a simple CMake option, which is what build.sh expects. That's a solid pattern.

In the usage comment, though, the TSAN_OPTIONS example contains typos:

```sh
TSAN_OPTIONS='suppresions=cpp/tsan_suppressions.txt:...'
# and: libarcher.so must be presetn
```

For discoverability, consider correcting this to:

```sh
TSAN_OPTIONS='suppressions=cpp/tsan_suppressions.txt:...'
# and: libarcher.so must be present
```

This avoids users copy-pasting an invalid TSAN_OPTIONS key.