NVIDIA · rgsl888prabhu · Jan 7, 2026 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -16,6 +16,7 @@
 #include <raft/core/handle.hpp>
 
 #include <memory>
+#include <vector>
 
 namespace cuopt {
 namespace cython {
@@ -82,6 +83,10 @@ struct dataset_ret_t {
 std::unique_ptr<vehicle_routing_ret_t> call_solve(routing::data_model_view_t<int, float>*,
                                                   routing::solver_settings_t<int, float>*);
 
+// Wrapper for batch solve (many data models, one settings object) to expose the API to cython.
+std::vector<std::unique_ptr<vehicle_routing_ret_t>> call_batch_solve(
+  std::vector<routing::data_model_view_t<int, float>*>, routing::solver_settings_t<int, float>*);
+
 // Wrapper for dataset to expose the API to cython.
 std::unique_ptr<dataset_ret_t> call_generate_dataset(
   raft::handle_t const& handle, routing::generator::dataset_params_t<int, float> const& params);

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -337,7 +337,7 @@ i_t optimization_problem_t<i_t, f_t>::get_n_integers() const
 {
   i_t n_integers = 0;
   if (get_n_variables() != 0) {
-    auto enum_variable_types = cuopt::host_copy(get_variable_types());
+    auto enum_variable_types = cuopt::host_copy(get_variable_types(), handle_ptr_->get_stream());
 
     for (size_t i = 0; i < enum_variable_types.size(); ++i) {
       if (enum_variable_types[i] == var_t::INTEGER) { n_integers++; }
@@ -591,16 +591,17 @@ void optimization_problem_t<i_t, f_t>::write_to_mps(const std::string& mps_file_
   data_model_view.set_maximize(get_sense());
 
   // Copy to host
-  auto constraint_matrix_values  = cuopt::host_copy(get_constraint_matrix_values());
-  auto constraint_matrix_indices = cuopt::host_copy(get_constraint_matrix_indices());
-  auto constraint_matrix_offsets = cuopt::host_copy(get_constraint_matrix_offsets());
-  auto constraint_bounds         = cuopt::host_copy(get_constraint_bounds());
-  auto objective_coefficients    = cuopt::host_copy(get_objective_coefficients());
-  auto variable_lower_bounds     = cuopt::host_copy(get_variable_lower_bounds());
-  auto variable_upper_bounds     = cuopt::host_copy(get_variable_upper_bounds());
-  auto constraint_lower_bounds   = cuopt::host_copy(get_constraint_lower_bounds());
-  auto constraint_upper_bounds   = cuopt::host_copy(get_constraint_upper_bounds());
-  auto row_types                 = cuopt::host_copy(get_row_types());
+  auto stream                    = handle_ptr_->get_stream();
+  auto constraint_matrix_values  = cuopt::host_copy(get_constraint_matrix_values(), stream);
+  auto constraint_matrix_indices = cuopt::host_copy(get_constraint_matrix_indices(), stream);
+  auto constraint_matrix_offsets = cuopt::host_copy(get_constraint_matrix_offsets(), stream);
+  auto constraint_bounds         = cuopt::host_copy(get_constraint_bounds(), stream);
+  auto objective_coefficients    = cuopt::host_copy(get_objective_coefficients(), stream);
+  auto variable_lower_bounds     = cuopt::host_copy(get_variable_lower_bounds(), stream);
+  auto variable_upper_bounds     = cuopt::host_copy(get_variable_upper_bounds(), stream);
+  auto constraint_lower_bounds   = cuopt::host_copy(get_constraint_lower_bounds(), stream);
+  auto constraint_upper_bounds   = cuopt::host_copy(get_constraint_upper_bounds(), stream);
+  auto row_types                 = cuopt::host_copy(get_row_types(), stream);
 
   // Set constraint matrix in CSR format
   if (get_nnz() != 0) {
@@ -652,7 +653,7 @@ void optimization_problem_t<i_t, f_t>::write_to_mps(const std::string& mps_file_
   std::vector<char> variable_types(get_n_variables());
   // Set variable types (convert from enum to char)
   if (get_n_variables() != 0) {
-    auto enum_variable_types = cuopt::host_copy(get_variable_types());
+    auto enum_variable_types = cuopt::host_copy(get_variable_types(), stream);
 
     // Convert enum types to char types
     for (size_t i = 0; i < variable_types.size(); ++i) {
@@ -677,13 +678,17 @@ void optimization_problem_t<i_t, f_t>::write_to_mps(const std::string& mps_file_
 template <typename i_t, typename f_t>
 void optimization_problem_t<i_t, f_t>::print_scaling_information() const
 {
-  std::vector<f_t> constraint_matrix_values = cuopt::host_copy(get_constraint_matrix_values());
-  std::vector<f_t> constraint_rhs           = cuopt::host_copy(get_constraint_bounds());
-  std::vector<f_t> objective_coefficients   = cuopt::host_copy(get_objective_coefficients());
-  std::vector<f_t> variable_lower_bounds    = cuopt::host_copy(get_variable_lower_bounds());
-  std::vector<f_t> variable_upper_bounds    = cuopt::host_copy(get_variable_upper_bounds());
-  std::vector<f_t> constraint_lower_bounds  = cuopt::host_copy(get_constraint_lower_bounds());
-  std::vector<f_t> constraint_upper_bounds  = cuopt::host_copy(get_constraint_upper_bounds());
+  auto stream = handle_ptr_->get_stream();
+  std::vector<f_t> constraint_matrix_values =
+    cuopt::host_copy(get_constraint_matrix_values(), stream);
+  std::vector<f_t> constraint_rhs         = cuopt::host_copy(get_constraint_bounds(), stream);
+  std::vector<f_t> objective_coefficients = cuopt::host_copy(get_objective_coefficients(), stream);
+  std::vector<f_t> variable_lower_bounds  = cuopt::host_copy(get_variable_lower_bounds(), stream);
+  std::vector<f_t> variable_upper_bounds  = cuopt::host_copy(get_variable_upper_bounds(), stream);
+  std::vector<f_t> constraint_lower_bounds =
+    cuopt::host_copy(get_constraint_lower_bounds(), stream);
+  std::vector<f_t> constraint_upper_bounds =
+    cuopt::host_copy(get_constraint_upper_bounds(), stream);
 
   auto findMaxAbs = [](const std::vector<f_t>& vec) -> f_t {
     if (vec.empty()) { return 0.0; }

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -116,22 +116,23 @@ void translate_to_crossover_problem(const detail::problem_t<i_t, f_t>& problem,
 {
   CUOPT_LOG_DEBUG("Starting translation");
 
-  std::vector<f_t> pdlp_objective = cuopt::host_copy(problem.objective_coefficients);
+  auto stream                     = problem.handle_ptr->get_stream();
+  std::vector<f_t> pdlp_objective = cuopt::host_copy(problem.objective_coefficients, stream);
 
   dual_simplex::csr_matrix_t<i_t, f_t> csr_A(
     problem.n_constraints, problem.n_variables, problem.nnz);
-  csr_A.x         = cuopt::host_copy(problem.coefficients);
-  csr_A.j         = cuopt::host_copy(problem.variables);
-  csr_A.row_start = cuopt::host_copy(problem.offsets);
+  csr_A.x         = cuopt::host_copy(problem.coefficients, stream);
+  csr_A.j         = cuopt::host_copy(problem.variables, stream);
+  csr_A.row_start = cuopt::host_copy(problem.offsets, stream);
 
-  problem.handle_ptr->get_stream().synchronize();
+  stream.synchronize();
   CUOPT_LOG_DEBUG("Converting to compressed column");
   csr_A.to_compressed_col(lp.A);
   CUOPT_LOG_DEBUG("Converted to compressed column");
 
   std::vector<f_t> slack(problem.n_constraints);
-  std::vector<f_t> tmp_x = cuopt::host_copy(sol.get_primal_solution());
-  problem.handle_ptr->get_stream().synchronize();
+  std::vector<f_t> tmp_x = cuopt::host_copy(sol.get_primal_solution(), stream);
+  stream.synchronize();
   dual_simplex::matrix_vector_multiply(lp.A, 1.0, tmp_x, 0.0, slack);
   CUOPT_LOG_DEBUG("Multiplied A and x");
 
@@ -161,8 +162,8 @@ void translate_to_crossover_problem(const detail::problem_t<i_t, f_t>& problem,
 
   auto [lower, upper] = extract_host_bounds<f_t>(problem.variable_bounds, problem.handle_ptr);
 
-  std::vector<f_t> constraint_lower = cuopt::host_copy(problem.constraint_lower_bounds);
-  std::vector<f_t> constraint_upper = cuopt::host_copy(problem.constraint_upper_bounds);
+  std::vector<f_t> constraint_lower = cuopt::host_copy(problem.constraint_lower_bounds, stream);
+  std::vector<f_t> constraint_upper = cuopt::host_copy(problem.constraint_upper_bounds, stream);
 
   lp.objective.resize(n, 0.0);
   std::copy(
@@ -187,10 +188,10 @@ void translate_to_crossover_problem(const detail::problem_t<i_t, f_t>& problem,
     if (initial_solution.x[j] > lp.upper[j]) { initial_solution.x[j] = lp.upper[j]; }
   }
   CUOPT_LOG_DEBUG("Finished with x");
-  initial_solution.y = cuopt::host_copy(sol.get_dual_solution());
+  initial_solution.y = cuopt::host_copy(sol.get_dual_solution(), stream);
 
-  std::vector<f_t> tmp_z = cuopt::host_copy(sol.get_reduced_cost());
-  problem.handle_ptr->get_stream().synchronize();
+  std::vector<f_t> tmp_z = cuopt::host_copy(sol.get_reduced_cost(), stream);
+  stream.synchronize();
   std::copy(tmp_z.begin(), tmp_z.begin() + problem.n_variables, initial_solution.z.begin());
   for (i_t j = problem.n_variables; j < n; ++j) {
     initial_solution.z[j] = initial_solution.y[j - problem.n_variables];

@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
  * reserved. SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -221,7 +221,7 @@ void rins_t<i_t, f_t>::run_rins()
     &rins_handle, &fixed_problem, context.settings, context.scaling);
   fj_t<i_t, f_t> fj(fj_context);
   solution_t<i_t, f_t> fj_solution(fixed_problem);
-  fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment));
+  fj_solution.copy_new_assignment(cuopt::host_copy(fixed_assignment, rins_handle.get_stream()));
   std::vector<f_t> default_weights(fixed_problem.n_constraints, 1.);
   cpu_fj_thread_t<i_t, f_t> cpu_fj_thread;
   cpu_fj_thread.fj_cpu             = fj.create_cpu_climber(fj_solution,

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -190,7 +190,7 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
                                       probing_values,
                                       n_vars_from_other,
                                       variable_map);
-      probing_config.probing_values         = host_copy(probing_values);
+      probing_config.probing_values = host_copy(probing_values, offspring.handle_ptr->get_stream());
       probing_config.n_of_fixed_from_first  = fixed_from_guiding;
       probing_config.n_of_fixed_from_second = fixed_from_other;
       probing_config.use_balanced_probing   = true;
@@ -214,7 +214,7 @@ class bound_prop_recombiner_t : public recombiner_t<i_t, f_t> {
       timer_t timer(bp_recombiner_config_t::bounds_prop_time_limit);
       get_probing_values_for_infeasible(
         guiding_solution, other_solution, offspring, probing_values, n_vars_from_other);
-      probing_config.probing_values = host_copy(probing_values);
+      probing_config.probing_values = host_copy(probing_values, offspring.handle_ptr->get_stream());
       constraint_prop.apply_round(offspring, lp_run_time_after_feasible, timer, probing_config);
     }
     constraint_prop.max_n_failed_repair_iterations = 1;

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -110,7 +110,8 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
   std::vector<f_t> default_weights(context.problem_ptr->n_constraints, 1.);
 
   solution_t<i_t, f_t> solution_lp(*context.problem_ptr);
-  solution_lp.copy_new_assignment(host_copy(lp_optimal_solution));
+  solution_lp.copy_new_assignment(
+    host_copy(lp_optimal_solution, context.problem_ptr->handle_ptr->get_stream()));
   solution_lp.round_random_nearest(500);
   scratch_cpu_fj_on_lp_opt.fj_cpu = fj.create_cpu_climber(
     solution_lp, default_weights, default_weights, 0., context.preempt_heuristic_solver_);

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -230,11 +230,12 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
 #ifdef DEBUG_COND_BOUNDS_PROP
   auto start_time = std::chrono::high_resolution_clock::now();
 #endif
-  auto variables = cuopt::host_copy(problem.variables);
-  auto offsets   = cuopt::host_copy(problem.offsets);
+  auto stream    = problem.handle_ptr->get_stream();
+  auto variables = cuopt::host_copy(problem.variables, stream);
+  auto offsets   = cuopt::host_copy(problem.offsets, stream);
 
-  auto reverse_constraints = cuopt::host_copy(problem.reverse_constraints);
-  auto reverse_offsets     = cuopt::host_copy(problem.reverse_offsets);
+  auto reverse_constraints = cuopt::host_copy(problem.reverse_constraints, stream);
+  auto reverse_offsets     = cuopt::host_copy(problem.reverse_offsets, stream);
 
   std::vector<int2> constraint_pairs_h(max_pair_per_row * problem.n_constraints, {-1, -1});
   std::unordered_set<int> cnstr_pair;
@@ -295,8 +296,8 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_device
                   colsC,
                   valsC);
   std::vector<int2> constraint_pairs_h;
-  offsets_h = cuopt::host_copy(offsetsC);
-  cols_h    = cuopt::host_copy(colsC);
+  offsets_h = cuopt::host_copy(offsetsC, stream);
+  cols_h    = cuopt::host_copy(colsC, stream);
 
   constraint_pairs_h.reserve(max_pair_per_row * problem.n_constraints);
   for (int i = 0; i < problem.n_constraints; ++i) {
@@ -654,8 +655,9 @@ void conditional_bound_strengthening_t<i_t, f_t>::solve(problem_t<i_t, f_t>& pro
     raft::alignTo(5 * sizeof(f_t) + sizeof(i_t) + sizeof(var_t), sizeof(i_t)) * max_row_size;
 
 #ifdef DEBUG_COND_BOUNDS_PROP
-  auto old_lb_h = cuopt::host_copy(problem.constraint_lower_bounds);
-  auto old_ub_h = cuopt::host_copy(problem.constraint_upper_bounds);
+  auto debug_stream = problem.handle_ptr->get_stream();
+  auto old_lb_h     = cuopt::host_copy(problem.constraint_lower_bounds, debug_stream);
+  auto old_ub_h     = cuopt::host_copy(problem.constraint_upper_bounds, debug_stream);
 
   auto start_time = std::chrono::high_resolution_clock::now();
 #endif
@@ -674,8 +676,8 @@ void conditional_bound_strengthening_t<i_t, f_t>::solve(problem_t<i_t, f_t>& pro
   double time_for_presolve =
     std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
 
-  auto new_lb_h = cuopt::host_copy(problem.constraint_lower_bounds);
-  auto new_ub_h = cuopt::host_copy(problem.constraint_upper_bounds);
+  auto new_lb_h = cuopt::host_copy(problem.constraint_lower_bounds, debug_stream);
+  auto new_ub_h = cuopt::host_copy(problem.constraint_upper_bounds, debug_stream);
 
   int num_improvements = 0;
   int num_new_equality = 0;

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -302,7 +302,7 @@ inline std::vector<i_t> compute_prioritized_integer_indices(
                         }
                         return false;
                       });
-  auto h_priority_indices = host_copy(priority_indices);
+  auto h_priority_indices = host_copy(priority_indices, problem.pb->handle_ptr->get_stream());
   return h_priority_indices;
 }
 
@@ -315,9 +315,10 @@ void compute_probing_cache(load_balanced_bounds_presolve_t<i_t, f_t>& bound_pres
   auto priority_indices = compute_prioritized_integer_indices(bound_presolve, problem);
   // std::cout<<"priority_indices\n";
   CUOPT_LOG_DEBUG("Computing probing cache");
-  auto h_integer_indices      = host_copy(problem.pb->integer_indices);
-  auto h_var_upper_bounds     = host_copy(problem.pb->variable_upper_bounds);
-  auto h_var_lower_bounds     = host_copy(problem.pb->variable_lower_bounds);
+  auto stream                 = problem.pb->handle_ptr->get_stream();
+  auto h_integer_indices      = host_copy(problem.pb->integer_indices, stream);
+  auto h_var_upper_bounds     = host_copy(problem.pb->variable_upper_bounds, stream);
+  auto h_var_lower_bounds     = host_copy(problem.pb->variable_lower_bounds, stream);
   size_t n_of_cached_probings = 0;
   // TODO adjust the iteration limit depending on the total time limit and time it takes for single
   // var

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -181,7 +181,7 @@ class log_dist_t {
   log_dist_t() = default;
 
   log_dist_t(rmm::device_uvector<i_t>& vertex_id, rmm::device_uvector<i_t>& bin_offsets)
-    : vertex_id_begin_(vertex_id.data()), bin_offsets_(host_copy(bin_offsets))
+    : vertex_id_begin_(vertex_id.data()), bin_offsets_(host_copy(bin_offsets, bin_offsets.stream()))
   {
     // If bin_offsets_ is smaller than NumberBins<i_t> then resize it
     // so that the last element is repeated

@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -346,7 +346,7 @@ inline std::vector<i_t> compute_prioritized_integer_indices(
                         }
                         return false;
                       });
-  auto h_priority_indices = host_copy(priority_indices);
+  auto h_priority_indices = host_copy(priority_indices, problem.handle_ptr->get_stream());
   problem.handle_ptr->sync_stream();
   return h_priority_indices;
 }
@@ -461,8 +461,9 @@ void compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
   // we dont want to compute the probing cache for all variables for time and computation resources
   auto priority_indices = compute_prioritized_integer_indices(bound_presolve, problem);
   CUOPT_LOG_DEBUG("Computing probing cache");
-  auto h_integer_indices  = host_copy(problem.integer_indices);
-  const auto h_var_bounds = host_copy(problem.variable_bounds);
+  auto stream             = problem.handle_ptr->get_stream();
+  auto h_integer_indices  = host_copy(problem.integer_indices, stream);
+  const auto h_var_bounds = host_copy(problem.variable_bounds, stream);
   // TODO adjust the iteration limit depending on the total time limit and time it takes for single
   // var
   bound_presolve.settings.iteration_limit = 50;