diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index f1bf52c1e..c56c9db98 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -154,7 +154,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       bool is_infeasible =
         check_infeasibility<i_t, f_t>(min_a, max_a, cnst_lb, cnst_ub, settings.primal_tol);
       if (is_infeasible) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible constraint %d, cnst_lb %e, cnst_ub %e, min_a %e, max_a %e\n",
           iter,
           i,
@@ -211,7 +211,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       new_ub = std::min(new_ub, upper_bounds[k]);
 
       if (new_lb > new_ub + 1e-6) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible variable after update %d, %e > %e\n", iter, k, new_lb, new_ub);
         return false;
       }
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 6161f4d3f..ff55d06f7 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -244,25 +244,15 @@ f_t branch_and_bound_t<i_t, f_t>::get_upper_bound()
 template <typename i_t, typename f_t>
 f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
 {
-  f_t lower_bound = lower_bound_ceiling_.load();
-  mutex_heap_.lock();
-  if (heap_.size() > 0) { lower_bound = std::min(heap_.top()->lower_bound, lower_bound); }
-  mutex_heap_.unlock();
+  f_t lower_bound      = lower_bound_ceiling_.load();
+  f_t heap_lower_bound = node_queue.get_lower_bound();
+  lower_bound          = std::min(heap_lower_bound, lower_bound);
 
   for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
     lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
   }
 
-  return lower_bound;
-}
-
-template <typename i_t, typename f_t>
-i_t branch_and_bound_t<i_t, f_t>::get_heap_size()
-{
-  mutex_heap_.lock();
-  i_t size = heap_.size();
-  mutex_heap_.unlock();
-  return size;
+  return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
@@ -589,6 +579,7 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
   bool recompute_bounds_and_basis,
   const std::vector<f_t>& root_lower,
   const std::vector<f_t>& root_upper,
+  bnb_stats_t<i_t, f_t>& stats,
   logger_t& log)
 {
   const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
@@ -605,6 +596,13 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
+  if (thread_type != thread_type_t::EXPLORATION) {
+    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+    f_t max_iter                = 0.05 * bnb_lp_iters;
+    lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
+    if (lp_settings.iteration_limit <= 0) { return node_solve_info_t::ITERATION_LIMIT; }
+  }
+
 #ifdef LOG_NODE_SIMPLEX
   lp_settings.set_log(true);
   std::stringstream ss;
@@ -679,10 +677,8 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
 
-    if (thread_type == thread_type_t::EXPLORATION) {
-      exploration_stats_.total_lp_solve_time += toc(lp_start_time);
-      exploration_stats_.total_lp_iters += node_iter;
-    }
+    stats.total_lp_solve_time += toc(lp_start_time);
+    stats.total_lp_iters += node_iter;
   }
 
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
@@ -726,8 +722,9 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
 
     } else if (leaf_objective <= upper_bound + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      const i_t branch_var =
-        pc_.variable_selection(leaf_fractional, leaf_solution.x, lp_settings.log);
+      auto [branch_var, obj_estimate] = pc_.variable_selection_and_obj_estimate(
+        leaf_fractional, leaf_solution.x, node_ptr->lower_bound, log);
+      node_ptr->objective_estimate = obj_estimate;
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
       search_tree.branch(
@@ -751,6 +748,9 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
     search_tree.graphviz_node(log, node_ptr, "timeout", 0.0);
     return node_solve_info_t::TIME_LIMIT;
 
+  } else if (lp_status == dual::status_t::ITERATION_LIMIT) {
+    return node_solve_info_t::ITERATION_LIMIT;
+
   } else {
     if (thread_type == thread_type_t::EXPLORATION) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
@@ -854,6 +854,7 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* nod
                                         true,
                                         original_lp_.lower,
                                         original_lp_.upper,
+                                        exploration_stats_,
                                         settings_.log);
 
   ++exploration_stats_.nodes_since_last_log;
@@ -877,23 +878,21 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* nod
 
     } else {
       // We've generated enough nodes, push further nodes onto the heap
-      mutex_heap_.lock();
-      heap_.push(node->get_down_child());
-      heap_.push(node->get_up_child());
-      mutex_heap_.unlock();
+      node_queue.push(node->get_down_child());
+      node_queue.push(node->get_up_child());
     }
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
-                                                   mip_node_t<i_t, f_t>* start_node,
-                                                   search_tree_t<i_t, f_t>& search_tree,
-                                                   lp_problem_t<i_t, f_t>& leaf_problem,
-                                                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                                   basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                                   std::vector<i_t>& basic_list,
-                                                   std::vector<i_t>& nonbasic_list)
+void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
+                                               mip_node_t<i_t, f_t>* start_node,
+                                               search_tree_t<i_t, f_t>& search_tree,
+                                               lp_problem_t<i_t, f_t>& leaf_problem,
+                                               bounds_strengthening_t<i_t, f_t>& node_presolver,
+                                               basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                               std::vector<i_t>& basic_list,
+                                               std::vector<i_t>& nonbasic_list)
 {
   bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
@@ -922,6 +921,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
       search_tree.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree.update(node_ptr, node_status_t::FATHOMED);
+      recompute_bounds_and_basis = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
@@ -977,6 +977,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
                                           recompute_bounds_and_basis,
                                           original_lp_.lower,
                                           original_lp_.upper,
+                                          exploration_stats_,
                                           settings_.log);
 
     recompute_bounds_and_basis = !has_children(status);
@@ -997,28 +998,7 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
       if (stack.size() > 0) {
         mip_node_t<i_t, f_t>* node = stack.back();
         stack.pop_back();
-
-        // The order here matters. We want to create a copy of the node
-        // before adding to the global heap. Otherwise,
-        // some thread may consume the node (possibly fathoming it)
-        // before we had the chance to add to the diving queue.
-        // This lead to a SIGSEGV. Although, in this case, it
-        // would be better if we discard the node instead.
-        if (get_heap_size() > settings_.num_bfs_threads) {
-          std::vector<f_t> lower = original_lp_.lower;
-          std::vector<f_t> upper = original_lp_.upper;
-          std::fill(
-            node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-          node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-          mutex_dive_queue_.lock();
-          diving_queue_.emplace(node->detach_copy(), std::move(lower), std::move(upper));
-          mutex_dive_queue_.unlock();
-        }
-
-        mutex_heap_.lock();
-        heap_.push(node);
-        mutex_heap_.unlock();
+        node_queue.push(node);
       }
 
       exploration_stats_.nodes_unexplored += 2;
@@ -1056,37 +1036,31 @@ void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id,
 
   while (solver_status_ == mip_exploration_status_t::RUNNING &&
          abs_gap > settings_.absolute_mip_gap_tol && rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    mip_node_t<i_t, f_t>* start_node = nullptr;
-
+         (active_subtrees_ > 0 || node_queue.best_first_queue_size() > 0)) {
     // If there any node left in the heap, we pop the top node and explore it.
-    mutex_heap_.lock();
-    if (heap_.size() > 0) {
-      start_node = heap_.top();
-      heap_.pop();
-      active_subtrees_++;
-    }
-    mutex_heap_.unlock();
+    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue.pop_best_first(active_subtrees_);
+
+    if (start_node.has_value()) {
+      mip_node_t<i_t, f_t>* node = start_node.value();
 
-    if (start_node != nullptr) {
-      if (get_upper_bound() < start_node->lower_bound) {
+      if (get_upper_bound() < node->lower_bound) {
         // This node was put on the heap earlier but its lower bound is now greater than the
         // current upper bound
-        search_tree.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound);
-        search_tree.update(start_node, node_status_t::FATHOMED);
+        search_tree.graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
+        search_tree.update(node, node_status_t::FATHOMED);
         active_subtrees_--;
         continue;
       }
 
       // Best-first search with plunging
-      explore_subtree(task_id,
-                      start_node,
-                      search_tree,
-                      leaf_problem,
-                      node_presolver,
-                      basis_factors,
-                      basic_list,
-                      nonbasic_list);
+      plunge_from(task_id,
+                  node,
+                  search_tree,
+                  leaf_problem,
+                  node_presolver,
+                  basis_factors,
+                  basic_list,
+                  nonbasic_list);
 
       active_subtrees_--;
     }
@@ -1123,23 +1097,39 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& A
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
 
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  bool reset_starting_bounds = true;
+
   while (solver_status_ == mip_exploration_status_t::RUNNING &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    std::optional<diving_root_t<i_t, f_t>> start_node;
+         (active_subtrees_ > 0 || node_queue.best_first_queue_size() > 0)) {
+    if (reset_starting_bounds) {
+      start_lower = original_lp_.lower;
+      start_upper = original_lp_.upper;
+      std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
+      reset_starting_bounds = false;
+    }
 
-    mutex_dive_queue_.lock();
-    if (diving_queue_.size() > 0) { start_node = diving_queue_.pop(); }
-    mutex_dive_queue_.unlock();
+    std::optional<mip_node_t<i_t, f_t>> start_node =
+      node_queue.pop_diving(start_lower, start_upper, node_presolver.bounds_changed);
 
     if (start_node.has_value()) {
-      if (get_upper_bound() < start_node->node.lower_bound) { continue; }
+      reset_starting_bounds = true;
+
+      bool is_feasible = node_presolver.bounds_strengthening(start_lower, start_upper, settings_);
+      if (get_upper_bound() < start_node->lower_bound || !is_feasible) { continue; }
 
       bool recompute_bounds_and_basis = true;
-      i_t nodes_explored              = 0;
-      search_tree_t<i_t, f_t> subtree(std::move(start_node->node));
+      search_tree_t<i_t, f_t> subtree(std::move(start_node.value()));
       std::deque<mip_node_t<i_t, f_t>*> stack;
       stack.push_front(&subtree.root);
 
+      bnb_stats_t<i_t, f_t> dive_stats;
+      dive_stats.total_lp_iters      = 0;
+      dive_stats.total_lp_solve_time = 0;
+      dive_stats.nodes_explored      = 0;
+      dive_stats.nodes_unexplored    = 0;
+
       while (stack.size() > 0 && solver_status_ == mip_exploration_status_t::RUNNING) {
         mip_node_t<i_t, f_t>* node_ptr = stack.front();
         stack.pop_front();
@@ -1151,9 +1141,8 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& A
           continue;
         }
 
-        if (toc(exploration_stats_.start_time) > settings_.time_limit) { return; }
-
-        if (nodes_explored >= 1000) { break; }
+        if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
+        if (dive_stats.nodes_explored > 500) { break; }
 
         node_solve_info_t status = solve_node(node_ptr,
                                               subtree,
@@ -1164,17 +1153,19 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& A
                                               node_presolver,
                                               thread_type_t::DIVING,
                                               recompute_bounds_and_basis,
-                                              start_node->lower,
-                                              start_node->upper,
+                                              start_lower,
+                                              start_upper,
+                                              dive_stats,
                                               log);
-
-        nodes_explored++;
-
+        dive_stats.nodes_explored++;
         recompute_bounds_and_basis = !has_children(status);
 
         if (status == node_solve_info_t::TIME_LIMIT) {
           solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-          return;
+          break;
+
+        } else if (status == node_solve_info_t::ITERATION_LIMIT) {
+          break;
 
         } else if (has_children(status)) {
           if (status == node_solve_info_t::UP_CHILD_FIRST) {
@@ -1186,24 +1177,8 @@ void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& A
           }
         }
 
-        if (stack.size() > 1) {
-          // If the diving thread is consuming the nodes faster than the
-          // best first search, then we split the current subtree at the
-          // lowest possible point and move to the queue, so it can
-          // be picked by another thread.
-          if (std::lock_guard<omp_mutex_t> lock(mutex_dive_queue_);
-              diving_queue_.size() < min_diving_queue_size_) {
-            mip_node_t<i_t, f_t>* new_node = stack.back();
-            stack.pop_back();
-
-            std::vector<f_t> lower = start_node->lower;
-            std::vector<f_t> upper = start_node->upper;
-            std::fill(
-              node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-            new_node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-            diving_queue_.emplace(new_node->detach_copy(), std::move(lower), std::move(upper));
-          }
+        if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > 5) {
+          stack.pop_back();
         }
       }
     }
@@ -1426,7 +1401,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   }
 
   // Choose variable to branch on
-  i_t branch_var = pc_.variable_selection(fractional, root_relax_soln_.x, log);
+  auto [branch_var, obj_estimate] =
+    pc_.variable_selection_and_obj_estimate(fractional, root_relax_soln_.x, root_objective_, log);
 
   search_tree_.root      = std::move(mip_node_t<i_t, f_t>(root_objective_, root_vstatus_));
   search_tree_.num_nodes = 0;
@@ -1455,7 +1431,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
   active_subtrees_                        = 0;
-  min_diving_queue_size_                  = 4 * settings_.num_diving_threads;
   solver_status_                          = mip_exploration_status_t::RUNNING;
   lower_bound_ceiling_                    = inf;
   should_report_                          = true;
@@ -1491,7 +1466,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     }
   }
 
-  f_t lower_bound = heap_.size() > 0 ? heap_.top()->lower_bound : search_tree_.root.lower_bound;
+  f_t lower_bound = node_queue.best_first_queue_size() > 0 ? node_queue.get_lower_bound()
+                                                           : search_tree_.root.lower_bound;
   return set_final_solution(solution, lower_bound);
 }
 
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 38438cc9e..032eed11f 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -7,9 +7,9 @@
 
 #pragma once
 
-#include <dual_simplex/diving_queue.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/node_queue.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/pseudo_costs.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -20,7 +20,6 @@
 #include <utilities/omp_helpers.hpp>
 
 #include <omp.h>
-#include <queue>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -68,12 +67,22 @@ class bounds_strengthening_t;
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
+template <typename i_t, typename f_t>
+struct bnb_stats_t {
+  f_t start_time                        = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time = 0.0;
+  omp_atomic_t<i_t> nodes_explored      = 0;
+  omp_atomic_t<i_t> nodes_unexplored    = 0;
+  omp_atomic_t<f_t> total_lp_iters      = 0;
+
+  // This should only be used by the main thread
+  omp_atomic_t<f_t> last_log             = 0.0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+};
+
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  template <typename T>
-  using mip_node_heap_t = std::priority_queue<T, std::vector<T>, node_compare_t<i_t, f_t>>;
-
   branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
                      const simplex_solver_settings_t<i_t, f_t>& solver_settings);
 
@@ -111,7 +120,6 @@ class branch_and_bound_t {
 
   f_t get_upper_bound();
   f_t get_lower_bound();
-  i_t get_heap_size();
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -145,17 +153,7 @@ class branch_and_bound_t {
   mip_solution_t<i_t, f_t> incumbent_;
 
   // Structure with the general info of the solver.
-  struct stats_t {
-    f_t start_time                        = 0.0;
-    omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-    omp_atomic_t<i_t> nodes_explored      = 0;
-    omp_atomic_t<i_t> nodes_unexplored    = 0;
-    omp_atomic_t<f_t> total_lp_iters      = 0;
-
-    // This should only be used by the main thread
-    omp_atomic_t<f_t> last_log             = 0.0;
-    omp_atomic_t<i_t> nodes_since_last_log = 0;
-  } exploration_stats_;
+  bnb_stats_t<i_t, f_t> exploration_stats_;
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
@@ -175,9 +173,8 @@ class branch_and_bound_t {
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
 
-  // Heap storing the nodes to be explored.
-  omp_mutex_t mutex_heap_;
-  mip_node_heap_t<mip_node_t<i_t, f_t>*> heap_;
+  // Heap storing the nodes waiting to be explored.
+  node_queue_t<i_t, f_t> node_queue;
 
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
@@ -185,11 +182,6 @@ class branch_and_bound_t {
   // Count the number of subtrees that are currently being explored.
   omp_atomic_t<i_t> active_subtrees_;
 
-  // Queue for storing the promising node for performing dives.
-  omp_mutex_t mutex_dive_queue_;
-  diving_queue_t<i_t, f_t> diving_queue_;
-  i_t min_diving_queue_size_;
-
   // Global status of the solver.
   omp_atomic_t<mip_exploration_status_t> solver_status_;
 
@@ -219,15 +211,15 @@ class branch_and_bound_t {
                            const csr_matrix_t<i_t, f_t>& Arow,
                            i_t initial_heap_size);
 
-  // Explore the search tree using the best-first search with plunging strategy.
-  void explore_subtree(i_t task_id,
-                       mip_node_t<i_t, f_t>* start_node,
-                       search_tree_t<i_t, f_t>& search_tree,
-                       lp_problem_t<i_t, f_t>& leaf_problem,
-                       bounds_strengthening_t<i_t, f_t>& node_presolver,
-                       basis_update_mpf_t<i_t, f_t>& basis_update,
-                       std::vector<i_t>& basic_list,
-                       std::vector<i_t>& nonbasic_list);
+  // Perform a plunge in the subtree determined by the `start_node`.
+  void plunge_from(i_t task_id,
+                   mip_node_t<i_t, f_t>* start_node,
+                   search_tree_t<i_t, f_t>& search_tree,
+                   lp_problem_t<i_t, f_t>& leaf_problem,
+                   bounds_strengthening_t<i_t, f_t>& node_presolver,
+                   basis_update_mpf_t<i_t, f_t>& basis_update,
+                   std::vector<i_t>& basic_list,
+                   std::vector<i_t>& nonbasic_list);
 
   // Each "main" thread pops a node from the global heap and then performs a plunge
   // (i.e., a shallow dive) into the subtree determined by the node.
@@ -235,6 +227,16 @@ class branch_and_bound_t {
                          search_tree_t<i_t, f_t>& search_tree,
                          const csr_matrix_t<i_t, f_t>& Arow);
 
+  // Perform a deep dive in the subtree determined by the `start_node`.
+  void dive_from(mip_node_t<i_t, f_t>& start_node,
+                 const std::vector<f_t>& start_lower,
+                 const std::vector<f_t>& start_upper,
+                 lp_problem_t<i_t, f_t>& leaf_problem,
+                 bounds_strengthening_t<i_t, f_t>& node_presolver,
+                 basis_update_mpf_t<i_t, f_t>& basis_update,
+                 std::vector<i_t>& basic_list,
+                 std::vector<i_t>& nonbasic_list,
+                 thread_type_t diving_type);
   // Each diving thread pops the first node from the dive queue and then performs
   // a deep dive into the subtree determined by the node.
   void diving_thread(const csr_matrix_t<i_t, f_t>& Arow);
@@ -251,6 +253,7 @@ class branch_and_bound_t {
                                bool recompute_basis_and_bounds,
                                const std::vector<f_t>& root_lower,
                                const std::vector<f_t>& root_upper,
+                               bnb_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
   // Sort the children based on the Martin's criteria.
diff --git a/cpp/src/dual_simplex/diving_queue.hpp b/cpp/src/dual_simplex/diving_queue.hpp
deleted file mode 100644
index f7035109e..000000000
--- a/cpp/src/dual_simplex/diving_queue.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <vector>
-
-#include <dual_simplex/mip_node.hpp>
-
-namespace cuopt::linear_programming::dual_simplex {
-
-template <typename i_t, typename f_t>
-struct diving_root_t {
-  mip_node_t<i_t, f_t> node;
-  std::vector<f_t> lower;
-  std::vector<f_t> upper;
-
-  diving_root_t(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-    : node(std::move(node)), lower(std::move(lower)), upper(std::move(upper))
-  {
-  }
-
-  friend bool operator>(const diving_root_t<i_t, f_t>& a, const diving_root_t<i_t, f_t>& b)
-  {
-    return a.node.lower_bound > b.node.lower_bound;
-  }
-};
-
-// A min-heap for storing the starting nodes for the dives.
-// This has a maximum size of 1024, such that the container
-// will discard the least promising node if the queue is full.
-template <typename i_t, typename f_t>
-class diving_queue_t {
- private:
-  std::vector<diving_root_t<i_t, f_t>> buffer;
-  static constexpr i_t max_size_ = 1024;
-
- public:
-  diving_queue_t() { buffer.reserve(max_size_); }
-
-  void push(diving_root_t<i_t, f_t>&& node)
-  {
-    buffer.push_back(std::move(node));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  void emplace(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-  {
-    buffer.emplace_back(std::move(node), std::move(lower), std::move(upper));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  diving_root_t<i_t, f_t> pop()
-  {
-    std::pop_heap(buffer.begin(), buffer.end(), std::greater<>());
-    diving_root_t<i_t, f_t> node = std::move(buffer.back());
-    buffer.pop_back();
-    return node;
-  }
-
-  i_t size() const { return buffer.size(); }
-  constexpr i_t max_size() const { return max_size_; }
-  const diving_root_t<i_t, f_t>& top() const { return buffer.front(); }
-  void clear() { buffer.clear(); }
-};
-
-}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 1d66a21f7..a082932ac 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -45,6 +45,7 @@ class mip_node_t {
       branch_var_lower(-std::numeric_limits<f_t>::infinity()),
       branch_var_upper(std::numeric_limits<f_t>::infinity()),
       fractional_val(std::numeric_limits<f_t>::infinity()),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(0)
   {
     children[0] = nullptr;
@@ -59,6 +60,7 @@ class mip_node_t {
       node_id(0),
       branch_var(-1),
       branch_dir(rounding_direction_t::NONE),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(basis)
   {
     children[0] = nullptr;
@@ -80,6 +82,7 @@ class mip_node_t {
       branch_var(branch_variable),
       branch_dir(branch_direction),
       fractional_val(branch_var_value),
+      objective_estimate(parent_node->objective_estimate),
       vstatus(basis)
 
   {
@@ -227,17 +230,19 @@ class mip_node_t {
   mip_node_t<i_t, f_t> detach_copy() const
   {
     mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
-    copy.branch_var       = branch_var;
-    copy.branch_dir       = branch_dir;
-    copy.branch_var_lower = branch_var_lower;
-    copy.branch_var_upper = branch_var_upper;
-    copy.fractional_val   = fractional_val;
-    copy.node_id          = node_id;
+    copy.branch_var         = branch_var;
+    copy.branch_dir         = branch_dir;
+    copy.branch_var_lower   = branch_var_lower;
+    copy.branch_var_upper   = branch_var_upper;
+    copy.fractional_val     = fractional_val;
+    copy.objective_estimate = objective_estimate;
+    copy.node_id            = node_id;
     return copy;
   }
 
   node_status_t status;
   f_t lower_bound;
+  f_t objective_estimate;
   i_t depth;
   i_t node_id;
   i_t branch_var;
@@ -262,22 +267,6 @@ void remove_fathomed_nodes(std::vector<mip_node_t<i_t, f_t>*>& stack)
   }
 }
 
-template <typename i_t, typename f_t>
-class node_compare_t {
- public:
-  bool operator()(const mip_node_t<i_t, f_t>& a, const mip_node_t<i_t, f_t>& b) const
-  {
-    return a.lower_bound >
-           b.lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-
-  bool operator()(const mip_node_t<i_t, f_t>* a, const mip_node_t<i_t, f_t>* b) const
-  {
-    return a->lower_bound >
-           b->lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-};
-
 template <typename i_t, typename f_t>
 class search_tree_t {
  public:
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
new file mode 100644
index 000000000..0234fa038
--- /dev/null
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -0,0 +1,173 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include <dual_simplex/mip_node.hpp>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+// This is a generic heap implementation based
+// on the STL functions. The main benefit here is
+// that we access the underlying container.
+template <typename T, typename Comp>
+class heap_t {
+ public:
+  heap_t()          = default;
+  virtual ~heap_t() = default;
+
+  void push(const T& node)
+  {
+    buffer.push_back(node);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  void push(T&& node)
+  {
+    buffer.push_back(std::move(node));
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  template <typename... Args>
+  void emplace(Args&&... args)
+  {
+    buffer.emplace_back(std::forward<Args&&>(args)...);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  std::optional<T> pop()
+  {
+    if (buffer.empty()) return std::nullopt;
+
+    std::pop_heap(buffer.begin(), buffer.end(), comp);
+    T node = std::move(buffer.back());
+    buffer.pop_back();
+    return node;
+  }
+
+  size_t size() const { return buffer.size(); }
+  T& top() { return buffer.front(); }
+  void clear() { buffer.clear(); }
+  bool empty() const { return buffer.empty(); }
+
+ private:
+  std::vector<T> buffer;
+  Comp comp;
+};
+
+// A queue storing the nodes waiting to be explored/dived from.
+template <typename i_t, typename f_t>
+class node_queue_t {
+ private:
+  struct heap_entry_t {
+    mip_node_t<i_t, f_t>* node = nullptr;
+    f_t lower_bound            = -inf;
+    f_t score                  = inf;
+
+    heap_entry_t(mip_node_t<i_t, f_t>* new_node)
+      : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate)
+    {
+    }
+  };
+
+  // Comparision function for ordering the nodes based on their lower bound with
+  // lowest one being explored first.
+  struct lower_bound_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->lower_bound > b->lower_bound;
+    }
+  };
+
+  // Comparision function for ordering the nodes based on some score (currently the pseudocost
+  // estimate) with the lowest being explored first.
+  struct score_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->score > b->score;
+    }
+  };
+
+  heap_t<std::shared_ptr<heap_entry_t>, lower_bound_comp> best_first_heap;
+  heap_t<std::shared_ptr<heap_entry_t>, score_comp> diving_heap;
+  omp_mutex_t mutex;
+
+ public:
+  void push(mip_node_t<i_t, f_t>* new_node)
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    auto entry = std::make_shared<heap_entry_t>(new_node);
+    best_first_heap.push(entry);
+    diving_heap.push(entry);
+  }
+
+  // In the current implementation, we are use the active number of subtree to decide
+  // when to stop the execution. We need to increment the counter at the same
+  // time as we pop a node from the queue to avoid some threads exiting
+  // the main loop thinking that the solver has already finished.
+  // This will be not needed in the master-worker model.
+  std::optional<mip_node_t<i_t, f_t>*> pop_best_first(omp_atomic_t<i_t>& active_subtree)
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    auto entry = best_first_heap.pop();
+
+    if (entry.has_value()) {
+      active_subtree++;
+      return std::exchange(entry.value()->node, nullptr);
+    }
+
+    return std::nullopt;
+  }
+
+  // In the current implementation, multiple threads can pop the nodes
+  // from the queue, so we need to pass the lower and upper bound here
+  // to avoid other thread fathoming the node (i.e., deleting) before we can read
+  // the variable bounds from the tree.
+  // This will be not needed in the master-worker model.
+  std::optional<mip_node_t<i_t, f_t>> pop_diving(std::vector<f_t>& lower,
+                                                 std::vector<f_t>& upper,
+                                                 std::vector<bool>& bounds_changed)
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+
+    while (!diving_heap.empty()) {
+      auto entry = diving_heap.pop();
+
+      if (entry.has_value()) {
+        if (auto node_ptr = entry.value()->node; node_ptr != nullptr) {
+          node_ptr->get_variable_bounds(lower, upper, bounds_changed);
+          return node_ptr->detach_copy();
+        }
+      }
+    }
+
+    return std::nullopt;
+  }
+
+  i_t diving_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return diving_heap.size();
+  }
+
+  i_t best_first_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.size();
+  }
+
+  f_t get_lower_bound()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound;
+  }
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 9f84e108d..1c0a33042 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -199,7 +199,7 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  mutex.lock();
+  std::lock_guard<omp_mutex_t> lock(mutex);
   const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
@@ -211,7 +211,6 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_pt
     pseudo_cost_sum_up[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_up[node_ptr->branch_var]++;
   }
-  mutex.unlock();
 }
 
 template <typename i_t, typename f_t>
@@ -254,16 +253,19 @@ void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
 }
 
 template <typename i_t, typename f_t>
-i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractional,
-                                                 const std::vector<f_t>& solution,
-                                                 logger_t& log)
+std::pair<i_t, f_t> pseudo_costs_t<i_t, f_t>::variable_selection_and_obj_estimate(
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  f_t lower_bound,
+  logger_t& log)
 {
-  mutex.lock();
+  std::lock_guard<omp_mutex_t> lock(mutex);
 
   const i_t num_fractional = fractional.size();
   std::vector<f_t> pseudo_cost_up(num_fractional);
   std::vector<f_t> pseudo_cost_down(num_fractional);
   std::vector<f_t> score(num_fractional);
+  f_t estimate = lower_bound;
 
   i_t num_initialized_down;
   i_t num_initialized_up;
@@ -272,11 +274,11 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
 
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
-  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
-             num_initialized_down,
-             num_initialized_up,
-             pseudo_cost_down_avg,
-             pseudo_cost_up_avg);
+  log.debug("PC: num initialized down %d up %d avg down %e up %e\n",
+            num_initialized_down,
+            num_initialized_up,
+            pseudo_cost_down_avg,
+            pseudo_cost_up_avg);
 
   for (i_t k = 0; k < num_fractional; k++) {
     const i_t j = fractional[k];
@@ -296,6 +298,9 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     const f_t f_up    = std::ceil(solution[j]) - solution[j];
     score[k] =
       std::max(f_down * pseudo_cost_down[k], eps) * std::max(f_up * pseudo_cost_up[k], eps);
+
+    estimate += std::min(std::max(pseudo_cost_down[k] * f_down, eps),
+                         std::max(pseudo_cost_up[k] * f_up, eps));
   }
 
   i_t branch_var = fractional[0];
@@ -309,12 +314,13 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     }
   }
 
-  log.printf(
-    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
-
-  mutex.unlock();
+  log.debug("Pseudocost branching on %d. Value %e. Score %e. Obj Estimate %e\n",
+            branch_var,
+            solution[branch_var],
+            score[select],
+            estimate);
 
-  return branch_var;
+  return {branch_var, estimate};
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 799cdc3ff..ab01b2a85 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -43,9 +43,10 @@ class pseudo_costs_t {
                    f_t& pseudo_cost_down_avg,
                    f_t& pseudo_cost_up_avg) const;
 
-  i_t variable_selection(const std::vector<i_t>& fractional,
-                         const std::vector<f_t>& solution,
-                         logger_t& log);
+  std::pair<i_t, f_t> variable_selection_and_obj_estimate(const std::vector<i_t>& fractional,
+                                                          const std::vector<f_t>& solution,
+                                                          f_t lower_bound,
+                                                          logger_t& log);
 
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);