diff --git a/doc/decimal_float_and_sqrt.md b/doc/decimal_float_and_sqrt.md
index 3916a2061..cf8b58b83 100644
--- a/doc/decimal_float_and_sqrt.md
+++ b/doc/decimal_float_and_sqrt.md
@@ -140,13 +140,16 @@ Interpolation: `r0 = k0[index] - (k1[index] * eps16 >> 16)` gives ~10 bits initi
 - sig_gx = gx × 10⁶, scale6=10⁶, scale7=10⁷
 - r_scaled = approx_recip_sqrt32(sig_gx) ≈ 10⁷/sqrt(gx)
 - sig_z = sig_gx × r_scaled / 10⁷ (initial sqrt(gx)×10⁶)
-- **1 Newton** correction: rem = sig_gx×10⁶ − sig_z², correction = rem/(2×sig_z)
+- Precompute target = sig_gx×10⁶ for Newton and rounding
+- **1 Newton** correction: rem = target − sig_z², correction = rem/(2×sig_z)
 - Final rounding: if rem < 0, --sig_z
 
 ```cpp
 std::uint64_t product = static_cast<std::uint64_t>(sig_gx) * r_scaled;
 std::uint32_t sig_z = static_cast<std::uint32_t>(product / scale7);
-// Newton: rem = sig_gx*scale6 - sig_z²; sig_z += rem/(2*sig_z)
+// Precompute target (avoids recomputing in Newton and rounding)
+const std::uint64_t target = static_cast<std::uint64_t>(sig_gx) * scale6;
+// Newton: rem = target - sig_z²; sig_z += rem/(2*sig_z)
 T z{sig_z, -6};
 ```
 
@@ -154,12 +157,15 @@ T z{sig_z, -6};
 
 - sig_gx = gx × 10¹⁵, scale15=10¹⁵, scale16=10¹⁶
 - r_scaled = approx_recip_sqrt64(sig_gx) ≈ 10¹⁶/sqrt(gx)
+- Precompute target = sig_gx×10¹⁵ for Newton and rounding
 - Uses **int128::uint128_t** / **int128::int128_t** for portability (all platforms including MSVC 32-bit)
 
 ```cpp
 int128::uint128_t product = static_cast<int128::uint128_t>(sig_gx) * r_scaled;
 std::uint64_t sig_z = static_cast<std::uint64_t>(product / scale16);
-// 2 Newton corrections: rem = sig_gx*scale15 - sig_z²; correction = rem/(2*sig_z)
+// Precompute target (avoids recomputing in each Newton iteration and rounding)
+const int128::uint128_t target = static_cast<int128::uint128_t>(sig_gx) * scale15;
+// 2 Newton corrections: rem = target - sig_z²; correction = rem/(2*sig_z)
 // Final: if rem < 0, --sig_z
 T z{sig_z, -15};
 ```
@@ -167,16 +173,19 @@ T z{sig_z, -15};
 ### 6.3 decimal128 (sqrt128_impl.hpp)
 
 - Uses **frexp10** for exact 34-digit significand (no FP loss)
-- sig_gx = gx_sig (from frexp10), scale33 = 10³³ (64×64→128, then construct u256)
+- gx_sig from frexp10, scale33_128 = 10³³ (64×64→128)
 - Initial r from approx_recip_sqrt64(sig_gx_approx) where sig_gx_approx = gx_sig/10¹⁸
-- sig_z = umul256(gx_sig, r_scaled) / 10¹⁶ (128×64→256 multiplication)
+- sig_z = mul128By64(gx_sig, r_scaled) / 10¹⁶ (SoftFloat-style 128×64→256, r_scaled is 64-bit)
+- sig_z² = umul256(uint128(sig_z), uint128(sig_z)) — sig_z fits in 128 bits, avoids full u256×u256
 - **3 Newton** iterations using u256 / i256_sub
 - **Round-to-nearest**: ensure sig_z² ≤ target; if target − sig_z² > sig_z, increment sig_z
 
 ```cpp
-u256 sig_gx{gx_sig};  // from frexp10
-u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16;
-// Newton: target = sig_gx*scale33; rem = target - sig_z²; sig_z += rem/(2*sig_z)
+// Precompute target (avoids recomputing in each Newton iteration)
+const u256 target = umul256(gx_sig, scale33_128);
+u256 sig_z = mul128By64(gx_sig, r_scaled) / scale16;
+// Newton: sig_z < sqrt(10)*10^33 fits in 128 bits → sig_z_sq = umul256(uint128(sig_z), uint128(sig_z))
+//         rem = target - sig_z_sq; sig_z += rem/(2*sig_z)
 // Final: while sig_z²>target --sig_z; if target-sig_z²>sig_z ++sig_z
 // Convert: z = T{sig_z_hi,-16} + T{sig_z_lo,-33}
 ```
@@ -185,7 +194,7 @@ u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16;
 |------|-------------|--------------|--------|----------------|
 | decimal32 | 7 digits | uint64 | 1 | floor |
 | decimal64 | 16 digits | int128::uint128_t (portable) | 2 | floor |
-| decimal128 | 34 digits | u256, umul256 | 3 | round-to-nearest |
+| decimal128 | 34 digits | u256, mul128By64, umul256 | 3 | round-to-nearest |
 
 ---
 
@@ -194,14 +203,16 @@ u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16;
 ### 7.1 Implementation Files
 
 ```
-include/boost/decimal/detail/cmath/
-├── sqrt.hpp                        # Entry point, special cases, normalize, dispatch
-└── impl/
-    ├── sqrt_lookup.hpp             # 90-entry k0/k1 tables
-    ├── approx_recip_sqrt_impl.hpp  # approx_recip_sqrt32, approx_recip_sqrt64
-    ├── sqrt32_impl.hpp             # decimal32: integer rem, 1 Newton
-    ├── sqrt64_impl.hpp             # decimal64: int128::uint128_t (portable), 2 Newton
-    └── sqrt128_impl.hpp            # decimal128: u256, frexp10, round-to-nearest
+include/boost/decimal/detail/
+├── cmath/
+│   ├── sqrt.hpp                    # Entry point, special cases, normalize, dispatch
+│   └── impl/
+│       ├── sqrt_lookup.hpp         # 90-entry k0/k1 tables
+│       ├── approx_recip_sqrt_impl.hpp  # approx_recip_sqrt32, approx_recip_sqrt64
+│       ├── sqrt32_impl.hpp         # decimal32: integer rem, 1 Newton
+│       ├── sqrt64_impl.hpp         # decimal64: int128::uint128_t (portable), 2 Newton
+│       └── sqrt128_impl.hpp        # decimal128: u256, frexp10, round-to-nearest
+└── u256.hpp                        # mul128By64 (128×64→256), umul256 (128×128→256)
 ```
 
 ### 7.2 Table Design (sqrt_lookup.hpp)
@@ -274,12 +285,12 @@ If target − sig_z² > sig_z, then (sig_z+0.5)² < target, so round up.
 
 | Type               | Baseline   | Current   | Speedup      |
 |--------------------|------------|-----------|---------------|
-| decimal32_t        | 1.55M ops/s | 11.68M ops/s | ✓ 7.53x (+653.2%) |
-| decimal64_t        | 0.78M ops/s | 4.80M ops/s | ✓ 6.17x (+517.3%) |
-| decimal128_t       | 0.24M ops/s | 0.80M ops/s | ✓ 3.26x (+226.1%) |
-| decimal_fast32_t   | 1.83M ops/s | 8.99M ops/s | ✓ 4.92x (+391.9%) |
-| decimal_fast64_t   | 0.80M ops/s | 3.89M ops/s | ✓ 4.86x (+385.9%) |
-| decimal_fast128_t  | 0.22M ops/s | 0.71M ops/s | ✓ 3.18x (+217.9%) |
+| decimal32_t        | 1.49M ops/s | 11.36M ops/s | ✓ 7.60x (+660.2%) |
+| decimal64_t        | 0.74M ops/s | 4.64M ops/s | ✓ 6.23x (+523.2%) |
+| decimal128_t       | 0.23M ops/s | 0.93M ops/s | ✓ 3.97x (+296.6%) |
+| decimal_fast32_t   | 1.71M ops/s | 9.34M ops/s | ✓ 5.46x (+446.2%) |
+| decimal_fast64_t   | 0.78M ops/s | 3.69M ops/s | ✓ 4.71x (+370.7%) |
+| decimal_fast128_t  | 0.22M ops/s | 0.81M ops/s | ✓ 3.66x (+265.7%) |
 
 *Benchmark scripts `sqrt_bench.py` and `run_srqt_test.sh` are not in the main branch. To run: `git checkout d54af19 -- run_srqt_test.sh sqrt_bench.py test/benchmark_sqrt.cpp include/boost/decimal/detail/cmath/sqrt_baseline.hpp`.*
 
diff --git a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp
index f310e73c7..d492971cd 100644
--- a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp
+++ b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp
@@ -59,19 +59,15 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     constexpr std::uint64_t scale17 = 100000000000000000ULL;  // 10^17
     constexpr std::uint64_t scale16 = 10000000000000000ULL;   // 10^16
     // scale33 = 10^33 = 10^17 * 10^16, 64×64→128 suffices
-    const u256 scale33{static_cast<int128::uint128_t>(scale17) * scale16};
+    const int128::uint128_t scale33_128{static_cast<int128::uint128_t>(scale17) * scale16};
     
     // ---------- Get exact significand using frexp10 ----------
     // frexp10 returns the full 34-digit significand directly from decimal128
     int gx_exp{};
     auto gx_sig = frexp10(gx, &gx_exp);  // gx_sig is uint128, in [10^33, 10^34)
     
-    // Convert to u256 for computation
-    u256 sig_gx{gx_sig};
-    
-    // Adjust scale: gx = gx_sig * 10^gx_exp, and gx is in [1, 10)
-    // So gx_sig * 10^gx_exp is in [1, 10), meaning gx_exp = -(digits10-1) = -33
-    // sig_gx is already gx * 10^33, which is what we want
+    // gx = gx_sig * 10^gx_exp, and gx is in [1, 10)
+    // So gx_exp = -(digits10-1) = -33; gx_sig * 10^33 is our scaled significand
     
     // Get high 16 digits for initial approximation
     // sig_gx_approx = gx * 10^15 = gx_sig / 10^18
@@ -81,14 +77,14 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     // ---------- Get 1/sqrt approximation ----------
     std::uint64_t r_scaled = approx_recip_sqrt64(sig_gx_approx, 0);
     
-    // ---------- Compute initial sig_z = sig_gx * r / 10^17 ----------
+    // ---------- Compute initial sig_z = sig_gx * r / 10^16 ----------
     // sig_z ≈ sqrt(gx) * 10^33
-    // Initial: sig_z = (gx * 10^33) * (10^16 / sqrt(gx)) / 10^16
-    //                = gx * 10^33 / sqrt(gx) = sqrt(gx) * 10^33
-    // But r_scaled is 10^16/sqrt(gx), so:
     // sig_z = sig_gx * r_scaled / 10^16
-    // Using umul256 for optimized 128-bit * 128-bit -> 256-bit multiplication
-    u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16;
+    // r_scaled is 64-bit; use mul128By64 (SoftFloat-style) instead of full umul256
+    u256 sig_z = mul128By64(gx_sig, r_scaled) / scale16;
+
+    // Precompute target = sig_gx * 10^33 (avoids recomputing in each Newton iteration)
+    const u256 target = umul256(gx_sig, scale33_128);
     
     // ---------- Newton corrections using u256 ----------
     // Newton: sig_z_new = sig_z + (sig_gx * 10^33 - sig_z²) / (2 * sig_z)
@@ -96,9 +92,9 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     // So we compare sig_gx * 10^33 vs sig_z²
     
     // First Newton iteration
+    // sig_z < sqrt(10)*10^33 always fits in 128 bits → use umul256 (4 muls) not u256×u256 (64 muls)
     {
-        u256 sig_z_sq = sig_z * sig_z;  // scaled by 10^66
-        u256 target = sig_gx * scale33;  // sig_gx * 10^33, also scaled by 10^66
+        u256 sig_z_sq = umul256(static_cast<int128::uint128_t>(sig_z), static_cast<int128::uint128_t>(sig_z));
         
         u256 rem_abs;
         bool is_neg = i256_sub(target, sig_z_sq, rem_abs);
@@ -121,8 +117,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     
     // Second Newton iteration
     {
-        u256 sig_z_sq = sig_z * sig_z;
-        u256 target = sig_gx * scale33;
+        u256 sig_z_sq = umul256(static_cast<int128::uint128_t>(sig_z), static_cast<int128::uint128_t>(sig_z));
         
         u256 rem_abs;
         bool is_neg = i256_sub(target, sig_z_sq, rem_abs);
@@ -144,8 +139,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     
     // Third Newton iteration for full precision
     {
-        u256 sig_z_sq = sig_z * sig_z;
-        u256 target = sig_gx * scale33;
+        u256 sig_z_sq = umul256(static_cast<int128::uint128_t>(sig_z), static_cast<int128::uint128_t>(sig_z));
         
         u256 rem_abs;
         bool is_neg = i256_sub(target, sig_z_sq, rem_abs);
@@ -169,8 +163,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
     // Find sig_z such that sig_z is the closest integer to sqrt(target)
     // First ensure sig_z² ≤ target, then check if sig_z+1 is closer
     {
-        u256 target = sig_gx * scale33;
-        u256 sig_z_sq = sig_z * sig_z;
+        u256 sig_z_sq = umul256(static_cast<int128::uint128_t>(sig_z), static_cast<int128::uint128_t>(sig_z));
         
         // Step 1: If sig_z² > target, decrement until sig_z² ≤ target
         while (sig_z_sq > target)
@@ -179,7 +172,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T
             u256 new_sig_z;
             i256_sub(sig_z, one, new_sig_z);
             sig_z = new_sig_z;
-            sig_z_sq = sig_z * sig_z;
+            sig_z_sq = umul256(static_cast<int128::uint128_t>(sig_z), static_cast<int128::uint128_t>(sig_z));
         }
         
         // Step 2: Round-to-nearest check
diff --git a/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp
index 6ee09b33e..cf6dd0a34 100644
--- a/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp
+++ b/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp
@@ -63,12 +63,14 @@ constexpr auto sqrt32_impl(T x, int exp10val) noexcept -> T
     //       = gx * 10^6 / sqrt(gx) = sqrt(gx) * 10^6
     std::uint64_t product = static_cast<std::uint64_t>(sig_gx) * r_scaled;
     std::uint32_t sig_z = static_cast<std::uint32_t>(product / scale7);
+
+    // Precompute target = sig_gx * 10^6 (avoids recomputing in Newton and rounding)
+    const std::uint64_t target = static_cast<std::uint64_t>(sig_gx) * scale6;
     
     // ---------- Newton correction with exact integer remainder ----------
-    // rem = sig_gx * 10^6 - sig_z² (exact integer)
+    // rem = target - sig_z² (exact integer)
     std::uint64_t z_squared = static_cast<std::uint64_t>(sig_z) * sig_z;
-    std::int64_t rem = static_cast<std::int64_t>(sig_gx) * static_cast<std::int64_t>(scale6) 
-                     - static_cast<std::int64_t>(z_squared);
+    std::int64_t rem = static_cast<std::int64_t>(target) - static_cast<std::int64_t>(z_squared);
     
     // Newton correction: correction = rem / (2 * sig_z)
     if (rem != 0 && sig_z > 0)
@@ -78,8 +80,7 @@ constexpr auto sqrt32_impl(T x, int exp10val) noexcept -> T
         
         // Recompute remainder
         z_squared = static_cast<std::uint64_t>(sig_z) * sig_z;
-        rem = static_cast<std::int64_t>(sig_gx) * static_cast<std::int64_t>(scale6) 
-            - static_cast<std::int64_t>(z_squared);
+        rem = static_cast<std::int64_t>(target) - static_cast<std::int64_t>(z_squared);
     }
 
     // ---------- Final rounding check ----------
diff --git a/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp
index 6e48ed7a5..72953cc36 100644
--- a/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp
+++ b/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp
@@ -67,13 +67,15 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T
     // Using int128::uint128_t for portability (works on all platforms including MSVC 32-bit)
     int128::uint128_t product = static_cast<int128::uint128_t>(sig_gx) * r_scaled;
     std::uint64_t sig_z = static_cast<std::uint64_t>(product / scale16);
+
+    // Precompute target = sig_gx * 10^15 (avoids recomputing in each Newton iteration and rounding)
+    const int128::uint128_t target = static_cast<int128::uint128_t>(sig_gx) * scale15;
     
     // ---------- Newton correction with exact integer remainder ----------
-    // rem = sig_gx * 10^15 - sig_z² (exact 128-bit integer)
+    // rem = target - sig_z² (exact 128-bit integer)
     {
         int128::uint128_t z_squared = static_cast<int128::uint128_t>(sig_z) * sig_z;
-        int128::int128_t rem = static_cast<int128::int128_t>(sig_gx) * static_cast<int128::int128_t>(scale15)
-                             - static_cast<int128::int128_t>(z_squared);
+        int128::int128_t rem = static_cast<int128::int128_t>(target) - static_cast<int128::int128_t>(z_squared);
         
         if (rem != 0 && sig_z > 0)
         {
@@ -85,8 +87,7 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T
     // Second Newton correction for full precision
     {
         int128::uint128_t z_squared = static_cast<int128::uint128_t>(sig_z) * sig_z;
-        int128::int128_t rem = static_cast<int128::int128_t>(sig_gx) * static_cast<int128::int128_t>(scale15)
-                             - static_cast<int128::int128_t>(z_squared);
+        int128::int128_t rem = static_cast<int128::int128_t>(target) - static_cast<int128::int128_t>(z_squared);
         
         if (rem != 0 && sig_z > 0)
         {
@@ -98,8 +99,7 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T
     // Final rounding check
     {
         int128::uint128_t z_squared = static_cast<int128::uint128_t>(sig_z) * sig_z;
-        int128::int128_t rem = static_cast<int128::int128_t>(sig_gx) * static_cast<int128::int128_t>(scale15)
-                             - static_cast<int128::int128_t>(z_squared);
+        int128::int128_t rem = static_cast<int128::int128_t>(target) - static_cast<int128::int128_t>(z_squared);
         
         if (rem < 0)
         {
diff --git a/include/boost/decimal/detail/cmath/sqrt_baseline.hpp b/include/boost/decimal/detail/cmath/sqrt_baseline.hpp
new file mode 100644
index 000000000..d753ff159
--- /dev/null
+++ b/include/boost/decimal/detail/cmath/sqrt_baseline.hpp
@@ -0,0 +1,179 @@
+// Copyright 2023 - 2024 Matt Borland
+// Copyright 2023 - 2024 Christopher Kormanyos
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+#ifndef BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP
+#define BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP
+
+#include <boost/decimal/fwd.hpp>
+#include <boost/decimal/detail/type_traits.hpp>
+#include <boost/decimal/detail/concepts.hpp>
+#include <boost/decimal/detail/config.hpp>
+
+#ifndef BOOST_DECIMAL_BUILD_MODULE
+#include <type_traits>
+#include <cstdint>
+#include <cmath>
+#endif
+
+namespace boost {
+namespace decimal {
+
+namespace detail {
+
+template <typename T>
+constexpr auto sqrt_baseline_impl(const T x) noexcept
+    BOOST_DECIMAL_REQUIRES(detail::is_decimal_floating_point_v, T)
+{
+    const auto fpc = fpclassify(x);
+
+    T result { };
+
+    #ifndef BOOST_DECIMAL_FAST_MATH
+    if ((fpc == FP_NAN) || (fpc == FP_ZERO))
+    {
+        result = x;
+    }
+    else if (signbit(x))
+    {
+        result = std::numeric_limits<T>::quiet_NaN();
+    }
+    else if (fpc == FP_INFINITE)
+    {
+        result = std::numeric_limits<T>::infinity();
+    }
+    #else
+    if (signbit(x))
+    {
+        result = T{0};
+    }
+    #endif
+    else
+    {
+        int exp10val { };
+
+        const auto gn { frexp10(x, &exp10val) };
+
+        const auto
+            zeros_removal
+            {
+                remove_trailing_zeros(gn)
+            };
+
+        const bool is_pure { zeros_removal.trimmed_number == 1U };
+
+        constexpr T one { 1 };
+
+        if(is_pure)
+        {
+            // Here, a pure power-of-10 argument gets a straightforward result.
+            // For argument 10^n where n is even, the result is exact.
+
+            const int p10 { exp10val + static_cast<int>(zeros_removal.number_of_removed_zeros) };
+
+            if (p10 == 0)
+            {
+                result = one;
+            }
+            else
+            {
+                const int p10_mod2 = (p10 % 2);
+
+                result = T { 1, p10 / 2 };
+
+                if (p10_mod2 == 1)
+                {
+                    result *= numbers::sqrt10_v<T>;
+                }
+                else if (p10_mod2 == -1)
+                {
+                    result /= numbers::sqrt10_v<T>;
+                }
+            }
+        }
+        else
+        {
+            // Scale the argument to the interval 1/10 <= x < 1.
+            T gx { gn, -std::numeric_limits<T>::digits10 };
+
+            exp10val += std::numeric_limits<T>::digits10;
+
+            // For this work we perform an order-2 Pade approximation of the square root
+            // at argument x = 1/2. This results in slightly more than 2 decimal digits
+            // of accuracy over the interval 1/10 <= x < 1.
+
+            // See also WolframAlpha at:
+            //   https://www.wolframalpha.com/input?i=FullSimplify%5BPadeApproximant%5BSqrt%5Bx%5D%2C+%7Bx%2C+1%2F2%2C+%7B2%2C+2%7D%7D%5D%5D
+
+            // PadeApproximant[Sqrt[x], {x, 1/2, {2, 2}}]
+            // FullSimplify[%]
+
+            // HornerForm[Numerator[Out[2]]]
+            // Results in:
+            //   1 + x (20 + 20 x)
+
+            // HornerForm[Denominator[Out[2]]]
+            // Results in:
+            //   5 Sqrt[2] + x (20 Sqrt[2] + 4 Sqrt[2] x)
+
+            constexpr T five { 5 };
+
+            result =
+                  (one + gx * ((one + gx) * 20))
+                / (numbers::sqrt2_v<T> * ((gx * 4) * (five + gx) + five));
+
+            // Perform 3, 4 or 5 Newton-Raphson iterations depending on precision.
+            // Note from above, we start with slightly more than 2 decimal digits
+            // of accuracy.
+
+            constexpr int iter_loops
+            {
+                  std::numeric_limits<T>::digits10 < 10 ? 3
+                : std::numeric_limits<T>::digits10 < 20 ? 4 : 5
+            };
+
+            for (int idx = 0; idx < iter_loops; ++idx)
+            {
+                result = (result + gx / result) / 2;
+            }
+
+            if (exp10val != 0)
+            {
+                // Rescale the result if necessary.
+
+                const int exp10val_mod2 = (exp10val % 2);
+
+                result *= T { 1, exp10val / 2 };
+
+                if (exp10val_mod2 == 1)
+                {
+                    result *= numbers::sqrt10_v<T>;
+                }
+                else if (exp10val_mod2 == -1)
+                {
+                    result /= numbers::sqrt10_v<T>;
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+} //namespace detail
+
+BOOST_DECIMAL_EXPORT template <typename T>
+constexpr auto sqrt_baseline(const T val) noexcept
+    BOOST_DECIMAL_REQUIRES(detail::is_decimal_floating_point_v, T)
+{
+    using evaluation_type = detail::evaluation_type_t<T>;
+
+    return static_cast<T>(detail::sqrt_baseline_impl(static_cast<evaluation_type>(val)));
+}
+
+} //namespace decimal
+} //namespace boost
+
+#endif //BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP
+
diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp
index 0fe1e7dae..ecd90af65 100644
--- a/include/boost/decimal/detail/u256.hpp
+++ b/include/boost/decimal/detail/u256.hpp
@@ -923,6 +923,26 @@ constexpr u256 umul256(const int128::uint128_t& a, const int128::uint128_t& b) n
     return result;
 }
 
+// 128×64→256 multiplication (SoftFloat-style lightweight primitive)
+// Used when rhs is 64-bit (e.g. r_scaled from approx_recip_sqrt64)
+// Explicit uint128_t cast ensures 64×64→128 widening (a.low*b otherwise returns uint64_t on some platforms)
+constexpr u256 mul128By64(const int128::uint128_t& a, const std::uint64_t b) noexcept
+{
+    const int128::uint128_t p0 = int128::uint128_t{a.low} * b;   // 64×64→128
+    const int128::uint128_t p1 = int128::uint128_t{a.high} * b; // 64×64→128
+    const auto mid = p1.low + p0.high;
+    const std::uint64_t carry1 = (mid < p0.high) ? 1U : 0U;
+    const auto hi = p1.high + carry1;
+    const std::uint64_t carry2 = (hi < carry1) ? 1U : 0U;
+
+    u256 result{};
+    result.bytes[0] = p0.low;
+    result.bytes[1] = mid;
+    result.bytes[2] = hi;
+    result.bytes[3] = carry2;
+    return result;
+}
+
 constexpr u256& u256::operator*=(const u256& rhs) noexcept
 {
     *this = *this * rhs;