diff --git a/doc/decimal_float_and_sqrt.md b/doc/decimal_float_and_sqrt.md index 3916a2061..cf8b58b83 100644 --- a/doc/decimal_float_and_sqrt.md +++ b/doc/decimal_float_and_sqrt.md @@ -140,13 +140,16 @@ Interpolation: `r0 = k0[index] - (k1[index] * eps16 >> 16)` gives ~10 bits initi - sig_gx = gx × 10⁶, scale6=10⁶, scale7=10⁷ - r_scaled = approx_recip_sqrt32(sig_gx) ≈ 10⁷/sqrt(gx) - sig_z = sig_gx × r_scaled / 10⁷ (initial sqrt(gx)×10⁶) -- **1 Newton** correction: rem = sig_gx×10⁶ − sig_z², correction = rem/(2×sig_z) +- Precompute target = sig_gx×10⁶ for Newton and rounding +- **1 Newton** correction: rem = target − sig_z², correction = rem/(2×sig_z) - Final rounding: if rem < 0, --sig_z ```cpp std::uint64_t product = static_cast(sig_gx) * r_scaled; std::uint32_t sig_z = static_cast(product / scale7); -// Newton: rem = sig_gx*scale6 - sig_z²; sig_z += rem/(2*sig_z) +// Precompute target (avoids recomputing in Newton and rounding) +const std::uint64_t target = static_cast(sig_gx) * scale6; +// Newton: rem = target - sig_z²; sig_z += rem/(2*sig_z) T z{sig_z, -6}; ``` @@ -154,12 +157,15 @@ T z{sig_z, -6}; - sig_gx = gx × 10¹⁵, scale15=10¹⁵, scale16=10¹⁶ - r_scaled = approx_recip_sqrt64(sig_gx) ≈ 10¹⁶/sqrt(gx) +- Precompute target = sig_gx×10¹⁵ for Newton and rounding - Uses **int128::uint128_t** / **int128::int128_t** for portability (all platforms including MSVC 32-bit) ```cpp int128::uint128_t product = static_cast(sig_gx) * r_scaled; std::uint64_t sig_z = static_cast(product / scale16); -// 2 Newton corrections: rem = sig_gx*scale15 - sig_z²; correction = rem/(2*sig_z) +// Precompute target (avoids recomputing in each Newton iteration and rounding) +const int128::uint128_t target = static_cast(sig_gx) * scale15; +// 2 Newton corrections: rem = target - sig_z²; correction = rem/(2*sig_z) // Final: if rem < 0, --sig_z T z{sig_z, -15}; ``` @@ -167,16 +173,19 @@ T z{sig_z, -15}; ### 6.3 decimal128 (sqrt128_impl.hpp) - Uses **frexp10** for exact 34-digit significand (no FP loss) -- sig_gx = gx_sig (from frexp10), scale33 = 10³³ (64×64→128, then construct u256) +- gx_sig from frexp10, scale33_128 = 10³³ (64×64→128) - Initial r from approx_recip_sqrt64(sig_gx_approx) where sig_gx_approx = gx_sig/10¹⁸ -- sig_z = umul256(gx_sig, r_scaled) / 10¹⁶ (128×64→256 multiplication) +- sig_z = mul128By64(gx_sig, r_scaled) / 10¹⁶ (SoftFloat-style 128×64→256, r_scaled is 64-bit) +- sig_z² = umul256(uint128(sig_z), uint128(sig_z)) — sig_z fits in 128 bits, avoids full u256×u256 - **3 Newton** iterations using u256 / i256_sub - **Round-to-nearest**: ensure sig_z² ≤ target; if target − sig_z² > sig_z, increment sig_z ```cpp -u256 sig_gx{gx_sig}; // from frexp10 -u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16; -// Newton: target = sig_gx*scale33; rem = target - sig_z²; sig_z += rem/(2*sig_z) +// Precompute target (avoids recomputing in each Newton iteration) +const u256 target = umul256(gx_sig, scale33_128); +u256 sig_z = mul128By64(gx_sig, r_scaled) / scale16; +// Newton: sig_z < sqrt(10)*10^33 fits in 128 bits → sig_z_sq = umul256(uint128(sig_z), uint128(sig_z)) +// rem = target - sig_z_sq; sig_z += rem/(2*sig_z) // Final: while sig_z²>target --sig_z; if target-sig_z²>sig_z ++sig_z // Convert: z = T{sig_z_hi,-16} + T{sig_z_lo,-33} ``` @@ -185,7 +194,7 @@ u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16; |------|-------------|--------------|--------|----------------| | decimal32 | 7 digits | uint64 | 1 | floor | | decimal64 | 16 digits | int128::uint128_t (portable) | 2 | floor | -| decimal128 | 34 digits | u256, umul256 | 3 | round-to-nearest | +| decimal128 | 34 digits | u256, mul128By64, umul256 | 3 | round-to-nearest | --- @@ -194,14 +203,16 @@ u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16; ### 7.1 Implementation Files ``` -include/boost/decimal/detail/cmath/ -├── sqrt.hpp # Entry point, special cases, normalize, dispatch -└── impl/ - ├── sqrt_lookup.hpp # 90-entry k0/k1 tables - ├── approx_recip_sqrt_impl.hpp # approx_recip_sqrt32, approx_recip_sqrt64 - ├── sqrt32_impl.hpp # decimal32: integer rem, 1 Newton - ├── sqrt64_impl.hpp # decimal64: int128::uint128_t (portable), 2 Newton - └── sqrt128_impl.hpp # decimal128: u256, frexp10, round-to-nearest +include/boost/decimal/detail/ +├── cmath/ +│ ├── sqrt.hpp # Entry point, special cases, normalize, dispatch +│ └── impl/ +│ ├── sqrt_lookup.hpp # 90-entry k0/k1 tables +│ ├── approx_recip_sqrt_impl.hpp # approx_recip_sqrt32, approx_recip_sqrt64 +│ ├── sqrt32_impl.hpp # decimal32: integer rem, 1 Newton +│ ├── sqrt64_impl.hpp # decimal64: int128::uint128_t (portable), 2 Newton +│ └── sqrt128_impl.hpp # decimal128: u256, frexp10, round-to-nearest +└── u256.hpp # mul128By64 (128×64→256), umul256 (128×128→256) ``` ### 7.2 Table Design (sqrt_lookup.hpp) @@ -274,12 +285,12 @@ If target − sig_z² > sig_z, then (sig_z+0.5)² < target, so round up. | Type | Baseline | Current | Speedup | |--------------------|------------|-----------|---------------| -| decimal32_t | 1.55M ops/s | 11.68M ops/s | ✓ 7.53x (+653.2%) | -| decimal64_t | 0.78M ops/s | 4.80M ops/s | ✓ 6.17x (+517.3%) | -| decimal128_t | 0.24M ops/s | 0.80M ops/s | ✓ 3.26x (+226.1%) | -| decimal_fast32_t | 1.83M ops/s | 8.99M ops/s | ✓ 4.92x (+391.9%) | -| decimal_fast64_t | 0.80M ops/s | 3.89M ops/s | ✓ 4.86x (+385.9%) | -| decimal_fast128_t | 0.22M ops/s | 0.71M ops/s | ✓ 3.18x (+217.9%) | +| decimal32_t | 1.49M ops/s | 11.36M ops/s | ✓ 7.60x (+660.2%) | +| decimal64_t | 0.74M ops/s | 4.64M ops/s | ✓ 6.23x (+523.2%) | +| decimal128_t | 0.23M ops/s | 0.93M ops/s | ✓ 3.97x (+296.6%) | +| decimal_fast32_t | 1.71M ops/s | 9.34M ops/s | ✓ 5.46x (+446.2%) | +| decimal_fast64_t | 0.78M ops/s | 3.69M ops/s | ✓ 4.71x (+370.7%) | +| decimal_fast128_t | 0.22M ops/s | 0.81M ops/s | ✓ 3.66x (+265.7%) | *Benchmark scripts `sqrt_bench.py` and `run_srqt_test.sh` are not in the main branch. To run: `git checkout d54af19 -- run_srqt_test.sh sqrt_bench.py test/benchmark_sqrt.cpp include/boost/decimal/detail/cmath/sqrt_baseline.hpp`.* diff --git a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp index f310e73c7..d492971cd 100644 --- a/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp +++ b/include/boost/decimal/detail/cmath/impl/sqrt128_impl.hpp @@ -59,19 +59,15 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T constexpr std::uint64_t scale17 = 100000000000000000ULL; // 10^17 constexpr std::uint64_t scale16 = 10000000000000000ULL; // 10^16 // scale33 = 10^33 = 10^17 * 10^16, 64×64→128 suffices - const u256 scale33{static_cast(scale17) * scale16}; + const int128::uint128_t scale33_128{static_cast(scale17) * scale16}; // ---------- Get exact significand using frexp10 ---------- // frexp10 returns the full 34-digit significand directly from decimal128 int gx_exp{}; auto gx_sig = frexp10(gx, &gx_exp); // gx_sig is uint128, in [10^33, 10^34) - // Convert to u256 for computation - u256 sig_gx{gx_sig}; - - // Adjust scale: gx = gx_sig * 10^gx_exp, and gx is in [1, 10) - // So gx_sig * 10^gx_exp is in [1, 10), meaning gx_exp = -(digits10-1) = -33 - // sig_gx is already gx * 10^33, which is what we want + // gx = gx_sig * 10^gx_exp, and gx is in [1, 10) + // So gx_exp = -(digits10-1) = -33; gx_sig * 10^33 is our scaled significand // Get high 16 digits for initial approximation // sig_gx_approx = gx * 10^15 = gx_sig / 10^18 @@ -81,14 +77,14 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // ---------- Get 1/sqrt approximation ---------- std::uint64_t r_scaled = approx_recip_sqrt64(sig_gx_approx, 0); - // ---------- Compute initial sig_z = sig_gx * r / 10^17 ---------- + // ---------- Compute initial sig_z = sig_gx * r / 10^16 ---------- // sig_z ≈ sqrt(gx) * 10^33 - // Initial: sig_z = (gx * 10^33) * (10^16 / sqrt(gx)) / 10^16 - // = gx * 10^33 / sqrt(gx) = sqrt(gx) * 10^33 - // But r_scaled is 10^16/sqrt(gx), so: // sig_z = sig_gx * r_scaled / 10^16 - // Using umul256 for optimized 128-bit * 128-bit -> 256-bit multiplication - u256 sig_z = umul256(gx_sig, int128::uint128_t{r_scaled}) / scale16; + // r_scaled is 64-bit; use mul128By64 (SoftFloat-style) instead of full umul256 + u256 sig_z = mul128By64(gx_sig, r_scaled) / scale16; + + // Precompute target = sig_gx * 10^33 (avoids recomputing in each Newton iteration) + const u256 target = umul256(gx_sig, scale33_128); // ---------- Newton corrections using u256 ---------- // Newton: sig_z_new = sig_z + (sig_gx * 10^33 - sig_z²) / (2 * sig_z) @@ -96,9 +92,9 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // So we compare sig_gx * 10^33 vs sig_z² // First Newton iteration + // sig_z < sqrt(10)*10^33 always fits in 128 bits → use umul256 (4 muls) not u256×u256 (64 muls) { - u256 sig_z_sq = sig_z * sig_z; // scaled by 10^66 - u256 target = sig_gx * scale33; // sig_gx * 10^33, also scaled by 10^66 + u256 sig_z_sq = umul256(static_cast(sig_z), static_cast(sig_z)); u256 rem_abs; bool is_neg = i256_sub(target, sig_z_sq, rem_abs); @@ -121,8 +117,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // Second Newton iteration { - u256 sig_z_sq = sig_z * sig_z; - u256 target = sig_gx * scale33; + u256 sig_z_sq = umul256(static_cast(sig_z), static_cast(sig_z)); u256 rem_abs; bool is_neg = i256_sub(target, sig_z_sq, rem_abs); @@ -144,8 +139,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // Third Newton iteration for full precision { - u256 sig_z_sq = sig_z * sig_z; - u256 target = sig_gx * scale33; + u256 sig_z_sq = umul256(static_cast(sig_z), static_cast(sig_z)); u256 rem_abs; bool is_neg = i256_sub(target, sig_z_sq, rem_abs); @@ -169,8 +163,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T // Find sig_z such that sig_z is the closest integer to sqrt(target) // First ensure sig_z² ≤ target, then check if sig_z+1 is closer { - u256 target = sig_gx * scale33; - u256 sig_z_sq = sig_z * sig_z; + u256 sig_z_sq = umul256(static_cast(sig_z), static_cast(sig_z)); // Step 1: If sig_z² > target, decrement until sig_z² ≤ target while (sig_z_sq > target) @@ -179,7 +172,7 @@ constexpr auto sqrt128_impl(T x, int exp10val) noexcept -> T u256 new_sig_z; i256_sub(sig_z, one, new_sig_z); sig_z = new_sig_z; - sig_z_sq = sig_z * sig_z; + sig_z_sq = umul256(static_cast(sig_z), static_cast(sig_z)); } // Step 2: Round-to-nearest check diff --git a/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp index 6ee09b33e..cf6dd0a34 100644 --- a/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp +++ b/include/boost/decimal/detail/cmath/impl/sqrt32_impl.hpp @@ -63,12 +63,14 @@ constexpr auto sqrt32_impl(T x, int exp10val) noexcept -> T // = gx * 10^6 / sqrt(gx) = sqrt(gx) * 10^6 std::uint64_t product = static_cast(sig_gx) * r_scaled; std::uint32_t sig_z = static_cast(product / scale7); + + // Precompute target = sig_gx * 10^6 (avoids recomputing in Newton and rounding) + const std::uint64_t target = static_cast(sig_gx) * scale6; // ---------- Newton correction with exact integer remainder ---------- - // rem = sig_gx * 10^6 - sig_z² (exact integer) + // rem = target - sig_z² (exact integer) std::uint64_t z_squared = static_cast(sig_z) * sig_z; - std::int64_t rem = static_cast(sig_gx) * static_cast(scale6) - - static_cast(z_squared); + std::int64_t rem = static_cast(target) - static_cast(z_squared); // Newton correction: correction = rem / (2 * sig_z) if (rem != 0 && sig_z > 0) @@ -78,8 +80,7 @@ constexpr auto sqrt32_impl(T x, int exp10val) noexcept -> T // Recompute remainder z_squared = static_cast(sig_z) * sig_z; - rem = static_cast(sig_gx) * static_cast(scale6) - - static_cast(z_squared); + rem = static_cast(target) - static_cast(z_squared); } // ---------- Final rounding check ---------- diff --git a/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp b/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp index 6e48ed7a5..72953cc36 100644 --- a/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp +++ b/include/boost/decimal/detail/cmath/impl/sqrt64_impl.hpp @@ -67,13 +67,15 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T // Using int128::uint128_t for portability (works on all platforms including MSVC 32-bit) int128::uint128_t product = static_cast(sig_gx) * r_scaled; std::uint64_t sig_z = static_cast(product / scale16); + + // Precompute target = sig_gx * 10^15 (avoids recomputing in each Newton iteration and rounding) + const int128::uint128_t target = static_cast(sig_gx) * scale15; // ---------- Newton correction with exact integer remainder ---------- - // rem = sig_gx * 10^15 - sig_z² (exact 128-bit integer) + // rem = target - sig_z² (exact 128-bit integer) { int128::uint128_t z_squared = static_cast(sig_z) * sig_z; - int128::int128_t rem = static_cast(sig_gx) * static_cast(scale15) - - static_cast(z_squared); + int128::int128_t rem = static_cast(target) - static_cast(z_squared); if (rem != 0 && sig_z > 0) { @@ -85,8 +87,7 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T // Second Newton correction for full precision { int128::uint128_t z_squared = static_cast(sig_z) * sig_z; - int128::int128_t rem = static_cast(sig_gx) * static_cast(scale15) - - static_cast(z_squared); + int128::int128_t rem = static_cast(target) - static_cast(z_squared); if (rem != 0 && sig_z > 0) { @@ -98,8 +99,7 @@ constexpr auto sqrt64_impl(T x, int exp10val) noexcept -> T // Final rounding check { int128::uint128_t z_squared = static_cast(sig_z) * sig_z; - int128::int128_t rem = static_cast(sig_gx) * static_cast(scale15) - - static_cast(z_squared); + int128::int128_t rem = static_cast(target) - static_cast(z_squared); if (rem < 0) { diff --git a/include/boost/decimal/detail/cmath/sqrt_baseline.hpp b/include/boost/decimal/detail/cmath/sqrt_baseline.hpp new file mode 100644 index 000000000..d753ff159 --- /dev/null +++ b/include/boost/decimal/detail/cmath/sqrt_baseline.hpp @@ -0,0 +1,179 @@ +// Copyright 2023 - 2024 Matt Borland +// Copyright 2023 - 2024 Christopher Kormanyos +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +#ifndef BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP +#define BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP + +#include +#include +#include +#include + +#ifndef BOOST_DECIMAL_BUILD_MODULE +#include +#include +#include +#endif + +namespace boost { +namespace decimal { + +namespace detail { + +template +constexpr auto sqrt_baseline_impl(const T x) noexcept + BOOST_DECIMAL_REQUIRES(detail::is_decimal_floating_point_v, T) +{ + const auto fpc = fpclassify(x); + + T result { }; + + #ifndef BOOST_DECIMAL_FAST_MATH + if ((fpc == FP_NAN) || (fpc == FP_ZERO)) + { + result = x; + } + else if (signbit(x)) + { + result = std::numeric_limits::quiet_NaN(); + } + else if (fpc == FP_INFINITE) + { + result = std::numeric_limits::infinity(); + } + #else + if (signbit(x)) + { + result = T{0}; + } + #endif + else + { + int exp10val { }; + + const auto gn { frexp10(x, &exp10val) }; + + const auto + zeros_removal + { + remove_trailing_zeros(gn) + }; + + const bool is_pure { zeros_removal.trimmed_number == 1U }; + + constexpr T one { 1 }; + + if(is_pure) + { + // Here, a pure power-of-10 argument gets a straightforward result. + // For argument 10^n where n is even, the result is exact. + + const int p10 { exp10val + static_cast(zeros_removal.number_of_removed_zeros) }; + + if (p10 == 0) + { + result = one; + } + else + { + const int p10_mod2 = (p10 % 2); + + result = T { 1, p10 / 2 }; + + if (p10_mod2 == 1) + { + result *= numbers::sqrt10_v; + } + else if (p10_mod2 == -1) + { + result /= numbers::sqrt10_v; + } + } + } + else + { + // Scale the argument to the interval 1/10 <= x < 1. + T gx { gn, -std::numeric_limits::digits10 }; + + exp10val += std::numeric_limits::digits10; + + // For this work we perform an order-2 Pade approximation of the square root + // at argument x = 1/2. This results in slightly more than 2 decimal digits + // of accuracy over the interval 1/10 <= x < 1. + + // See also WolframAlpha at: + // https://www.wolframalpha.com/input?i=FullSimplify%5BPadeApproximant%5BSqrt%5Bx%5D%2C+%7Bx%2C+1%2F2%2C+%7B2%2C+2%7D%7D%5D%5D + + // PadeApproximant[Sqrt[x], {x, 1/2, {2, 2}}] + // FullSimplify[%] + + // HornerForm[Numerator[Out[2]]] + // Results in: + // 1 + x (20 + 20 x) + + // HornerForm[Denominator[Out[2]]] + // Results in: + // 5 Sqrt[2] + x (20 Sqrt[2] + 4 Sqrt[2] x) + + constexpr T five { 5 }; + + result = + (one + gx * ((one + gx) * 20)) + / (numbers::sqrt2_v * ((gx * 4) * (five + gx) + five)); + + // Perform 3, 4 or 5 Newton-Raphson iterations depending on precision. + // Note from above, we start with slightly more than 2 decimal digits + // of accuracy. + + constexpr int iter_loops + { + std::numeric_limits::digits10 < 10 ? 3 + : std::numeric_limits::digits10 < 20 ? 4 : 5 + }; + + for (int idx = 0; idx < iter_loops; ++idx) + { + result = (result + gx / result) / 2; + } + + if (exp10val != 0) + { + // Rescale the result if necessary. + + const int exp10val_mod2 = (exp10val % 2); + + result *= T { 1, exp10val / 2 }; + + if (exp10val_mod2 == 1) + { + result *= numbers::sqrt10_v; + } + else if (exp10val_mod2 == -1) + { + result /= numbers::sqrt10_v; + } + } + } + } + + return result; +} + +} //namespace detail + +BOOST_DECIMAL_EXPORT template +constexpr auto sqrt_baseline(const T val) noexcept + BOOST_DECIMAL_REQUIRES(detail::is_decimal_floating_point_v, T) +{ + using evaluation_type = detail::evaluation_type_t; + + return static_cast(detail::sqrt_baseline_impl(static_cast(val))); +} + +} //namespace decimal +} //namespace boost + +#endif //BOOST_DECIMAL_DETAIL_CMATH_SQRT_BASELINE_HPP + diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index 0fe1e7dae..ecd90af65 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -923,6 +923,26 @@ constexpr u256 umul256(const int128::uint128_t& a, const int128::uint128_t& b) n return result; } +// 128×64→256 multiplication (SoftFloat-style lightweight primitive) +// Used when rhs is 64-bit (e.g. r_scaled from approx_recip_sqrt64) +// Explicit uint128_t cast ensures 64×64→128 widening (a.low*b otherwise returns uint64_t on some platforms) +constexpr u256 mul128By64(const int128::uint128_t& a, const std::uint64_t b) noexcept +{ + const int128::uint128_t p0 = int128::uint128_t{a.low} * b; // 64×64→128 + const int128::uint128_t p1 = int128::uint128_t{a.high} * b; // 64×64→128 + const auto mid = p1.low + p0.high; + const std::uint64_t carry1 = (mid < p0.high) ? 1U : 0U; + const auto hi = p1.high + carry1; + const std::uint64_t carry2 = (hi < carry1) ? 1U : 0U; + + u256 result{}; + result.bytes[0] = p0.low; + result.bytes[1] = mid; + result.bytes[2] = hi; + result.bytes[3] = carry2; + return result; +} + constexpr u256& u256::operator*=(const u256& rhs) noexcept { *this = *this * rhs;