diff --git a/src/Math/Interpolate.h b/src/Math/Interpolate.h new file mode 100644 index 00000000..ef71a717 --- /dev/null +++ b/src/Math/Interpolate.h @@ -0,0 +1,148 @@ +/* Copyright Jukka Jylänki + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +/** @file Interpolate.h + @author Jukka Jylänki + @brief */ +#pragma once + +#include "assume.h" +#include "MathFunc.h" +#include "MathConstants.h" +#include "MathNamespace.h" + +MATH_BEGIN_NAMESPACE + +// Interpolates [0,1]->[0,1] in a way that starts smoothly, but stops sharply. +// Sometimes also referred to as EaseIn or EaseInQuad. +inline float SmoothStart(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + return t*t; +} + +// Like SmoothStart, but even smoother start (and sharper stop) +inline float SmoothStart3(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + return t*t*t; +} + +inline float SmoothStart4(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float tt = t*t; + return tt*tt; +} + +inline float SmoothStart5(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float tt = t*t; + return tt*tt*t; +} + +// Starts sharply at (0,0), but stops smoothly to (1,1). I.e. 
reverse of SmoothStart. +inline float SmoothStop(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float oneT = 1.f - t; + return 1.f - oneT*oneT; +} + +inline float SmoothStop3(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float oneT = 1.f - t; + return 1.f - oneT*oneT*oneT; +} + +inline float SmoothStop4(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float oneT = 1.f - t; + oneT *= oneT; + return 1.f - oneT*oneT; +} + +inline float SmoothStop5(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + float oneT = 1.f - t; + float oneT2 = oneT * oneT; + return 1.f - oneT2*oneT2*oneT; +} + +// Starts out as SmoothStop, and linearly blends to SmoothStart +inline float SharpStep(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + // t * t^2 + (1-t)*(1 - (1-t)^2) + // = 2t^3 - 3t^2 + 2t + // = t(2t^2 + 2 - 3t) + float tt = t*t; + return t*(2.f * tt + 2.f - 3.f * t); +} + +// Starts out as SmoothStart, and linearly blends to SmoothStop. +// Also called "cubic Hermite interpolation" +inline float SmoothStep(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + // (1-t) * SmoothStart(t) + t * SmoothStop(t) + // = (1-t) * t^2 + t*(1 - (1-t)^2) + // = 3t^2 - 2t^3 + + float tt = t*t; + return 3.f*tt - 2.f*tt*t; +} + +// N.b.
it is possible to define higher order linear +// blends, like + +// (1-t) * SmoothStart4(t) + t * SmoothStop4(t) +// = -2t^5+5t^4-6t^3+4t^2 + +// (1-t) * SmoothStart5(t) + t * SmoothStop5(t) +// = -4t^5+10t^4-10t^3+5t^2 + +// Nth order blend: (1-t) * t^n + t * (1-(1-t)^n) + +// and so on. As the exponent grows, the function becomes +// "sharper" in the beginning and the end. + +inline float SmoothStep5(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + // 6t^5 -15t^4 + 10t^3 + float tt = t*t; + return tt*t*(6.f*tt - 15.f*t + 10.f); +} + +inline float SmoothStep7(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + // -20t^7 + 70t^6 - 84t^5 + 35t^4 + float tt = t*t; + float tttt = tt*tt; + return tttt*(-20.f*tt*t + 70.f*tt - 84.f*t + 35.f); +} + +inline float CosineStep01(float t) +{ + assume1(t >= 0.f && t <= 1.f, t); // Input should be pre-clamped for performance (combine with Clamp01() from MathFunc.h) + return 0.5f - Cos(t*pi) * 0.5f; +} + +MATH_END_NAMESPACE diff --git a/src/Math/MathFunc.cpp b/src/Math/MathFunc.cpp index 352bf11c..ab3418a5 100644 --- a/src/Math/MathFunc.cpp +++ b/src/Math/MathFunc.cpp @@ -499,7 +499,7 @@ float Step(float y, float x) return (x >= y) ? 1.f : 0.f; } -float SmoothStep(float min, float max, float x) +float Ramp(float min, float max, float x) { return x <= min ? 0.f : (x >= max ? 1.f : (x - min) / (max - min)); } diff --git a/src/Math/MathFunc.h b/src/Math/MathFunc.h index d6bb9d86..f86f12da 100644 --- a/src/Math/MathFunc.h +++ b/src/Math/MathFunc.h @@ -189,7 +189,7 @@ float InvLerp(float a, float b, float x); float Step(float y, float x); /// See http://msdn.microsoft.com/en-us/library/bb509658(v=vs.85).aspx /** @see Lerp(), LerpMod(), InvLerp(), Step(), PingPongMod(), Mod(), ModPos(), Frac(). 
*/ -float SmoothStep(float min, float max, float x); +float Ramp(float min, float max, float x); /// Limits x to the range [0, mod], but instead of wrapping around from mod to 0, the result will move back /// from mod to 0 as x goes from mod to 2*mod. /** @see Lerp(), LerpMod(), InvLerp(), Step(), SmoothStep(), Mod(), ModPos(), Frac(). */ diff --git a/src/Math/Quat.cpp b/src/Math/Quat.cpp index 535a5079..ef91243f 100644 --- a/src/Math/Quat.cpp +++ b/src/Math/Quat.cpp @@ -303,7 +303,7 @@ Quat MUST_USE_RESULT Quat::Slerp(const Quat &q2, float t) const #if defined(MATH_AUTOMATIC_SSE) && defined(MATH_SSE) simd4f angle = dot4_ps(q, q2.q); // simd4f neg = cmplt_ps(angle, zero_ps()); // angle < 0? - neg = and_ps(neg, set1_ps_hex(0x80000000)); // Convert 0/0xFFFFFFFF mask to a 0x/0x80000000 mask. + neg = and_ps(neg, set1_ps(-0.0f)); // Convert 0/0xFFFFFFFF mask to a 0/0x80000000 mask. // neg = s4i_to_s4f(_mm_slli_epi32(s4f_to_s4i(neg), 31)); // A SSE2-esque way to achieve the above would be this, but this seems to clock slower (12.04 clocks vs 11.97 clocks) angle = xor_ps(angle, neg); // if angle was negative, make it positive.
simd4f one = set1_ps(1.f); diff --git a/src/Math/quat_simd.h b/src/Math/quat_simd.h index b636fee0..ddbcfb12 100644 --- a/src/Math/quat_simd.h +++ b/src/Math/quat_simd.h @@ -15,7 +15,7 @@ MATH_BEGIN_NAMESPACE inline void quat_to_mat3x4(simd4f q, simd4f t, simd4f *m) { simd4f one = set_ps(0, 0, 0, 1); - const simd4f sseX1 = set_ps_hex((int)0x80000000UL, (int)0x80000000UL, 0, (int)0x80000000UL); // [-, -, + -] + const simd4f sseX1 = set_ps(-0.0f, -0.0f, 0, -0.0f); // [-, -, + -] simd4f q2 = add_ps(q, q); // [2w 2z 2y 2x] simd4f t2 = _mm_add_ss(xor_ps(mul_ps(zwww_ps(q), zzyx_ps(q2)), sseX1), one); // [-2xw -2yw 2zw 1-2zz] const simd4f sseX0 = yzwx_ps(sseX1); // [-, -, -, +] @@ -119,7 +119,7 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2) x*r.y - y*r.x + z*r.w + w*r.z, -x*r.x - y*r.y - z*r.z + w*r.w); */ #if defined(MATH_SSE) - const simd4f signy = set_ps_hex(0x80000000u, 0x80000000u, 0, 0); // [- - + +] + const simd4f signy = set_ps(-0.0f, -0.0f, 0, 0); // [- - + +] const simd4f signz = wxxw_ps(signy); // [- + + -] simd4f X = xxxx_ps(q1); @@ -148,9 +148,9 @@ FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2) quat_mul_quat_asm(&q1, &q2, &ret); return ret; #elif defined(MATH_NEON) - static const float32x4_t signx = set_ps_hex_const(0x80000000u, 0, 0x80000000u, 0); - static const float32x4_t signy = set_ps_hex_const(0x80000000u, 0x80000000u, 0, 0); - static const float32x4_t signz = set_ps_hex_const(0x80000000u, 0, 0, 0x80000000u); + static const float32x4_t signx = set_ps(-0.0f, 0, -0.0f, 0); + static const float32x4_t signy = set_ps(-0.0f, -0.0f, 0, 0); + static const float32x4_t signz = set_ps(-0.0f, 0, 0, -0.0f); const float32_t *q1f = (const float32_t *)&q1; float32x4_t X = xor_ps(signx, vdupq_n_f32(q1f[0])); @@ -178,7 +178,7 @@ FORCE_INLINE simd4f quat_div_quat(simd4f q1, simd4f q2) -x*r.y + y*r.x + z*r.w - w*r.z, x*r.x + y*r.y + z*r.z + w*r.w); */ - const simd4f signx = set_ps_hex(0x80000000u, 0, 0x80000000u, 0); // [- + - +] + const simd4f signx = 
set_ps(-0.0f, 0, -0.0f, 0); // [- + - +] const simd4f signy = xxww_ps(signx); // [- - + +] const simd4f signz = wxxw_ps(signx); // [- + + -] diff --git a/tests/NEONTests.cpp b/tests/NEONTests.cpp index ae63f560..410cc917 100644 --- a/tests/NEONTests.cpp +++ b/tests/NEONTests.cpp @@ -7,6 +7,7 @@ #include "TestRunner.h" #include "TestData.h" #include "../src/Math/SSEMath.h" +#include "../src/Math/simd.h" #include "../src/Math/float4x4_sse.h" #include "../src/Math/float4_neon.h" @@ -18,28 +19,18 @@ BENCHMARK(float4_op_add, "float4 + float4") } BENCHMARK_END; -#ifdef MATH_NEON - -BENCHMARK(rsqrtq, "neon") -{ - float32x4_t r = v[i]; - float32x4_t rcp = vrsqrteq_f32(r); - float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp); - v3[i] = ret; -} -BENCHMARK_END - -BENCHMARK(rsqrt, "neon") +UNIQUE_TEST(set_ps_neg_zero) { - float32x4_t r = v[i]; - float32x2_t rcp = vrsqrte_f32(vget_low_f32(r)); - float32x2_t hi = vget_high_f32(r); - float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp); - v3[i] = vcombine_f32(ret, hi); + simd4f constant = set1_ps(-0.f); + u32 arr[4]; + memcpy(arr, &constant, sizeof(arr)); + asserteq(arr[0], 0x80000000u); + asserteq(arr[1], 0x80000000u); + asserteq(arr[2], 0x80000000u); + asserteq(arr[3], 0x80000000u); } -BENCHMARK_END -UNIQUE_TEST(set_ps_const) +UNIQUE_TEST(set_ps_const_vec) { simd4f constant = set_ps_const(4.f, 3.f, 2.f, 1.f); float arr[4]; @@ -61,6 +52,27 @@ UNIQUE_TEST(set_ps_const_hex) asserteq(arr[3], -0.0f); } +#ifdef MATH_NEON + +BENCHMARK(rsqrtq, "neon") +{ + float32x4_t r = v[i]; + float32x4_t rcp = vrsqrteq_f32(r); + float32x4_t ret = vmulq_f32(vrsqrtsq_f32(vmulq_f32(rcp, rcp), r), rcp); + v3[i] = ret; +} +BENCHMARK_END + +BENCHMARK(rsqrt, "neon") +{ + float32x4_t r = v[i]; + float32x2_t rcp = vrsqrte_f32(vget_low_f32(r)); + float32x2_t hi = vget_high_f32(r); + float32x2_t ret = vmul_f32(vrsqrts_f32(vmul_f32(rcp, rcp), vget_low_f32(r)), rcp); + v3[i] = vcombine_f32(ret, hi); +} 
+BENCHMARK_END + #ifdef ANDROID FORCE_INLINE void inline_asm_add(void *v1, void *v2, void *out)