From d38a86ce3c7ca92b7833fa091d2a0388fa8b4221 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 15 Nov 2023 06:02:59 +0100 Subject: [PATCH 01/10] Initial compile for Power10 --- CMakeLists.txt | 9 +++++++-- src/cpu/cpu_info.cc | 16 ++++++++++++++++ src/cpu/cpu_info.h | 2 ++ src/cpu/cpu_isa.cc | 10 ++++++++++ src/cpu/cpu_isa.h | 10 +++++++++- src/cpu/kernels.cc | 3 +++ 6 files changed, 47 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce8b3d31f..b2aab087e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ set(SOURCES src/cpu/backend.cc src/cpu/cpu_info.cc src/cpu/cpu_isa.cc - src/cpu/kernels.cc + #src/cpu/kernels.cc src/cpu/parallel.cc src/cpu/primitives.cc src/decoding.cc @@ -214,7 +214,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)" elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)") add_definitions(-DCT2_X86_BUILD) set(CT2_BUILD_ARCH "x86_64") - +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(PPC64LE)") + add_definitions(-DCT2_PPC64LE_BUILD) + set(CT2_BUILD_ARCH "ppc64le") + if(BUILD_SHARED_LIBS) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() @@ -241,6 +244,8 @@ if(ENABLE_CPU_DISPATCH) endif() elseif(CT2_BUILD_ARCH STREQUAL "arm64") ct2_compile_kernels_for_isa(neon "-DUSE_NEON") + elseif(CT2_BUILD_ARCH STREQUAL "ppc64le") + ct2_compile_kernels_for_isa(ppc64le "-mcpu=power10 -O3 -flto") endif() endif() diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc index 9030ac7a4..c320dae71 100644 --- a/src/cpu/cpu_info.cc +++ b/src/cpu/cpu_info.cc @@ -58,4 +58,20 @@ namespace ctranslate2 { } } +#elif defined(CT2_PPC64LE_BUILD) + +namespace ctranslate2 { + namespace cpu { + + const char* cpu_vendor() { + return "POWER"; + } + + bool cpu_supports_power10() { + return true; + } + + } +} + #endif diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h index c2951bcc0..0c696805a 100644 --- a/src/cpu/cpu_info.h +++ b/src/cpu/cpu_info.h @@ -14,6 +14,8 @@ namespace ctranslate2 { bool 
cpu_supports_avx512(); #elif defined(CT2_ARM64_BUILD) bool cpu_supports_neon(); +#elif defined(CT2_PPC64LE_BUILD) + bool cpu_supports_power10(); #endif } diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc index c16aeda22..e616b3c34 100644 --- a/src/cpu/cpu_isa.cc +++ b/src/cpu/cpu_isa.cc @@ -35,7 +35,11 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) case CpuIsa::NEON: return "NEON"; +#elif defined(CT2_PPC64_BUILD) + case CpuIsa::POWER10: + return "POWER10"; #endif + default: return "GENERIC"; } @@ -54,6 +58,9 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) if (env_isa == "NEON") return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon()); +#elif defined(CT2_PPC64_BUILD) + if (env_isa == "POWER10") + return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10()); #endif if (env_isa == "GENERIC") return CpuIsa::GENERIC; @@ -71,6 +78,9 @@ namespace ctranslate2 { # elif defined(CT2_ARM64_BUILD) if (cpu_supports_neon()) return CpuIsa::NEON; +# elif defined(CT2_PPC64_BUILD) + if (cpu_supports_power10()) + return CpuIsa::POWER10; # endif #endif diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index 4f42bdf26..45c0f5815 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -13,6 +13,8 @@ namespace ctranslate2 { AVX512, #elif defined(CT2_ARM64_BUILD) NEON, +#elif defined(CT2_PPC64LE_BUILD) + POWER10, #endif }; @@ -54,7 +56,12 @@ namespace ctranslate2 { CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \ CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } -#endif +#elif defined(CT2_PPC64_BUILD) +# define CPU_ISA_DISPATCH(STMTS) \ + switch (cpu::get_cpu_isa()) { \ + CPU_ISA_CASE(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \ + #CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + } #elif defined(__AVX512F__) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ @@ -81,3 +88,4 @@ namespace ctranslate2 { CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } #endif +#endif diff --git a/src/cpu/kernels.cc 
b/src/cpu/kernels.cc index 3edc8e19b..e82cd34aa 100644 --- a/src/cpu/kernels.cc +++ b/src/cpu/kernels.cc @@ -14,6 +14,9 @@ #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON) # define TARGET_ISA CpuIsa::NEON # include "cpu/vec_neon.h" +#elif defined(CT2_PPC64_BUILD) +# define TARGET_ISA CpuIsa::POWER10 +# include "cpu/vec_power10.h" #else # define TARGET_ISA CpuIsa::GENERIC # include "cpu/vec.h" From 57ce2402814c98a8a598d4e05be0ee189ea65179 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 15 Nov 2023 06:04:04 +0100 Subject: [PATCH 02/10] Initial compile for Power10 --- src/cpu/vec_power10.h | 231 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 src/cpu/vec_power10.h diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h new file mode 100644 index 000000000..1a4bb859a --- /dev/null +++ b/src/cpu/vec_power10.h @@ -0,0 +1,231 @@ +#pragma once + +#include +#include +#include + +#include "ctranslate2/types.h" + +namespace ctranslate2 { + namespace cpu { + + // Interface for vectorized types. 
+ template + struct Vec { + + using value_type = T; + using mask_type = bool; + static constexpr dim_t width = 1; + + static inline value_type load(T value) { + return value; + } + + static inline value_type load(const T* ptr) { + return *ptr; + } + + static inline value_type load(const T* ptr, dim_t count, T default_value = T(0)) { + (void)count; + (void)default_value; + return *ptr; + } + + static inline value_type load_and_convert(const int32_t* ptr) { + return *ptr; + } + + static inline value_type load_and_convert(const int32_t* ptr, + dim_t count, + int32_t default_value = 0) { + (void)count; + (void)default_value; + return *ptr; + } + + static inline void store(value_type value, T* ptr) { + *ptr = value; + } + + static inline void store(value_type value, T* ptr, dim_t count) { + (void)count; + *ptr = value; + } + + static inline value_type bit_and(value_type a, value_type b) { + return a & b; + } + + static inline value_type bit_xor(value_type a, value_type b) { + return a ^ b; + } + + static inline mask_type lt(value_type a, value_type b) { + return a < b; + } + + static inline value_type select(mask_type mask, value_type a, value_type b) { + return mask ? 
a : b; + } + + static inline value_type abs(value_type a) { + return static_cast(std::abs(a)); + } + + static inline value_type neg(value_type a) { + return -a; + } + + static inline value_type rcp(value_type a) { + return static_cast(1) / a; + } + + static inline value_type exp(value_type a) { + return std::exp(a); + } + + static inline value_type log(value_type a) { + return std::log(a); + } + + static inline value_type sin(value_type a) { + return std::sin(a); + } + + static inline value_type cos(value_type a) { + return std::cos(a); + } + + static inline value_type tanh(value_type a) { + return std::tanh(a); + } + + static inline value_type erf(value_type a) { + return std::erf(a); + } + + static inline value_type max(value_type a, value_type b) { + return std::max(a, b); + } + + static inline value_type min(value_type a, value_type b) { + return std::min(a, b); + } + + static inline value_type add(value_type a, value_type b) { + return a + b; + } + + static inline value_type sub(value_type a, value_type b) { + return a - b; + } + + static inline value_type mul(value_type a, value_type b) { + return a * b; + } + + static inline value_type div(value_type a, value_type b) { + return a / b; + } + + static inline value_type mul_add(value_type a, value_type b, value_type c) { + return a * b + c; + } + + static inline T reduce_add(value_type a) { + return a; + } + + static inline T reduce_max(value_type a) { + return a; + } + + }; + + template + using vec_type = typename Vec::value_type; + + template + vec_type vec_tanh(vec_type a) { + using VecType = Vec; + + // Implementation ported from Eigen: + // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 + + const auto plus_clamp = VecType::load(7.90531110763549805f); + const auto minus_clamp = VecType::load(-7.90531110763549805f); + const auto tiny = VecType::load(0.0004f); + const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); + const auto tiny_mask = 
VecType::lt(VecType::abs(a), tiny); + + const auto alpha_1 = VecType::load(4.89352455891786e-03f); + const auto alpha_3 = VecType::load(6.37261928875436e-04f); + const auto alpha_5 = VecType::load(1.48572235717979e-05f); + const auto alpha_7 = VecType::load(5.12229709037114e-08f); + const auto alpha_9 = VecType::load(-8.60467152213735e-11f); + const auto alpha_11 = VecType::load(2.00018790482477e-13f); + const auto alpha_13 = VecType::load(-2.76076847742355e-16f); + + const auto beta_0 = VecType::load(4.89352518554385e-03f); + const auto beta_2 = VecType::load(2.26843463243900e-03f); + const auto beta_4 = VecType::load(1.18534705686654e-04f); + const auto beta_6 = VecType::load(1.19825839466702e-06f); + + const auto x2 = VecType::mul(x, x); + + auto p = VecType::mul_add(x2, alpha_13, alpha_11); + p = VecType::mul_add(x2, p, alpha_9); + p = VecType::mul_add(x2, p, alpha_7); + p = VecType::mul_add(x2, p, alpha_5); + p = VecType::mul_add(x2, p, alpha_3); + p = VecType::mul_add(x2, p, alpha_1); + p = VecType::mul(x, p); + + auto q = VecType::mul_add(x2, beta_6, beta_4); + q = VecType::mul_add(x2, q, beta_2); + q = VecType::mul_add(x2, q, beta_0); + + return VecType::select(tiny_mask, x, VecType::div(p, q)); + } + + template + vec_type vec_erf(vec_type a) { + using VecType = Vec; + + // Implementation ported from PyTorch: + // https://github.com/pytorch/pytorch/blob/e9bc82f54b9867cc82b0e94dcdc90f9d156277bd/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L158-L189 + + // constants + const auto neg_zero_vec = VecType::load(-0.f); + const auto one_vec = VecType::load(1.0f); + const auto p = VecType::load(0.3275911f); + const auto p1 = VecType::load(0.254829592f); + const auto p2 = VecType::load(-0.284496736f); + const auto p3 = VecType::load(1.421413741f); + const auto p4 = VecType::load(-1.453152027f); + const auto p5 = VecType::load(1.061405429f); + // sign(x) + auto sign_mask = VecType::bit_and(neg_zero_vec, a); + auto abs_vec = VecType::bit_xor(sign_mask, a); + // t = 1 / 
(p * abs(x) + 1) + auto tmp0 = VecType::mul_add(p, abs_vec, one_vec); + auto t = VecType::div(one_vec, tmp0); + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = VecType::mul_add(p5, t, p4); + auto tmp2 = VecType::mul_add(tmp1, t, p3); + auto tmp3 = VecType::mul_add(tmp2, t, p2); + auto r = VecType::mul_add(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = VecType::mul(a, a); + auto neg_pow_2 = VecType::bit_xor(neg_zero_vec, pow_2); + // auto tmp4 = exp(neg_pow_2); + auto tmp4 = VecType::exp(neg_pow_2); + auto tmp5 = VecType::bit_xor(neg_zero_vec, tmp4); + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = VecType::mul(tmp5, t); + auto tmp7 = VecType::mul_add(tmp6, r, one_vec); + return VecType::bit_xor(sign_mask, tmp7); + } + + } +} From 05950a73be6cae8d13219d3cce28b4c8117168e1 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Sun, 19 Nov 2023 08:10:55 +0100 Subject: [PATCH 03/10] sync interim work --- src/cpu/cpu_isa.cc | 6 +- src/cpu/cpu_isa.h | 14 +- src/cpu/kernels.cc | 11 +- src/cpu/vec_power10.h | 352 ++++++++++++++++++++++++++---------------- src/utils.cc | 4 + 5 files changed, 244 insertions(+), 143 deletions(-) diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc index e616b3c34..c84c2a669 100644 --- a/src/cpu/cpu_isa.cc +++ b/src/cpu/cpu_isa.cc @@ -35,7 +35,7 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) case CpuIsa::NEON: return "NEON"; -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) case CpuIsa::POWER10: return "POWER10"; #endif @@ -58,7 +58,7 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) if (env_isa == "NEON") return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon()); -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) if (env_isa == "POWER10") return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10()); #endif @@ -78,7 +78,7 @@ namespace ctranslate2 { # elif defined(CT2_ARM64_BUILD) if (cpu_supports_neon()) return CpuIsa::NEON; -# elif defined(CT2_PPC64_BUILD) 
+# elif defined(CT2_PPC64LE_BUILD) if (cpu_supports_power10()) return CpuIsa::POWER10; # endif diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index 45c0f5815..eedfa3bd4 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -6,15 +6,15 @@ namespace ctranslate2 { namespace cpu { enum class CpuIsa { - GENERIC, + GENERIC,POWER10, #if defined(CT2_X86_BUILD) AVX, AVX2, AVX512, #elif defined(CT2_ARM64_BUILD) NEON, -#elif defined(CT2_PPC64LE_BUILD) - POWER10, + /*#elif defined(CT2_PPC64LE_BUILD) + POWER10,*/ #endif }; @@ -56,12 +56,13 @@ namespace ctranslate2 { CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \ CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ - CPU_ISA_CASE(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \ - #CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_CASE(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } +#endif #elif defined(__AVX512F__) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ @@ -88,4 +89,3 @@ namespace ctranslate2 { CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } #endif -#endif diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc index e82cd34aa..239394b6a 100644 --- a/src/cpu/kernels.cc +++ b/src/cpu/kernels.cc @@ -1,10 +1,13 @@ #include "cpu/kernels.h" - +//#include "cpu/cpu_isa.h" #include #if defined(__AVX512F__) # define TARGET_ISA CpuIsa::AVX512 # include "cpu/vec_avx512.h" +#elif defined(CT2_PPC64LE_BUILD) +# define TARGET_ISA CpuIsa::POWER10 +# include "cpu/vec_power10.h" #elif defined(__AVX2__) # define TARGET_ISA CpuIsa::AVX2 # include "cpu/vec_avx.h" @@ -14,9 +17,9 @@ #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON) # define TARGET_ISA CpuIsa::NEON # include "cpu/vec_neon.h" -#elif defined(CT2_PPC64_BUILD) -# define TARGET_ISA CpuIsa::POWER10 
-# include "cpu/vec_power10.h" +//#elif defined(CT2_PPC64LE_BUILD) +//# define TARGET_ISA CpuIsa::GENERIC +//# include "cpu/vec_power10.h" #else # define TARGET_ISA CpuIsa::GENERIC # include "cpu/vec.h" diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index 1a4bb859a..ea285f58f 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -1,38 +1,52 @@ #pragma once +//#include +//#include #include #include #include -#include "ctranslate2/types.h" +#include + +#include "vec.h" + +#if defined(__GNUC__) || defined(__clang__) +# define __ct2_align16__ __attribute__((aligned(16))) +#else +# define __ct2_align16__ +#endif namespace ctranslate2 { namespace cpu { - // Interface for vectorized types. - template - struct Vec { + template<> + struct Vec { - using value_type = T; - using mask_type = bool; - static constexpr dim_t width = 1; + using value_type = __vector float; + using mask_type = __vector bool int; + static constexpr dim_t width = 4; + //using value_type = float; + //using mask_type = uint; + //static constexpr dim_t width = 4; - static inline value_type load(T value) { - return value; + static inline value_type load(float value) { + return vec_lde(0,&value); } - static inline value_type load(const T* ptr) { - return *ptr; + static inline value_type load(const float* ptr) { + return vec_ld(0,ptr); } - static inline value_type load(const T* ptr, dim_t count, T default_value = T(0)) { + static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { (void)count; (void)default_value; - return *ptr; + //KESKEN + return vec_ld(0,ptr); } static inline value_type load_and_convert(const int32_t* ptr) { - return *ptr; + + return vec_ctf(vec_lde(0,ptr),0); } static inline value_type load_and_convert(const int32_t* ptr, @@ -40,192 +54,272 @@ namespace ctranslate2 { int32_t default_value = 0) { (void)count; (void)default_value; - return *ptr; + //KESKEN + return vec_ctf(vec_lde(0,ptr),0); } - static inline void 
store(value_type value, T* ptr) { - *ptr = value; + static inline void store(value_type value, float* ptr) { + //*ptr = value; + vec_ste(value,0,ptr); } - static inline void store(value_type value, T* ptr, dim_t count) { + static inline void store(value_type value, float* ptr, dim_t count) { (void)count; - *ptr = value; + //*ptr = value; + //KESKEN + vec_ste(value,0,ptr); } - static inline value_type bit_and(value_type a, value_type b) { - return a & b; + static inline value_type bit_and(value_type a, value_type b) { + //return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + //return a & b; + //__vector value_type va = *(__vector value_type *)(&a); + //__vector value_type vb = *(__vector value_type *)(&b); + //__vector value_type va=a; + //__vector value_type vb=b; + return vec_and(a,b); } static inline value_type bit_xor(value_type a, value_type b) { - return a ^ b; + return vec_xor(a,b); } static inline mask_type lt(value_type a, value_type b) { - return a < b; + return vec_cmplt(a,b); } static inline value_type select(mask_type mask, value_type a, value_type b) { + //KESKEN!! return mask ? 
a : b; } static inline value_type abs(value_type a) { - return static_cast(std::abs(a)); + return vec_abs(a); } static inline value_type neg(value_type a) { - return -a; + return vec_neg(a); } static inline value_type rcp(value_type a) { - return static_cast(1) / a; + return vec_re(a); } static inline value_type exp(value_type a) { - return std::exp(a); + //KESKEN + return exp(a); } static inline value_type log(value_type a) { - return std::log(a); + return vec_loge(a); } - static inline value_type sin(value_type a) { - return std::sin(a); + value_type out; + for (int i=0;i<4;i+=1) { + out=vec_insert(std::sin(vec_extract(a,i)),a,i); + } + return out; } static inline value_type cos(value_type a) { - return std::cos(a); + return cos(a); } static inline value_type tanh(value_type a) { - return std::tanh(a); + return tanh(a); } static inline value_type erf(value_type a) { - return std::erf(a); + return erf(a); } static inline value_type max(value_type a, value_type b) { - return std::max(a, b); + return vec_max(a, b); } static inline value_type min(value_type a, value_type b) { - return std::min(a, b); + return vec_min(a, b); } static inline value_type add(value_type a, value_type b) { - return a + b; + return vec_add(a,b); } static inline value_type sub(value_type a, value_type b) { - return a - b; + return vec_sub(a,b); } static inline value_type mul(value_type a, value_type b) { - return a * b; + return vec_mul(a,b); } static inline value_type div(value_type a, value_type b) { - return a / b; + return vec_div(a,b); } static inline value_type mul_add(value_type a, value_type b, value_type c) { - return a * b + c; + //return a * b + c; + return vec_madd(a,b,c); } - static inline T reduce_add(value_type a) { - return a; + static inline float reduce_add(value_type a) { + float f=0; + for (int i=0;i<4;i+=1) { + f+=a[i]; + } + return f; } - static inline T reduce_max(value_type a) { - return a; + static inline float reduce_max(value_type a) { + float max=0; + for (int 
i=0;i<4;i+=1) { + if (a[i]>max) { + max=a[i]; + } + } + return max; + } + + +/* static inline value_type load(float value) { + return vdupq_n_f32(value); } - }; + static inline value_type load(const float* ptr) { + return vld1q_f32(ptr); + } + + static inline value_type load(const float* ptr, dim_t count, float default_value = 0) { + if (count == width) { + return vld1q_f32(ptr); + } else { + __ct2_align16__ float tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return vld1q_f32(tmp_values); + } + } + + static inline value_type load_and_convert(const int32_t* ptr) { + return vcvtq_f32_s32(vld1q_s32(ptr)); + } + + static inline value_type load_and_convert(const int32_t* ptr, + dim_t count, + int32_t default_value = 0) { + if (count == width) { + return load_and_convert(ptr); + } else { + __ct2_align16__ int32_t tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load_and_convert(tmp_values); + } + } - template - using vec_type = typename Vec::value_type; - - template - vec_type vec_tanh(vec_type a) { - using VecType = Vec; - - // Implementation ported from Eigen: - // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 - - const auto plus_clamp = VecType::load(7.90531110763549805f); - const auto minus_clamp = VecType::load(-7.90531110763549805f); - const auto tiny = VecType::load(0.0004f); - const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); - const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); - - const auto alpha_1 = VecType::load(4.89352455891786e-03f); - const auto alpha_3 = VecType::load(6.37261928875436e-04f); - const auto alpha_5 = VecType::load(1.48572235717979e-05f); - const auto alpha_7 = VecType::load(5.12229709037114e-08f); - const auto alpha_9 = VecType::load(-8.60467152213735e-11f); - const auto alpha_11 = 
VecType::load(2.00018790482477e-13f); - const auto alpha_13 = VecType::load(-2.76076847742355e-16f); - - const auto beta_0 = VecType::load(4.89352518554385e-03f); - const auto beta_2 = VecType::load(2.26843463243900e-03f); - const auto beta_4 = VecType::load(1.18534705686654e-04f); - const auto beta_6 = VecType::load(1.19825839466702e-06f); - - const auto x2 = VecType::mul(x, x); - - auto p = VecType::mul_add(x2, alpha_13, alpha_11); - p = VecType::mul_add(x2, p, alpha_9); - p = VecType::mul_add(x2, p, alpha_7); - p = VecType::mul_add(x2, p, alpha_5); - p = VecType::mul_add(x2, p, alpha_3); - p = VecType::mul_add(x2, p, alpha_1); - p = VecType::mul(x, p); - - auto q = VecType::mul_add(x2, beta_6, beta_4); - q = VecType::mul_add(x2, q, beta_2); - q = VecType::mul_add(x2, q, beta_0); - - return VecType::select(tiny_mask, x, VecType::div(p, q)); - } - - template - vec_type vec_erf(vec_type a) { - using VecType = Vec; - - // Implementation ported from PyTorch: - // https://github.com/pytorch/pytorch/blob/e9bc82f54b9867cc82b0e94dcdc90f9d156277bd/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L158-L189 - - // constants - const auto neg_zero_vec = VecType::load(-0.f); - const auto one_vec = VecType::load(1.0f); - const auto p = VecType::load(0.3275911f); - const auto p1 = VecType::load(0.254829592f); - const auto p2 = VecType::load(-0.284496736f); - const auto p3 = VecType::load(1.421413741f); - const auto p4 = VecType::load(-1.453152027f); - const auto p5 = VecType::load(1.061405429f); - // sign(x) - auto sign_mask = VecType::bit_and(neg_zero_vec, a); - auto abs_vec = VecType::bit_xor(sign_mask, a); - // t = 1 / (p * abs(x) + 1) - auto tmp0 = VecType::mul_add(p, abs_vec, one_vec); - auto t = VecType::div(one_vec, tmp0); - // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 - auto tmp1 = VecType::mul_add(p5, t, p4); - auto tmp2 = VecType::mul_add(tmp1, t, p3); - auto tmp3 = VecType::mul_add(tmp2, t, p2); - auto r = VecType::mul_add(tmp3, t, p1); - // - exp(- x * x) - 
auto pow_2 = VecType::mul(a, a); - auto neg_pow_2 = VecType::bit_xor(neg_zero_vec, pow_2); - // auto tmp4 = exp(neg_pow_2); - auto tmp4 = VecType::exp(neg_pow_2); - auto tmp5 = VecType::bit_xor(neg_zero_vec, tmp4); - // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) - auto tmp6 = VecType::mul(tmp5, t); - auto tmp7 = VecType::mul_add(tmp6, r, one_vec); - return VecType::bit_xor(sign_mask, tmp7); - } + static inline void store(value_type value, float* ptr) { + vst1q_f32(ptr, value); + } + + static inline void store(value_type value, float* ptr, dim_t count) { + if (count == width) { + vst1q_f32(ptr, value); + } else { + __ct2_align16__ float tmp_values[width]; + vst1q_f32(tmp_values, value); + std::copy(tmp_values, tmp_values + count, ptr); + } + } + + static inline value_type bit_and(value_type a, value_type b) { + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + } + + static inline value_type bit_xor(value_type a, value_type b) { + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + } + + static inline mask_type lt(value_type a, value_type b) { + return vcltq_f32(a, b); + } + + static inline value_type select(mask_type mask, value_type a, value_type b) { + return vbslq_f32(mask, a, b); + } + + static inline value_type abs(value_type a) { + return vabsq_f32(a); + } + + static inline value_type neg(value_type a) { + return vnegq_f32(a); + } + + static inline value_type rcp(value_type a) { + return vrecpeq_f32(a); + } + + static inline value_type exp(value_type a) { + return exp_ps(a); + } + + static inline value_type log(value_type a) { + return log_ps(a); + } + + static inline value_type sin(value_type a) { + return sin_ps(a); + } + + static inline value_type cos(value_type a) { + return cos_ps(a); + } + + static inline value_type tanh(value_type a) { + return vec_tanh(a); + } + + static inline value_type erf(value_type a) { + return vec_erf(a); + } + + static inline value_type 
max(value_type a, value_type b) { + return vmaxq_f32(a, b); + } + + static inline value_type min(value_type a, value_type b) { + return vminq_f32(a, b); + } + + static inline value_type add(value_type a, value_type b) { + return vaddq_f32(a, b); + } + + static inline value_type sub(value_type a, value_type b) { + return vsubq_f32(a, b); + } + + static inline value_type mul(value_type a, value_type b) { + return vmulq_f32(a, b); + } + + static inline value_type div(value_type a, value_type b) { + return vdivq_f32(a, b); + } + + static inline value_type mul_add(value_type a, value_type b, value_type c) { + return vfmaq_f32(c, a, b); + } + + static inline float reduce_add(value_type a) { + return vaddvq_f32(a); + } + + static inline float reduce_max(value_type a) { + return vmaxvq_f32(a); + } +*/ + }; } } diff --git a/src/utils.cc b/src/utils.cc index f0eb29509..de031d963 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -42,6 +42,10 @@ namespace ctranslate2 { spdlog::info("CPU: {} (NEON={})", cpu::cpu_vendor(), cpu::cpu_supports_neon()); +#elif defined(CT2_PPC64LE_BUILD) + spdlog::info("CPU: {} (NEON={})", + cpu::cpu_vendor(), + cpu::cpu_supports_power10()); #endif spdlog::info(" - Selected ISA: {}", cpu::isa_to_str(cpu::get_cpu_isa())); spdlog::info(" - Use Intel MKL: {}", cpu::mayiuse_mkl()); From 70b050924bac9bf00ccf013cbf7100a8620defbe Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 29 Nov 2023 11:00:33 +0100 Subject: [PATCH 04/10] Power10 initial build --- src/cpu/cpu_isa.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index eedfa3bd4..b32379c7b 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -59,8 +59,7 @@ namespace ctranslate2 { #elif defined(CT2_PPC64LE_BUILD) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ - CPU_ISA_CASE(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ - CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_DEFAULT(cpu::CpuIsa::POWER10, 
SINGLE_ARG(STMTS)) \ } #endif #elif defined(__AVX512F__) From f368229072475433dc8478c20be935648edeeafa Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 29 Nov 2023 11:01:07 +0100 Subject: [PATCH 05/10] Initial Power10 build --- src/cpu/vec_power10.h | 361 ++++++++++++++++++++---------------------- 1 file changed, 168 insertions(+), 193 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index ea285f58f..a32a6451d 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -1,13 +1,17 @@ #pragma once -//#include -//#include + #include #include #include - +#include #include +#include + + +#include + #include "vec.h" #if defined(__GNUC__) || defined(__clang__) @@ -19,64 +23,88 @@ namespace ctranslate2 { namespace cpu { + #define ALIGNMENT_VALUE 16u + template<> struct Vec { - using value_type = __vector float; - using mask_type = __vector bool int; + using value_type = __ct2_align16__ __vector float; + using mask_type = __ct2_align16__ __vector bool int; static constexpr dim_t width = 4; - //using value_type = float; - //using mask_type = uint; - //static constexpr dim_t width = 4; + + static inline value_type unaligned_load(const float* ptr){ + return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + } + static inline value_type load(float value) { - return vec_lde(0,&value); + return vec_splats(value); } static inline value_type load(const float* ptr) { - return vec_ld(0,ptr); + return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + } static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { - (void)count; - (void)default_value; - //KESKEN - return vec_ld(0,ptr); + if (count == width) { + return load(ptr); + } else { + __ct2_align16__ float tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load(tmp_values); + } } static inline value_type load_and_convert(const int32_t* ptr) { - return 
vec_ctf(vec_lde(0,ptr),0); + return vec_ctf(vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)),0); } static inline value_type load_and_convert(const int32_t* ptr, dim_t count, int32_t default_value = 0) { - (void)count; - (void)default_value; - //KESKEN - return vec_ctf(vec_lde(0,ptr),0); + if (count == width) { + return load_and_convert(ptr); + } else { + __ct2_align16__ int32_t tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + for (int i=0;imax) { max=a[i]; } } - return max; - } - - -/* static inline value_type load(float value) { - return vdupq_n_f32(value); - } - - static inline value_type load(const float* ptr) { - return vld1q_f32(ptr); - } - - static inline value_type load(const float* ptr, dim_t count, float default_value = 0) { - if (count == width) { - return vld1q_f32(ptr); - } else { - __ct2_align16__ float tmp_values[width]; - std::fill(tmp_values, tmp_values + width, default_value); - std::copy(ptr, ptr + count, tmp_values); - return vld1q_f32(tmp_values); - } - } - - static inline value_type load_and_convert(const int32_t* ptr) { - return vcvtq_f32_s32(vld1q_s32(ptr)); - } - - static inline value_type load_and_convert(const int32_t* ptr, - dim_t count, - int32_t default_value = 0) { - if (count == width) { - return load_and_convert(ptr); - } else { - __ct2_align16__ int32_t tmp_values[width]; - std::fill(tmp_values, tmp_values + width, default_value); - std::copy(ptr, ptr + count, tmp_values); - return load_and_convert(tmp_values); - } - } - - static inline void store(value_type value, float* ptr) { - vst1q_f32(ptr, value); - } - - static inline void store(value_type value, float* ptr, dim_t count) { - if (count == width) { - vst1q_f32(ptr, value); - } else { - __ct2_align16__ float tmp_values[width]; - vst1q_f32(tmp_values, value); - std::copy(tmp_values, tmp_values + count, ptr); - } - } - - static inline value_type bit_and(value_type a, value_type b) { - return 
vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); - } - - static inline value_type bit_xor(value_type a, value_type b) { - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); - } - - static inline mask_type lt(value_type a, value_type b) { - return vcltq_f32(a, b); - } - - static inline value_type select(mask_type mask, value_type a, value_type b) { - return vbslq_f32(mask, a, b); - } - - static inline value_type abs(value_type a) { - return vabsq_f32(a); - } - - static inline value_type neg(value_type a) { - return vnegq_f32(a); - } - - static inline value_type rcp(value_type a) { - return vrecpeq_f32(a); - } - - static inline value_type exp(value_type a) { - return exp_ps(a); - } - - static inline value_type log(value_type a) { - return log_ps(a); - } - - static inline value_type sin(value_type a) { - return sin_ps(a); - } - - static inline value_type cos(value_type a) { - return cos_ps(a); - } - - static inline value_type tanh(value_type a) { - return vec_tanh(a); - } - - static inline value_type erf(value_type a) { - return vec_erf(a); - } - - static inline value_type max(value_type a, value_type b) { - return vmaxq_f32(a, b); - } - - static inline value_type min(value_type a, value_type b) { - return vminq_f32(a, b); - } - - static inline value_type add(value_type a, value_type b) { - return vaddq_f32(a, b); - } - - static inline value_type sub(value_type a, value_type b) { - return vsubq_f32(a, b); - } - - static inline value_type mul(value_type a, value_type b) { - return vmulq_f32(a, b); - } - - static inline value_type div(value_type a, value_type b) { - return vdivq_f32(a, b); - } - - static inline value_type mul_add(value_type a, value_type b, value_type c) { - return vfmaq_f32(c, a, b); - } - - static inline float reduce_add(value_type a) { - return vaddvq_f32(a); - } - - static inline float reduce_max(value_type a) { - return vmaxvq_f32(a); - } -*/ + //std::cout << "reduce_max 
"; + return max;*/ + float t0 = a[0] > a[1] ? a[0] : a[1]; + float t1 = a[2] > a[3] ? a[2] : a[3]; + return t0 > t1 ? t0 : t1; + } + static inline void output_vec(value_type v) + { + for(int a=0;a<4;a+=1) + std::cout<< " "< + static inline value_type vec_tanh(value_type a) { + using VecType = Vec; + + // Implementation ported from Eigen: + // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 + //std::cout << "Starting Power10::vec_tanh\n"; + const auto plus_clamp = VecType::load(7.90531110763549805f); + //std::cout << " plus_clamp:"; + //VecType::output_vec(plus_clamp); + const auto minus_clamp = VecType::load(-7.90531110763549805f); + //std::cout << " minus_clamp:"; + //VecType::output_vec(minus_clamp); + const auto tiny = VecType::load(0.0004f); + //std::cout << "tiny:"; + //VecType::output_vec(tiny); + const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); + //std::cout << "x:"; + //VecType::output_vec(x); + const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); + //std::cout << "tiny_mask:"; + //VecType::output_vec_mask(tiny_mask); + + + const auto alpha_1 = VecType::load(4.89352455891786e-03f); + const auto alpha_3 = VecType::load(6.37261928875436e-04f); + const auto alpha_5 = VecType::load(1.48572235717979e-05f); + const auto alpha_7 = VecType::load(5.12229709037114e-08f); + const auto alpha_9 = VecType::load(-8.60467152213735e-11f); + const auto alpha_11 = VecType::load(2.00018790482477e-13f); + const auto alpha_13 = VecType::load(-2.76076847742355e-16f); + + const auto beta_0 = VecType::load(4.89352518554385e-03f); + const auto beta_2 = VecType::load(2.26843463243900e-03f); + const auto beta_4 = VecType::load(1.18534705686654e-04f); + const auto beta_6 = VecType::load(1.19825839466702e-06f); + + const auto x2 = VecType::mul(x, x); + //std::cout << "x2:"; + //output_vec(x2); + + + auto p = VecType::mul_add(x2, alpha_13, alpha_11); + //std::cout << "p:"; + //output_vec(p); + + p = 
VecType::mul_add(x2, p, alpha_9); + p = VecType::mul_add(x2, p, alpha_7); + p = VecType::mul_add(x2, p, alpha_5); + p = VecType::mul_add(x2, p, alpha_3); + p = VecType::mul_add(x2, p, alpha_1); + p = VecType::mul(x, p); + + auto q = VecType::mul_add(x2, beta_6, beta_4); + q = VecType::mul_add(x2, q, beta_2); + q = VecType::mul_add(x2, q, beta_0); + + return VecType::select(tiny_mask, x, VecType::div(p, q)); + } }; - } } From 7e1f287e5b09cee025c8c19521b3a00696ffb452 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Fri, 1 Dec 2023 05:17:36 +0100 Subject: [PATCH 06/10] Initial Power10 build --- src/cpu/vec_power10.h | 135 ++++++++++++------------------------------ 1 file changed, 38 insertions(+), 97 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index a32a6451d..98264126f 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -7,10 +7,7 @@ #include #include -#include - - -#include +#include #include "vec.h" @@ -133,31 +130,31 @@ namespace ctranslate2 { } static inline value_type exp(value_type a) { - return Sleef_expf4_u10vsx(a); + return Sleef_expf4_u10vsx3(a); } static inline value_type log(value_type a) { - return Sleef_logf4_u35vsx(a); + return Sleef_logf4_u35vsx3(a); } static inline value_type sin(value_type a) { - return Sleef_sinf4_u35vsx(a); + return Sleef_sinf4_u35vsx3(a); } static inline value_type cos(value_type a) { - return Sleef_cosf4_u35vsx(a); + return Sleef_cosf4_u35vsx3(a); } static inline value_type tanh(value_type a) { - return Sleef_tanhf4_u35vsx(a); + return Sleef_tanhf4_u35vsx3(a); } static inline value_type erf(value_type a) { - return Sleef_erff4_u10vsx(a); + return Sleef_erff4_u10vsx3(a); } static inline value_type max(value_type a, value_type b) { @@ -190,24 +187,42 @@ namespace ctranslate2 { } static inline float reduce_add(value_type a) { - /*float f=0; - for (int i=0;i> 2) & 0x03; + unsigned long __element_selector_54 = (1 >> 4) & 0x03; + unsigned long __element_selector_76 = (1 >> 6) & 0x03; + static 
const unsigned int __permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __vector unsigned int __t; + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + + __vector unsigned long long v1 = vec_mergel((__vector unsigned long long)a,(__vector unsigned long long)a); + value_type v2 = (value_type)a + (value_type)v1; + value_type v3 = vec_perm (v2, v2,(__vector unsigned char) __t); + return v2[0]+v3[0]; + + /*__m128 t1 = _mm_movehl_ps(a, a); + for (int b=0; b<4;b+=1) + std::cout << "t1["< a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; } - static inline void output_vec(value_type v) - { - for(int a=0;a<4;a+=1) - std::cout<< " "< - static inline value_type vec_tanh(value_type a) { - using VecType = Vec; - - // Implementation ported from Eigen: - // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 - //std::cout << "Starting Power10::vec_tanh\n"; - const auto plus_clamp = VecType::load(7.90531110763549805f); - //std::cout << " plus_clamp:"; - //VecType::output_vec(plus_clamp); - const auto minus_clamp = VecType::load(-7.90531110763549805f); - //std::cout << " minus_clamp:"; - //VecType::output_vec(minus_clamp); - const auto tiny = VecType::load(0.0004f); - //std::cout << "tiny:"; - //VecType::output_vec(tiny); - const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); - //std::cout << "x:"; - //VecType::output_vec(x); - const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); - //std::cout << "tiny_mask:"; - //VecType::output_vec_mask(tiny_mask); - - - const auto alpha_1 = VecType::load(4.89352455891786e-03f); - const auto alpha_3 = VecType::load(6.37261928875436e-04f); - const auto alpha_5 = 
VecType::load(1.48572235717979e-05f); - const auto alpha_7 = VecType::load(5.12229709037114e-08f); - const auto alpha_9 = VecType::load(-8.60467152213735e-11f); - const auto alpha_11 = VecType::load(2.00018790482477e-13f); - const auto alpha_13 = VecType::load(-2.76076847742355e-16f); - - const auto beta_0 = VecType::load(4.89352518554385e-03f); - const auto beta_2 = VecType::load(2.26843463243900e-03f); - const auto beta_4 = VecType::load(1.18534705686654e-04f); - const auto beta_6 = VecType::load(1.19825839466702e-06f); - - const auto x2 = VecType::mul(x, x); - //std::cout << "x2:"; - //output_vec(x2); - - - auto p = VecType::mul_add(x2, alpha_13, alpha_11); - //std::cout << "p:"; - //output_vec(p); - - p = VecType::mul_add(x2, p, alpha_9); - p = VecType::mul_add(x2, p, alpha_7); - p = VecType::mul_add(x2, p, alpha_5); - p = VecType::mul_add(x2, p, alpha_3); - p = VecType::mul_add(x2, p, alpha_1); - p = VecType::mul(x, p); - - auto q = VecType::mul_add(x2, beta_6, beta_4); - q = VecType::mul_add(x2, q, beta_2); - q = VecType::mul_add(x2, q, beta_0); - - return VecType::select(tiny_mask, x, VecType::div(p, q)); - } }; } } From 0b5a0f6fc9ea77ff772ee343043c8c07d1a3bac4 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Fri, 1 Dec 2023 06:33:30 +0100 Subject: [PATCH 07/10] Code cleaning for Power10 port --- src/cpu/vec_power10.h | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index 98264126f..c87242184 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -30,17 +30,16 @@ namespace ctranslate2 { static constexpr dim_t width = 4; static inline value_type unaligned_load(const float* ptr){ - return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; } static inline value_type load(float value) { - return vec_splats(value); + return (value_type){value,value,value,value}; } static inline 
value_type load(const float* ptr) { - return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); - + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; } static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { @@ -55,8 +54,7 @@ namespace ctranslate2 { } static inline value_type load_and_convert(const int32_t* ptr) { - - return vec_ctf(vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)),0); + return vec_ctf((vector signed int){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)},0); } static inline value_type load_and_convert(const int32_t* ptr, @@ -67,15 +65,7 @@ namespace ctranslate2 { } else { __ct2_align16__ int32_t tmp_values[width]; std::fill(tmp_values, tmp_values + width, default_value); - for (int i=0;imax) { - max=a[i]; - } - } - //std::cout << "reduce_max "; - return max;*/ float t0 = a[0] > a[1] ? a[0] : a[1]; float t1 = a[2] > a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; From 95d18e6269d362a31b66e36840a527e6d0c78dd8 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 10:38:19 +0200 Subject: [PATCH 08/10] Docker/Podman support for ppc64le --- docker/Dockerfile.ppc64le | 91 +++++++++++++++++++++++++++ python/setup-ppc64le.py | 126 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 docker/Dockerfile.ppc64le create mode 100644 python/setup-ppc64le.py diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le new file mode 100644 index 000000000..e608d359a --- /dev/null +++ b/docker/Dockerfile.ppc64le @@ -0,0 +1,91 @@ +FROM ppc64le/ubuntu:22.04 as builder + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + wget \ + git \ + build-essential \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +WORKDIR /root + +RUN python3 -m pip --no-cache-dir install cmake==3.22.* + +RUN wget -qO- https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu/dists/jammy/615d762f.gpg.key | tee 
/etc/apt/trusted.gpg.d/615d762f.asc && \ + echo "deb [signed-by=/etc/apt/trusted.gpg.d/615d762f.asc] https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu jammy at17.0" >> /etc/apt/sources.list && \ + cat /etc/apt/sources.list && \ + cat /etc/apt/trusted.gpg.d/615d762f.asc && \ + apt update && \ + cat /etc/apt/sources.list && \ + apt install -y advance-toolchain-at17.0-runtime advance-toolchain-at17.0-devel advance-toolchain-at17.0-perf advance-toolchain-at17.0-mcore-libs + +ENV SLEEF_VERSION=3.6.1 +RUN wget -q https://github.com/shibatch/sleef/archive/refs/tags/${SLEEF_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd sleef* && \ + mkdir build && \ + cd build && \ + cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release .. && \ + cd .. && \ + cmake --build build -j --clean-first && \ + cmake --install build --prefix=/usr/ + + +ENV ONEDNN_VERSION=3.1.1 +RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd oneDNN-* && \ + cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP . && \ + make -j$(nproc) install && \ + cd .. && \ + rm -r oneDNN-* + +COPY third_party third_party +COPY cli cli +COPY include include +COPY src src +COPY cmake cmake +COPY python python +COPY CMakeLists.txt . 
+ +ARG CXX_FLAGS +ENV CXX_FLAGS=${CXX_FLAGS:-"-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off"} + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 + +RUN mkdir build && \ + cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} \ + -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF \ + -DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DCMAKE_BUILD_TYPE=Release \ + .. && \ + VERBOSE=1 make -j$(nproc) install + +ENV LANG=en_US.UTF-8 +COPY README.md . + +RUN cd python && \ + python3 -m pip --no-cache-dir install -r install_requirements.txt && \ + python3 setup-ppc64le.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT + + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CTRANSLATE2_ROOT/lib + +#COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT +RUN pip3 install --force-reinstall ninja + + +RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \ + rm $CTRANSLATE2_ROOT/*.whl + +ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"] diff --git a/python/setup-ppc64le.py b/python/setup-ppc64le.py new file mode 100644 index 000000000..51a21fd43 --- /dev/null +++ b/python/setup-ppc64le.py @@ -0,0 +1,126 @@ +import glob +import os +import sys + +import pybind11 + +from pybind11.setup_helpers import ParallelCompile +from setuptools import Extension, find_packages, setup + +base_dir = os.path.dirname(os.path.abspath(__file__)) +include_dirs = [pybind11.get_include()] +library_dirs = [] + + +def _get_long_description(): + readme_path = os.path.join(base_dir, "README.md") + if not os.path.exists(readme_path): + return "" + with open(readme_path, encoding="utf-8") as readme_file: + return readme_file.read() + + +def _get_project_version(): + version_path = os.path.join(base_dir, "ctranslate2", "version.py") + version = {} + with open(version_path, encoding="utf-8") as fp: + exec(fp.read(), version) + return version["__version__"] + + +def _maybe_add_library_root(lib_name): + if "%s_ROOT" % lib_name in 
os.environ: + root = os.environ["%s_ROOT" % lib_name] + include_dirs.append("%s/include" % root) + for lib_dir in ("lib", "lib64"): + path = "%s/%s" % (root, lib_dir) + if os.path.exists(path): + library_dirs.append(path) + break + + +_maybe_add_library_root("CTRANSLATE2") + +cflags = ["-std=c++17", "-fvisibility=hidden"] +ldflags = [] +package_data = {} +if sys.platform == "darwin": + # std::visit requires macOS 10.14 + cflags.append("-mmacosx-version-min=10.14") + ldflags.append("-Wl,-rpath,/usr/local/lib") +elif sys.platform == "win32": + cflags = ["/std:c++17", "/d2FH4-"] + package_data["ctranslate2"] = ["*.dll"] + +ctranslate2_module = Extension( + "ctranslate2._ext", + sources=glob.glob(os.path.join("cpp", "*.cc")), + extra_compile_args=cflags, + extra_link_args=ldflags, + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=["ctranslate2"], +) + +ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install() + +setup( + name="ctranslate2", + version=_get_project_version(), + license="MIT", + description="Fast inference engine for Transformer models", + long_description=_get_long_description(), + long_description_content_type="text/markdown", + author="OpenNMT", + url="https://opennmt.net", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA :: 11.0", + "Environment :: GPU :: NVIDIA CUDA :: 11.1", + "Environment :: GPU :: NVIDIA CUDA :: 11.2", + "Environment :: GPU :: NVIDIA CUDA :: 11.3", + "Environment :: GPU :: NVIDIA CUDA :: 11.4", + "Environment :: GPU :: NVIDIA CUDA :: 11.5", + "Environment :: GPU :: NVIDIA CUDA :: 11.6", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: 
Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + project_urls={ + "Documentation": "https://opennmt.net/CTranslate2", + "Forum": "https://forum.opennmt.net", + "Gitter": "https://gitter.im/OpenNMT/CTranslate2", + "Source": "https://github.com/OpenNMT/CTranslate2", + }, + keywords="opennmt nmt neural machine translation cuda mkl inference quantization", + packages=find_packages(exclude=["bin"]), + package_data=package_data, + ext_modules=[ctranslate2_module], + python_requires=">=3.8", + install_requires=[ + "setuptools", + "numpy==1.25.2", + "pyyaml>=5.3,<7", + ], + entry_points={ + "console_scripts": [ + "ct2-fairseq-converter=ctranslate2.converters.fairseq:main", + "ct2-marian-converter=ctranslate2.converters.marian:main", + "ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main", + "ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main", + "ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main", + "ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main", + "ct2-transformers-converter=ctranslate2.converters.transformers:main", + ], + }, +) From acc9c85ccd27ba17ccbd3441d22c1fdcc51920c1 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 16:42:15 +0200 Subject: [PATCH 09/10] Build instructions & fixes --- docs/ppc64le.md | 56 ++++++++++++++++++++++++++++++++++++++++++ src/cpu/vec_power10.h | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 docs/ppc64le.md diff --git a/docs/ppc64le.md b/docs/ppc64le.md new file mode 100644 index 000000000..4828aad2b --- /dev/null +++ b/docs/ppc64le.md @@ -0,0 +1,56 @@ +# IBM Power10 -ppc64le + +CTranslate2 fully supports IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator units. 
For optimum performance use at least SMT4, in some cases SMT8 seems to perform better, but it is advisable to try out both. A simple way to test this is to use the --intra_threads parameter to control the number of threads CTranslate2 is executing. At maximum this should be 8*number of physical cores (SMT-8).
+
+Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s compared to an Intel Gold core.
+
+It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance.
+
+OneDNN is used for int8 matrix math that is fully utilizing the MMA units; it should be possible to build with OpenBLAS for 16-bit MMA usage.
+
+## Build docker / podman container
+
+This is the easy way:
+```git clone --recursive https://github.com/OpenNMT/CTranslate2/
+cd CTranslate2/docker
+podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..
+
+```
+
+Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE):
+```podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16```
+
+## Install from sources
+This build has been tested on RHEL 9 / ppc64le and requires IBM Advance Toolchain 17.0 ( https://www.ibm.com/support/pages/advance-toolchain-linux-power )
+```
+#sleef:
+git clone -b 3.6.1 https://github.com/shibatch/sleef
+
+cd sleef
+mkdir build && cd build
+cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release ..
+ +cmake --build build -j --clean-first +sudo cmake --install build --prefix=/usr/ + + +#OneDNN; +git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN +cd oneDNN +mkdir build && cd build +cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP .. +make -j16 +sudo make install + + +git clone --recursive https://github.com/Dagamies/CTranslate2 +cd CTranslate2 +mkdir build +cd build +cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP .. +make -j16 +sudo make install +sudo ldconfig -v +export LD_LIBRARY_PATH=/usr/local/lib64/ + +``` \ No newline at end of file diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index c87242184..de8ec6955 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -206,6 +206,63 @@ namespace ctranslate2 { float t1 = a[2] > a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; } + + static inline value_type round(value_type a) { + return vec_round(a); + } + + template + static inline void convert_and_store(value_type v, U* a, dim_t count) { + *a = v; + } + + static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { + auto i32 = vec_cts(v,0); + + int8_t tmp[4]; + tmp[0]=i32[0]; + tmp[1]=i32[1]; + tmp[2]=i32[2]; + tmp[3]=i32[3]; + std::copy(tmp, tmp + count, a); + + } + + static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { + auto u32 = vec_ctu(v,0); + uint8_t tmp[4]; + tmp[0]=u32[0]; + tmp[1]=u32[1]; + tmp[2]=u32[2]; + tmp[3]=u32[3]; + std::copy(tmp, tmp + count, a); + + + } + + /* static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { + //convert float32x4_t to int32x4_t + auto i32x4 = vcvtq_s32_f32(v); + //then convert to int16x4_t + auto i16x4 = vqmovn_s32(i32x4); + //finally convert to int8x4_t + auto i8x8 = vqmovn_s16(vcombine_s16(i16x4, vdup_n_s16(0))); + int8_t tmp[8]; + vst1_s8(tmp, i8x8); + 
std::copy(tmp, tmp + count, a); + } + + static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { + //convert float32x4_t to uint32x4_t + auto u32x4 = vcvtq_u32_f32(v); + //then convert to uint16x4_t + auto u16x4 = vqmovn_u32(u32x4); + //finally convert to uint8x8_t + auto u8x8 = vqmovn_u16(vcombine_u16(u16x4, vdup_n_u16(0))); + uint8_t tmp[8]; + vst1_u8(tmp, u8x8); + std::copy(tmp, tmp + count, a); + }*/ }; } } From 0651b8c167c7276ebdc72a1daeb95edf5a55fa0a Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 16:57:30 +0200 Subject: [PATCH 10/10] Remove unnecessary code --- src/cpu/vec_power10.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index de8ec6955..c9c71abe3 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -211,11 +211,6 @@ namespace ctranslate2 { return vec_round(a); } - template - static inline void convert_and_store(value_type v, U* a, dim_t count) { - *a = v; - } - static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { auto i32 = vec_cts(v,0); @@ -225,7 +220,6 @@ namespace ctranslate2 { tmp[2]=i32[2]; tmp[3]=i32[3]; std::copy(tmp, tmp + count, a); - } static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { @@ -235,34 +229,8 @@ namespace ctranslate2 { tmp[1]=u32[1]; tmp[2]=u32[2]; tmp[3]=u32[3]; - std::copy(tmp, tmp + count, a); - - + std::copy(tmp, tmp + count, a); } - - /* static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { - //convert float32x4_t to int32x4_t - auto i32x4 = vcvtq_s32_f32(v); - //then convert to int16x4_t - auto i16x4 = vqmovn_s32(i32x4); - //finally convert to int8x4_t - auto i8x8 = vqmovn_s16(vcombine_s16(i16x4, vdup_n_s16(0))); - int8_t tmp[8]; - vst1_s8(tmp, i8x8); - std::copy(tmp, tmp + count, a); - } - - static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { - //convert float32x4_t to 
uint32x4_t - auto u32x4 = vcvtq_u32_f32(v); - //then convert to uint16x4_t - auto u16x4 = vqmovn_u32(u32x4); - //finally convert to uint8x8_t - auto u8x8 = vqmovn_u16(vcombine_u16(u16x4, vdup_n_u16(0))); - uint8_t tmp[8]; - vst1_u8(tmp, u8x8); - std::copy(tmp, tmp + count, a); - }*/ }; } }