From d38a86ce3c7ca92b7833fa091d2a0388fa8b4221 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 15 Nov 2023 06:02:59 +0100 Subject: [PATCH 01/10] Initial compile for Power10 --- CMakeLists.txt | 9 +++++++-- src/cpu/cpu_info.cc | 16 ++++++++++++++++ src/cpu/cpu_info.h | 2 ++ src/cpu/cpu_isa.cc | 10 ++++++++++ src/cpu/cpu_isa.h | 10 +++++++++- src/cpu/kernels.cc | 3 +++ 6 files changed, 47 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce8b3d31f..b2aab087e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -101,7 +101,7 @@ set(SOURCES src/cpu/backend.cc src/cpu/cpu_info.cc src/cpu/cpu_isa.cc - src/cpu/kernels.cc + #src/cpu/kernels.cc src/cpu/parallel.cc src/cpu/primitives.cc src/decoding.cc @@ -214,7 +214,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm64)|(aarch64)" elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(amd64)|(AMD64)") add_definitions(-DCT2_X86_BUILD) set(CT2_BUILD_ARCH "x86_64") - +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(PPC64LE)") + add_definitions(-DCT2_PPC64LE_BUILD) + set(CT2_BUILD_ARCH "ppc64le") + if(BUILD_SHARED_LIBS) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() @@ -241,6 +244,8 @@ if(ENABLE_CPU_DISPATCH) endif() elseif(CT2_BUILD_ARCH STREQUAL "arm64") ct2_compile_kernels_for_isa(neon "-DUSE_NEON") + elseif(CT2_BUILD_ARCH STREQUAL "ppc64le") + ct2_compile_kernels_for_isa(ppc64le "-mcpu=power10 -O3 -flto") endif() endif() diff --git a/src/cpu/cpu_info.cc b/src/cpu/cpu_info.cc index 9030ac7a4..c320dae71 100644 --- a/src/cpu/cpu_info.cc +++ b/src/cpu/cpu_info.cc @@ -58,4 +58,20 @@ namespace ctranslate2 { } } +#elif defined(CT2_PPC64LE_BUILD) + +namespace ctranslate2 { + namespace cpu { + + const char* cpu_vendor() { + return "POWER"; + } + + bool cpu_supports_power10() { + return true; + } + + } +} + #endif diff --git a/src/cpu/cpu_info.h b/src/cpu/cpu_info.h index c2951bcc0..0c696805a 100644 --- a/src/cpu/cpu_info.h +++ b/src/cpu/cpu_info.h @@ -14,6 +14,8 @@ namespace ctranslate2 { bool 
cpu_supports_avx512(); #elif defined(CT2_ARM64_BUILD) bool cpu_supports_neon(); +#elif defined(CT2_PPC64LE_BUILD) + bool cpu_supports_power10(); #endif } diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc index c16aeda22..e616b3c34 100644 --- a/src/cpu/cpu_isa.cc +++ b/src/cpu/cpu_isa.cc @@ -35,7 +35,11 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) case CpuIsa::NEON: return "NEON"; +#elif defined(CT2_PPC64_BUILD) + case CpuIsa::POWER10: + return "POWER10"; #endif + default: return "GENERIC"; } @@ -54,6 +58,9 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) if (env_isa == "NEON") return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon()); +#elif defined(CT2_PPC64_BUILD) + if (env_isa == "POWER10") + return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10()); #endif if (env_isa == "GENERIC") return CpuIsa::GENERIC; @@ -71,6 +78,9 @@ namespace ctranslate2 { # elif defined(CT2_ARM64_BUILD) if (cpu_supports_neon()) return CpuIsa::NEON; +# elif defined(CT2_PPC64_BUILD) + if (cpu_supports_power10()) + return CpuIsa::POWER10; # endif #endif diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index 4f42bdf26..45c0f5815 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -13,6 +13,8 @@ namespace ctranslate2 { AVX512, #elif defined(CT2_ARM64_BUILD) NEON, +#elif defined(CT2_PPC64LE_BUILD) + POWER10, #endif }; @@ -54,7 +56,12 @@ namespace ctranslate2 { CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \ CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } -#endif +#elif defined(CT2_PPC64_BUILD) +# define CPU_ISA_DISPATCH(STMTS) \ + switch (cpu::get_cpu_isa()) { \ + CPU_ISA_CASE(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \ + #CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + } #elif defined(__AVX512F__) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ @@ -81,3 +88,4 @@ namespace ctranslate2 { CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } #endif +#endif diff --git a/src/cpu/kernels.cc 
b/src/cpu/kernels.cc index 3edc8e19b..e82cd34aa 100644 --- a/src/cpu/kernels.cc +++ b/src/cpu/kernels.cc @@ -14,6 +14,9 @@ #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON) # define TARGET_ISA CpuIsa::NEON # include "cpu/vec_neon.h" +#elif defined(CT2_PPC64_BUILD) +# define TARGET_ISA CpuIsa::POWER10 +# include "cpu/vec_power10.h" #else # define TARGET_ISA CpuIsa::GENERIC # include "cpu/vec.h" From 57ce2402814c98a8a598d4e05be0ee189ea65179 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 15 Nov 2023 06:04:04 +0100 Subject: [PATCH 02/10] Initial compile for Power10 --- src/cpu/vec_power10.h | 231 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 src/cpu/vec_power10.h diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h new file mode 100644 index 000000000..1a4bb859a --- /dev/null +++ b/src/cpu/vec_power10.h @@ -0,0 +1,231 @@ +#pragma once + +#include +#include +#include + +#include "ctranslate2/types.h" + +namespace ctranslate2 { + namespace cpu { + + // Interface for vectorized types. 
+ template + struct Vec { + + using value_type = T; + using mask_type = bool; + static constexpr dim_t width = 1; + + static inline value_type load(T value) { + return value; + } + + static inline value_type load(const T* ptr) { + return *ptr; + } + + static inline value_type load(const T* ptr, dim_t count, T default_value = T(0)) { + (void)count; + (void)default_value; + return *ptr; + } + + static inline value_type load_and_convert(const int32_t* ptr) { + return *ptr; + } + + static inline value_type load_and_convert(const int32_t* ptr, + dim_t count, + int32_t default_value = 0) { + (void)count; + (void)default_value; + return *ptr; + } + + static inline void store(value_type value, T* ptr) { + *ptr = value; + } + + static inline void store(value_type value, T* ptr, dim_t count) { + (void)count; + *ptr = value; + } + + static inline value_type bit_and(value_type a, value_type b) { + return a & b; + } + + static inline value_type bit_xor(value_type a, value_type b) { + return a ^ b; + } + + static inline mask_type lt(value_type a, value_type b) { + return a < b; + } + + static inline value_type select(mask_type mask, value_type a, value_type b) { + return mask ? 
a : b; + } + + static inline value_type abs(value_type a) { + return static_cast(std::abs(a)); + } + + static inline value_type neg(value_type a) { + return -a; + } + + static inline value_type rcp(value_type a) { + return static_cast(1) / a; + } + + static inline value_type exp(value_type a) { + return std::exp(a); + } + + static inline value_type log(value_type a) { + return std::log(a); + } + + static inline value_type sin(value_type a) { + return std::sin(a); + } + + static inline value_type cos(value_type a) { + return std::cos(a); + } + + static inline value_type tanh(value_type a) { + return std::tanh(a); + } + + static inline value_type erf(value_type a) { + return std::erf(a); + } + + static inline value_type max(value_type a, value_type b) { + return std::max(a, b); + } + + static inline value_type min(value_type a, value_type b) { + return std::min(a, b); + } + + static inline value_type add(value_type a, value_type b) { + return a + b; + } + + static inline value_type sub(value_type a, value_type b) { + return a - b; + } + + static inline value_type mul(value_type a, value_type b) { + return a * b; + } + + static inline value_type div(value_type a, value_type b) { + return a / b; + } + + static inline value_type mul_add(value_type a, value_type b, value_type c) { + return a * b + c; + } + + static inline T reduce_add(value_type a) { + return a; + } + + static inline T reduce_max(value_type a) { + return a; + } + + }; + + template + using vec_type = typename Vec::value_type; + + template + vec_type vec_tanh(vec_type a) { + using VecType = Vec; + + // Implementation ported from Eigen: + // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 + + const auto plus_clamp = VecType::load(7.90531110763549805f); + const auto minus_clamp = VecType::load(-7.90531110763549805f); + const auto tiny = VecType::load(0.0004f); + const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); + const auto tiny_mask = 
VecType::lt(VecType::abs(a), tiny); + + const auto alpha_1 = VecType::load(4.89352455891786e-03f); + const auto alpha_3 = VecType::load(6.37261928875436e-04f); + const auto alpha_5 = VecType::load(1.48572235717979e-05f); + const auto alpha_7 = VecType::load(5.12229709037114e-08f); + const auto alpha_9 = VecType::load(-8.60467152213735e-11f); + const auto alpha_11 = VecType::load(2.00018790482477e-13f); + const auto alpha_13 = VecType::load(-2.76076847742355e-16f); + + const auto beta_0 = VecType::load(4.89352518554385e-03f); + const auto beta_2 = VecType::load(2.26843463243900e-03f); + const auto beta_4 = VecType::load(1.18534705686654e-04f); + const auto beta_6 = VecType::load(1.19825839466702e-06f); + + const auto x2 = VecType::mul(x, x); + + auto p = VecType::mul_add(x2, alpha_13, alpha_11); + p = VecType::mul_add(x2, p, alpha_9); + p = VecType::mul_add(x2, p, alpha_7); + p = VecType::mul_add(x2, p, alpha_5); + p = VecType::mul_add(x2, p, alpha_3); + p = VecType::mul_add(x2, p, alpha_1); + p = VecType::mul(x, p); + + auto q = VecType::mul_add(x2, beta_6, beta_4); + q = VecType::mul_add(x2, q, beta_2); + q = VecType::mul_add(x2, q, beta_0); + + return VecType::select(tiny_mask, x, VecType::div(p, q)); + } + + template + vec_type vec_erf(vec_type a) { + using VecType = Vec; + + // Implementation ported from PyTorch: + // https://github.com/pytorch/pytorch/blob/e9bc82f54b9867cc82b0e94dcdc90f9d156277bd/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L158-L189 + + // constants + const auto neg_zero_vec = VecType::load(-0.f); + const auto one_vec = VecType::load(1.0f); + const auto p = VecType::load(0.3275911f); + const auto p1 = VecType::load(0.254829592f); + const auto p2 = VecType::load(-0.284496736f); + const auto p3 = VecType::load(1.421413741f); + const auto p4 = VecType::load(-1.453152027f); + const auto p5 = VecType::load(1.061405429f); + // sign(x) + auto sign_mask = VecType::bit_and(neg_zero_vec, a); + auto abs_vec = VecType::bit_xor(sign_mask, a); + // t = 1 / 
(p * abs(x) + 1) + auto tmp0 = VecType::mul_add(p, abs_vec, one_vec); + auto t = VecType::div(one_vec, tmp0); + // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 + auto tmp1 = VecType::mul_add(p5, t, p4); + auto tmp2 = VecType::mul_add(tmp1, t, p3); + auto tmp3 = VecType::mul_add(tmp2, t, p2); + auto r = VecType::mul_add(tmp3, t, p1); + // - exp(- x * x) + auto pow_2 = VecType::mul(a, a); + auto neg_pow_2 = VecType::bit_xor(neg_zero_vec, pow_2); + // auto tmp4 = exp(neg_pow_2); + auto tmp4 = VecType::exp(neg_pow_2); + auto tmp5 = VecType::bit_xor(neg_zero_vec, tmp4); + // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) + auto tmp6 = VecType::mul(tmp5, t); + auto tmp7 = VecType::mul_add(tmp6, r, one_vec); + return VecType::bit_xor(sign_mask, tmp7); + } + + } +} From 05950a73be6cae8d13219d3cce28b4c8117168e1 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Sun, 19 Nov 2023 08:10:55 +0100 Subject: [PATCH 03/10] sync interim work --- src/cpu/cpu_isa.cc | 6 +- src/cpu/cpu_isa.h | 14 +- src/cpu/kernels.cc | 11 +- src/cpu/vec_power10.h | 352 ++++++++++++++++++++++++++---------------- src/utils.cc | 4 + 5 files changed, 244 insertions(+), 143 deletions(-) diff --git a/src/cpu/cpu_isa.cc b/src/cpu/cpu_isa.cc index e616b3c34..c84c2a669 100644 --- a/src/cpu/cpu_isa.cc +++ b/src/cpu/cpu_isa.cc @@ -35,7 +35,7 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) case CpuIsa::NEON: return "NEON"; -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) case CpuIsa::POWER10: return "POWER10"; #endif @@ -58,7 +58,7 @@ namespace ctranslate2 { #elif defined(CT2_ARM64_BUILD) if (env_isa == "NEON") return try_isa(env_isa, CpuIsa::NEON, cpu_supports_neon()); -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) if (env_isa == "POWER10") return try_isa(env_isa, CpuIsa::POWER10, cpu_supports_power10()); #endif @@ -78,7 +78,7 @@ namespace ctranslate2 { # elif defined(CT2_ARM64_BUILD) if (cpu_supports_neon()) return CpuIsa::NEON; -# elif defined(CT2_PPC64_BUILD) 
+# elif defined(CT2_PPC64LE_BUILD) if (cpu_supports_power10()) return CpuIsa::POWER10; # endif diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index 45c0f5815..eedfa3bd4 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -6,15 +6,15 @@ namespace ctranslate2 { namespace cpu { enum class CpuIsa { - GENERIC, + GENERIC,POWER10, #if defined(CT2_X86_BUILD) AVX, AVX2, AVX512, #elif defined(CT2_ARM64_BUILD) NEON, -#elif defined(CT2_PPC64LE_BUILD) - POWER10, + /*#elif defined(CT2_PPC64LE_BUILD) + POWER10,*/ #endif }; @@ -56,12 +56,13 @@ namespace ctranslate2 { CPU_ISA_CASE(cpu::CpuIsa::NEON, SINGLE_ARG(STMTS)) \ CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } -#elif defined(CT2_PPC64_BUILD) +#elif defined(CT2_PPC64LE_BUILD) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ - CPU_ISA_CASE(cpu::CpuIsa::POWER10, SINGLE_ARG(STMTS)) \ - #CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_CASE(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } +#endif #elif defined(__AVX512F__) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ @@ -88,4 +89,3 @@ namespace ctranslate2 { CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ } #endif -#endif diff --git a/src/cpu/kernels.cc b/src/cpu/kernels.cc index e82cd34aa..239394b6a 100644 --- a/src/cpu/kernels.cc +++ b/src/cpu/kernels.cc @@ -1,10 +1,13 @@ #include "cpu/kernels.h" - +//#include "cpu/cpu_isa.h" #include #if defined(__AVX512F__) # define TARGET_ISA CpuIsa::AVX512 # include "cpu/vec_avx512.h" +#elif defined(CT2_PPC64LE_BUILD) +# define TARGET_ISA CpuIsa::POWER10 +# include "cpu/vec_power10.h" #elif defined(__AVX2__) # define TARGET_ISA CpuIsa::AVX2 # include "cpu/vec_avx.h" @@ -14,9 +17,9 @@ #elif (defined(__ARM_NEON) && !defined(CT2_WITH_CPU_DISPATCH)) || defined(USE_NEON) # define TARGET_ISA CpuIsa::NEON # include "cpu/vec_neon.h" -#elif defined(CT2_PPC64_BUILD) -# define TARGET_ISA CpuIsa::POWER10 
-# include "cpu/vec_power10.h" +//#elif defined(CT2_PPC64LE_BUILD) +//# define TARGET_ISA CpuIsa::GENERIC +//# include "cpu/vec_power10.h" #else # define TARGET_ISA CpuIsa::GENERIC # include "cpu/vec.h" diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index 1a4bb859a..ea285f58f 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -1,38 +1,52 @@ #pragma once +//#include +//#include #include #include #include -#include "ctranslate2/types.h" +#include + +#include "vec.h" + +#if defined(__GNUC__) || defined(__clang__) +# define __ct2_align16__ __attribute__((aligned(16))) +#else +# define __ct2_align16__ +#endif namespace ctranslate2 { namespace cpu { - // Interface for vectorized types. - template - struct Vec { + template<> + struct Vec { - using value_type = T; - using mask_type = bool; - static constexpr dim_t width = 1; + using value_type = __vector float; + using mask_type = __vector bool int; + static constexpr dim_t width = 4; + //using value_type = float; + //using mask_type = uint; + //static constexpr dim_t width = 4; - static inline value_type load(T value) { - return value; + static inline value_type load(float value) { + return vec_lde(0,&value); } - static inline value_type load(const T* ptr) { - return *ptr; + static inline value_type load(const float* ptr) { + return vec_ld(0,ptr); } - static inline value_type load(const T* ptr, dim_t count, T default_value = T(0)) { + static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { (void)count; (void)default_value; - return *ptr; + //KESKEN + return vec_ld(0,ptr); } static inline value_type load_and_convert(const int32_t* ptr) { - return *ptr; + + return vec_ctf(vec_lde(0,ptr),0); } static inline value_type load_and_convert(const int32_t* ptr, @@ -40,192 +54,272 @@ namespace ctranslate2 { int32_t default_value = 0) { (void)count; (void)default_value; - return *ptr; + //KESKEN + return vec_ctf(vec_lde(0,ptr),0); } - static inline void 
store(value_type value, T* ptr) { - *ptr = value; + static inline void store(value_type value, float* ptr) { + //*ptr = value; + vec_ste(value,0,ptr); } - static inline void store(value_type value, T* ptr, dim_t count) { + static inline void store(value_type value, float* ptr, dim_t count) { (void)count; - *ptr = value; + //*ptr = value; + //KESKEN + vec_ste(value,0,ptr); } - static inline value_type bit_and(value_type a, value_type b) { - return a & b; + static inline value_type bit_and(value_type a, value_type b) { + //return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + //return a & b; + //__vector value_type va = *(__vector value_type *)(&a); + //__vector value_type vb = *(__vector value_type *)(&b); + //__vector value_type va=a; + //__vector value_type vb=b; + return vec_and(a,b); } static inline value_type bit_xor(value_type a, value_type b) { - return a ^ b; + return vec_xor(a,b); } static inline mask_type lt(value_type a, value_type b) { - return a < b; + return vec_cmplt(a,b); } static inline value_type select(mask_type mask, value_type a, value_type b) { + //KESKEN!! return mask ? 
a : b; } static inline value_type abs(value_type a) { - return static_cast(std::abs(a)); + return vec_abs(a); } static inline value_type neg(value_type a) { - return -a; + return vec_neg(a); } static inline value_type rcp(value_type a) { - return static_cast(1) / a; + return vec_re(a); } static inline value_type exp(value_type a) { - return std::exp(a); + //KESKEN + return exp(a); } static inline value_type log(value_type a) { - return std::log(a); + return vec_loge(a); } - static inline value_type sin(value_type a) { - return std::sin(a); + value_type out; + for (int i=0;i<4;i+=1) { + out=vec_insert(std::sin(vec_extract(a,i)),a,i); + } + return out; } static inline value_type cos(value_type a) { - return std::cos(a); + return cos(a); } static inline value_type tanh(value_type a) { - return std::tanh(a); + return tanh(a); } static inline value_type erf(value_type a) { - return std::erf(a); + return erf(a); } static inline value_type max(value_type a, value_type b) { - return std::max(a, b); + return vec_max(a, b); } static inline value_type min(value_type a, value_type b) { - return std::min(a, b); + return vec_min(a, b); } static inline value_type add(value_type a, value_type b) { - return a + b; + return vec_add(a,b); } static inline value_type sub(value_type a, value_type b) { - return a - b; + return vec_sub(a,b); } static inline value_type mul(value_type a, value_type b) { - return a * b; + return vec_mul(a,b); } static inline value_type div(value_type a, value_type b) { - return a / b; + return vec_div(a,b); } static inline value_type mul_add(value_type a, value_type b, value_type c) { - return a * b + c; + //return a * b + c; + return vec_madd(a,b,c); } - static inline T reduce_add(value_type a) { - return a; + static inline float reduce_add(value_type a) { + float f=0; + for (int i=0;i<4;i+=1) { + f+=a[i]; + } + return f; } - static inline T reduce_max(value_type a) { - return a; + static inline float reduce_max(value_type a) { + float max=0; + for (int 
i=0;i<4;i+=1) { + if (a[i]>max) { + max=a[i]; + } + } + return max; + } + + +/* static inline value_type load(float value) { + return vdupq_n_f32(value); } - }; + static inline value_type load(const float* ptr) { + return vld1q_f32(ptr); + } + + static inline value_type load(const float* ptr, dim_t count, float default_value = 0) { + if (count == width) { + return vld1q_f32(ptr); + } else { + __ct2_align16__ float tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return vld1q_f32(tmp_values); + } + } + + static inline value_type load_and_convert(const int32_t* ptr) { + return vcvtq_f32_s32(vld1q_s32(ptr)); + } + + static inline value_type load_and_convert(const int32_t* ptr, + dim_t count, + int32_t default_value = 0) { + if (count == width) { + return load_and_convert(ptr); + } else { + __ct2_align16__ int32_t tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load_and_convert(tmp_values); + } + } - template - using vec_type = typename Vec::value_type; - - template - vec_type vec_tanh(vec_type a) { - using VecType = Vec; - - // Implementation ported from Eigen: - // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 - - const auto plus_clamp = VecType::load(7.90531110763549805f); - const auto minus_clamp = VecType::load(-7.90531110763549805f); - const auto tiny = VecType::load(0.0004f); - const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); - const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); - - const auto alpha_1 = VecType::load(4.89352455891786e-03f); - const auto alpha_3 = VecType::load(6.37261928875436e-04f); - const auto alpha_5 = VecType::load(1.48572235717979e-05f); - const auto alpha_7 = VecType::load(5.12229709037114e-08f); - const auto alpha_9 = VecType::load(-8.60467152213735e-11f); - const auto alpha_11 = 
VecType::load(2.00018790482477e-13f); - const auto alpha_13 = VecType::load(-2.76076847742355e-16f); - - const auto beta_0 = VecType::load(4.89352518554385e-03f); - const auto beta_2 = VecType::load(2.26843463243900e-03f); - const auto beta_4 = VecType::load(1.18534705686654e-04f); - const auto beta_6 = VecType::load(1.19825839466702e-06f); - - const auto x2 = VecType::mul(x, x); - - auto p = VecType::mul_add(x2, alpha_13, alpha_11); - p = VecType::mul_add(x2, p, alpha_9); - p = VecType::mul_add(x2, p, alpha_7); - p = VecType::mul_add(x2, p, alpha_5); - p = VecType::mul_add(x2, p, alpha_3); - p = VecType::mul_add(x2, p, alpha_1); - p = VecType::mul(x, p); - - auto q = VecType::mul_add(x2, beta_6, beta_4); - q = VecType::mul_add(x2, q, beta_2); - q = VecType::mul_add(x2, q, beta_0); - - return VecType::select(tiny_mask, x, VecType::div(p, q)); - } - - template - vec_type vec_erf(vec_type a) { - using VecType = Vec; - - // Implementation ported from PyTorch: - // https://github.com/pytorch/pytorch/blob/e9bc82f54b9867cc82b0e94dcdc90f9d156277bd/aten/src/ATen/cpu/vec/vec256/vec256_float.h#L158-L189 - - // constants - const auto neg_zero_vec = VecType::load(-0.f); - const auto one_vec = VecType::load(1.0f); - const auto p = VecType::load(0.3275911f); - const auto p1 = VecType::load(0.254829592f); - const auto p2 = VecType::load(-0.284496736f); - const auto p3 = VecType::load(1.421413741f); - const auto p4 = VecType::load(-1.453152027f); - const auto p5 = VecType::load(1.061405429f); - // sign(x) - auto sign_mask = VecType::bit_and(neg_zero_vec, a); - auto abs_vec = VecType::bit_xor(sign_mask, a); - // t = 1 / (p * abs(x) + 1) - auto tmp0 = VecType::mul_add(p, abs_vec, one_vec); - auto t = VecType::div(one_vec, tmp0); - // r = p5 * t ^ 4 + p4 * t ^ 3 + p3 * t ^ 2 + p2 * t + p1 - auto tmp1 = VecType::mul_add(p5, t, p4); - auto tmp2 = VecType::mul_add(tmp1, t, p3); - auto tmp3 = VecType::mul_add(tmp2, t, p2); - auto r = VecType::mul_add(tmp3, t, p1); - // - exp(- x * x) - 
auto pow_2 = VecType::mul(a, a); - auto neg_pow_2 = VecType::bit_xor(neg_zero_vec, pow_2); - // auto tmp4 = exp(neg_pow_2); - auto tmp4 = VecType::exp(neg_pow_2); - auto tmp5 = VecType::bit_xor(neg_zero_vec, tmp4); - // erf(x) = sign(x) * (1 - r * t * exp(- x * x)) - auto tmp6 = VecType::mul(tmp5, t); - auto tmp7 = VecType::mul_add(tmp6, r, one_vec); - return VecType::bit_xor(sign_mask, tmp7); - } + static inline void store(value_type value, float* ptr) { + vst1q_f32(ptr, value); + } + + static inline void store(value_type value, float* ptr, dim_t count) { + if (count == width) { + vst1q_f32(ptr, value); + } else { + __ct2_align16__ float tmp_values[width]; + vst1q_f32(tmp_values, value); + std::copy(tmp_values, tmp_values + count, ptr); + } + } + + static inline value_type bit_and(value_type a, value_type b) { + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + } + + static inline value_type bit_xor(value_type a, value_type b) { + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); + } + + static inline mask_type lt(value_type a, value_type b) { + return vcltq_f32(a, b); + } + + static inline value_type select(mask_type mask, value_type a, value_type b) { + return vbslq_f32(mask, a, b); + } + + static inline value_type abs(value_type a) { + return vabsq_f32(a); + } + + static inline value_type neg(value_type a) { + return vnegq_f32(a); + } + + static inline value_type rcp(value_type a) { + return vrecpeq_f32(a); + } + + static inline value_type exp(value_type a) { + return exp_ps(a); + } + + static inline value_type log(value_type a) { + return log_ps(a); + } + + static inline value_type sin(value_type a) { + return sin_ps(a); + } + + static inline value_type cos(value_type a) { + return cos_ps(a); + } + + static inline value_type tanh(value_type a) { + return vec_tanh(a); + } + + static inline value_type erf(value_type a) { + return vec_erf(a); + } + + static inline value_type 
max(value_type a, value_type b) { + return vmaxq_f32(a, b); + } + + static inline value_type min(value_type a, value_type b) { + return vminq_f32(a, b); + } + + static inline value_type add(value_type a, value_type b) { + return vaddq_f32(a, b); + } + + static inline value_type sub(value_type a, value_type b) { + return vsubq_f32(a, b); + } + + static inline value_type mul(value_type a, value_type b) { + return vmulq_f32(a, b); + } + + static inline value_type div(value_type a, value_type b) { + return vdivq_f32(a, b); + } + + static inline value_type mul_add(value_type a, value_type b, value_type c) { + return vfmaq_f32(c, a, b); + } + + static inline float reduce_add(value_type a) { + return vaddvq_f32(a); + } + + static inline float reduce_max(value_type a) { + return vmaxvq_f32(a); + } +*/ + }; } } diff --git a/src/utils.cc b/src/utils.cc index f0eb29509..de031d963 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -42,6 +42,10 @@ namespace ctranslate2 { spdlog::info("CPU: {} (NEON={})", cpu::cpu_vendor(), cpu::cpu_supports_neon()); +#elif defined(CT2_PPC64LE_BUILD) + spdlog::info("CPU: {} (NEON={})", + cpu::cpu_vendor(), + cpu::cpu_supports_power10()); #endif spdlog::info(" - Selected ISA: {}", cpu::isa_to_str(cpu::get_cpu_isa())); spdlog::info(" - Use Intel MKL: {}", cpu::mayiuse_mkl()); From 70b050924bac9bf00ccf013cbf7100a8620defbe Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 29 Nov 2023 11:00:33 +0100 Subject: [PATCH 04/10] Power10 initial build --- src/cpu/cpu_isa.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpu/cpu_isa.h b/src/cpu/cpu_isa.h index eedfa3bd4..b32379c7b 100644 --- a/src/cpu/cpu_isa.h +++ b/src/cpu/cpu_isa.h @@ -59,8 +59,7 @@ namespace ctranslate2 { #elif defined(CT2_PPC64LE_BUILD) # define CPU_ISA_DISPATCH(STMTS) \ switch (cpu::get_cpu_isa()) { \ - CPU_ISA_CASE(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ - CPU_ISA_DEFAULT(cpu::CpuIsa::GENERIC, SINGLE_ARG(STMTS)) \ + CPU_ISA_DEFAULT(cpu::CpuIsa::POWER10, 
SINGLE_ARG(STMTS)) \ } #endif #elif defined(__AVX512F__) From f368229072475433dc8478c20be935648edeeafa Mon Sep 17 00:00:00 2001 From: Dagamies Date: Wed, 29 Nov 2023 11:01:07 +0100 Subject: [PATCH 05/10] Initial Power10 build --- src/cpu/vec_power10.h | 361 ++++++++++++++++++++---------------------- 1 file changed, 168 insertions(+), 193 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index ea285f58f..a32a6451d 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -1,13 +1,17 @@ #pragma once -//#include -//#include + #include #include #include - +#include #include +#include + + +#include + #include "vec.h" #if defined(__GNUC__) || defined(__clang__) @@ -19,64 +23,88 @@ namespace ctranslate2 { namespace cpu { + #define ALIGNMENT_VALUE 16u + template<> struct Vec { - using value_type = __vector float; - using mask_type = __vector bool int; + using value_type = __ct2_align16__ __vector float; + using mask_type = __ct2_align16__ __vector bool int; static constexpr dim_t width = 4; - //using value_type = float; - //using mask_type = uint; - //static constexpr dim_t width = 4; + + static inline value_type unaligned_load(const float* ptr){ + return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + } + static inline value_type load(float value) { - return vec_lde(0,&value); + return vec_splats(value); } static inline value_type load(const float* ptr) { - return vec_ld(0,ptr); + return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + } static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { - (void)count; - (void)default_value; - //KESKEN - return vec_ld(0,ptr); + if (count == width) { + return load(ptr); + } else { + __ct2_align16__ float tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + std::copy(ptr, ptr + count, tmp_values); + return load(tmp_values); + } } static inline value_type load_and_convert(const int32_t* ptr) { - return 
vec_ctf(vec_lde(0,ptr),0); + return vec_ctf(vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)),0); } static inline value_type load_and_convert(const int32_t* ptr, dim_t count, int32_t default_value = 0) { - (void)count; - (void)default_value; - //KESKEN - return vec_ctf(vec_lde(0,ptr),0); + if (count == width) { + return load_and_convert(ptr); + } else { + __ct2_align16__ int32_t tmp_values[width]; + std::fill(tmp_values, tmp_values + width, default_value); + for (int i=0;imax) { max=a[i]; } } - return max; - } - - -/* static inline value_type load(float value) { - return vdupq_n_f32(value); - } - - static inline value_type load(const float* ptr) { - return vld1q_f32(ptr); - } - - static inline value_type load(const float* ptr, dim_t count, float default_value = 0) { - if (count == width) { - return vld1q_f32(ptr); - } else { - __ct2_align16__ float tmp_values[width]; - std::fill(tmp_values, tmp_values + width, default_value); - std::copy(ptr, ptr + count, tmp_values); - return vld1q_f32(tmp_values); - } - } - - static inline value_type load_and_convert(const int32_t* ptr) { - return vcvtq_f32_s32(vld1q_s32(ptr)); - } - - static inline value_type load_and_convert(const int32_t* ptr, - dim_t count, - int32_t default_value = 0) { - if (count == width) { - return load_and_convert(ptr); - } else { - __ct2_align16__ int32_t tmp_values[width]; - std::fill(tmp_values, tmp_values + width, default_value); - std::copy(ptr, ptr + count, tmp_values); - return load_and_convert(tmp_values); - } - } - - static inline void store(value_type value, float* ptr) { - vst1q_f32(ptr, value); - } - - static inline void store(value_type value, float* ptr, dim_t count) { - if (count == width) { - vst1q_f32(ptr, value); - } else { - __ct2_align16__ float tmp_values[width]; - vst1q_f32(tmp_values, value); - std::copy(tmp_values, tmp_values + count, ptr); - } - } - - static inline value_type bit_and(value_type a, value_type b) { - return 
vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); - } - - static inline value_type bit_xor(value_type a, value_type b) { - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); - } - - static inline mask_type lt(value_type a, value_type b) { - return vcltq_f32(a, b); - } - - static inline value_type select(mask_type mask, value_type a, value_type b) { - return vbslq_f32(mask, a, b); - } - - static inline value_type abs(value_type a) { - return vabsq_f32(a); - } - - static inline value_type neg(value_type a) { - return vnegq_f32(a); - } - - static inline value_type rcp(value_type a) { - return vrecpeq_f32(a); - } - - static inline value_type exp(value_type a) { - return exp_ps(a); - } - - static inline value_type log(value_type a) { - return log_ps(a); - } - - static inline value_type sin(value_type a) { - return sin_ps(a); - } - - static inline value_type cos(value_type a) { - return cos_ps(a); - } - - static inline value_type tanh(value_type a) { - return vec_tanh(a); - } - - static inline value_type erf(value_type a) { - return vec_erf(a); - } - - static inline value_type max(value_type a, value_type b) { - return vmaxq_f32(a, b); - } - - static inline value_type min(value_type a, value_type b) { - return vminq_f32(a, b); - } - - static inline value_type add(value_type a, value_type b) { - return vaddq_f32(a, b); - } - - static inline value_type sub(value_type a, value_type b) { - return vsubq_f32(a, b); - } - - static inline value_type mul(value_type a, value_type b) { - return vmulq_f32(a, b); - } - - static inline value_type div(value_type a, value_type b) { - return vdivq_f32(a, b); - } - - static inline value_type mul_add(value_type a, value_type b, value_type c) { - return vfmaq_f32(c, a, b); - } - - static inline float reduce_add(value_type a) { - return vaddvq_f32(a); - } - - static inline float reduce_max(value_type a) { - return vmaxvq_f32(a); - } -*/ + //std::cout << "reduce_max 
"; + return max;*/ + float t0 = a[0] > a[1] ? a[0] : a[1]; + float t1 = a[2] > a[3] ? a[2] : a[3]; + return t0 > t1 ? t0 : t1; + } + static inline void output_vec(value_type v) + { + for(int a=0;a<4;a+=1) + std::cout<< " "< + static inline value_type vec_tanh(value_type a) { + using VecType = Vec; + + // Implementation ported from Eigen: + // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 + //std::cout << "Starting Power10::vec_tanh\n"; + const auto plus_clamp = VecType::load(7.90531110763549805f); + //std::cout << " plus_clamp:"; + //VecType::output_vec(plus_clamp); + const auto minus_clamp = VecType::load(-7.90531110763549805f); + //std::cout << " minus_clamp:"; + //VecType::output_vec(minus_clamp); + const auto tiny = VecType::load(0.0004f); + //std::cout << "tiny:"; + //VecType::output_vec(tiny); + const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); + //std::cout << "x:"; + //VecType::output_vec(x); + const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); + //std::cout << "tiny_mask:"; + //VecType::output_vec_mask(tiny_mask); + + + const auto alpha_1 = VecType::load(4.89352455891786e-03f); + const auto alpha_3 = VecType::load(6.37261928875436e-04f); + const auto alpha_5 = VecType::load(1.48572235717979e-05f); + const auto alpha_7 = VecType::load(5.12229709037114e-08f); + const auto alpha_9 = VecType::load(-8.60467152213735e-11f); + const auto alpha_11 = VecType::load(2.00018790482477e-13f); + const auto alpha_13 = VecType::load(-2.76076847742355e-16f); + + const auto beta_0 = VecType::load(4.89352518554385e-03f); + const auto beta_2 = VecType::load(2.26843463243900e-03f); + const auto beta_4 = VecType::load(1.18534705686654e-04f); + const auto beta_6 = VecType::load(1.19825839466702e-06f); + + const auto x2 = VecType::mul(x, x); + //std::cout << "x2:"; + //output_vec(x2); + + + auto p = VecType::mul_add(x2, alpha_13, alpha_11); + //std::cout << "p:"; + //output_vec(p); + + p = 
VecType::mul_add(x2, p, alpha_9); + p = VecType::mul_add(x2, p, alpha_7); + p = VecType::mul_add(x2, p, alpha_5); + p = VecType::mul_add(x2, p, alpha_3); + p = VecType::mul_add(x2, p, alpha_1); + p = VecType::mul(x, p); + + auto q = VecType::mul_add(x2, beta_6, beta_4); + q = VecType::mul_add(x2, q, beta_2); + q = VecType::mul_add(x2, q, beta_0); + + return VecType::select(tiny_mask, x, VecType::div(p, q)); + } }; - } } From 7e1f287e5b09cee025c8c19521b3a00696ffb452 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Fri, 1 Dec 2023 05:17:36 +0100 Subject: [PATCH 06/10] Initial Power10 build --- src/cpu/vec_power10.h | 135 ++++++++++++------------------------------ 1 file changed, 38 insertions(+), 97 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index a32a6451d..98264126f 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -7,10 +7,7 @@ #include #include -#include - - -#include +#include #include "vec.h" @@ -133,31 +130,31 @@ namespace ctranslate2 { } static inline value_type exp(value_type a) { - return Sleef_expf4_u10vsx(a); + return Sleef_expf4_u10vsx3(a); } static inline value_type log(value_type a) { - return Sleef_logf4_u35vsx(a); + return Sleef_logf4_u35vsx3(a); } static inline value_type sin(value_type a) { - return Sleef_sinf4_u35vsx(a); + return Sleef_sinf4_u35vsx3(a); } static inline value_type cos(value_type a) { - return Sleef_cosf4_u35vsx(a); + return Sleef_cosf4_u35vsx3(a); } static inline value_type tanh(value_type a) { - return Sleef_tanhf4_u35vsx(a); + return Sleef_tanhf4_u35vsx3(a); } static inline value_type erf(value_type a) { - return Sleef_erff4_u10vsx(a); + return Sleef_erff4_u10vsx3(a); } static inline value_type max(value_type a, value_type b) { @@ -190,24 +187,42 @@ namespace ctranslate2 { } static inline float reduce_add(value_type a) { - /*float f=0; - for (int i=0;i> 2) & 0x03; + unsigned long __element_selector_54 = (1 >> 4) & 0x03; + unsigned long __element_selector_76 = (1 >> 6) & 0x03; + static 
const unsigned int __permute_selectors[4] = + { +#ifdef __LITTLE_ENDIAN__ + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +#else + 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F +#endif + }; + __vector unsigned int __t; + __t[0] = __permute_selectors[__element_selector_10]; + __t[1] = __permute_selectors[__element_selector_32]; + __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; + __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; + + __vector unsigned long long v1 = vec_mergel((__vector unsigned long long)a,(__vector unsigned long long)a); + value_type v2 = (value_type)a + (value_type)v1; + value_type v3 = vec_perm (v2, v2,(__vector unsigned char) __t); + return v2[0]+v3[0]; + + /*__m128 t1 = _mm_movehl_ps(a, a); + for (int b=0; b<4;b+=1) + std::cout << "t1["< a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; } - static inline void output_vec(value_type v) - { - for(int a=0;a<4;a+=1) - std::cout<< " "< - static inline value_type vec_tanh(value_type a) { - using VecType = Vec; - - // Implementation ported from Eigen: - // https://gitlab.com/libeigen/eigen/-/blob/3.4.0/Eigen/src/Core/MathFunctionsImpl.h#L18-L76 - //std::cout << "Starting Power10::vec_tanh\n"; - const auto plus_clamp = VecType::load(7.90531110763549805f); - //std::cout << " plus_clamp:"; - //VecType::output_vec(plus_clamp); - const auto minus_clamp = VecType::load(-7.90531110763549805f); - //std::cout << " minus_clamp:"; - //VecType::output_vec(minus_clamp); - const auto tiny = VecType::load(0.0004f); - //std::cout << "tiny:"; - //VecType::output_vec(tiny); - const auto x = VecType::max(VecType::min(a, plus_clamp), minus_clamp); - //std::cout << "x:"; - //VecType::output_vec(x); - const auto tiny_mask = VecType::lt(VecType::abs(a), tiny); - //std::cout << "tiny_mask:"; - //VecType::output_vec_mask(tiny_mask); - - - const auto alpha_1 = VecType::load(4.89352455891786e-03f); - const auto alpha_3 = VecType::load(6.37261928875436e-04f); - const auto alpha_5 = 
VecType::load(1.48572235717979e-05f); - const auto alpha_7 = VecType::load(5.12229709037114e-08f); - const auto alpha_9 = VecType::load(-8.60467152213735e-11f); - const auto alpha_11 = VecType::load(2.00018790482477e-13f); - const auto alpha_13 = VecType::load(-2.76076847742355e-16f); - - const auto beta_0 = VecType::load(4.89352518554385e-03f); - const auto beta_2 = VecType::load(2.26843463243900e-03f); - const auto beta_4 = VecType::load(1.18534705686654e-04f); - const auto beta_6 = VecType::load(1.19825839466702e-06f); - - const auto x2 = VecType::mul(x, x); - //std::cout << "x2:"; - //output_vec(x2); - - - auto p = VecType::mul_add(x2, alpha_13, alpha_11); - //std::cout << "p:"; - //output_vec(p); - - p = VecType::mul_add(x2, p, alpha_9); - p = VecType::mul_add(x2, p, alpha_7); - p = VecType::mul_add(x2, p, alpha_5); - p = VecType::mul_add(x2, p, alpha_3); - p = VecType::mul_add(x2, p, alpha_1); - p = VecType::mul(x, p); - - auto q = VecType::mul_add(x2, beta_6, beta_4); - q = VecType::mul_add(x2, q, beta_2); - q = VecType::mul_add(x2, q, beta_0); - - return VecType::select(tiny_mask, x, VecType::div(p, q)); - } }; } } From 0b5a0f6fc9ea77ff772ee343043c8c07d1a3bac4 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Fri, 1 Dec 2023 06:33:30 +0100 Subject: [PATCH 07/10] Code cleaning for Power10 port --- src/cpu/vec_power10.h | 40 +++++----------------------------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index 98264126f..c87242184 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -30,17 +30,16 @@ namespace ctranslate2 { static constexpr dim_t width = 4; static inline value_type unaligned_load(const float* ptr){ - return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; } static inline value_type load(float value) { - return vec_splats(value); + return (value_type){value,value,value,value}; } static inline 
value_type load(const float* ptr) { - return vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)); - + return (value_type){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)}; } static inline value_type load(const float* ptr, dim_t count, float default_value = float(0)) { @@ -55,8 +54,7 @@ namespace ctranslate2 { } static inline value_type load_and_convert(const int32_t* ptr) { - - return vec_ctf(vec_perm(vec_ld(0, ptr), vec_ld(16, ptr), vec_lvsl(0, ptr)),0); + return vec_ctf((vector signed int){*ptr,*(ptr+1),*(ptr+2),*(ptr+3)},0); } static inline value_type load_and_convert(const int32_t* ptr, @@ -67,15 +65,7 @@ namespace ctranslate2 { } else { __ct2_align16__ int32_t tmp_values[width]; std::fill(tmp_values, tmp_values + width, default_value); - for (int i=0;imax) { - max=a[i]; - } - } - //std::cout << "reduce_max "; - return max;*/ float t0 = a[0] > a[1] ? a[0] : a[1]; float t1 = a[2] > a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; From 95d18e6269d362a31b66e36840a527e6d0c78dd8 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 10:38:19 +0200 Subject: [PATCH 08/10] Docker/Podman support for ppc64le --- docker/Dockerfile.ppc64le | 91 +++++++++++++++++++++++++++ python/setup-ppc64le.py | 126 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 docker/Dockerfile.ppc64le create mode 100644 python/setup-ppc64le.py diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le new file mode 100644 index 000000000..e608d359a --- /dev/null +++ b/docker/Dockerfile.ppc64le @@ -0,0 +1,91 @@ +FROM ppc64le/ubuntu:22.04 as builder + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + python3-dev \ + python3-pip \ + wget \ + git \ + build-essential \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + + +WORKDIR /root + +RUN python3 -m pip --no-cache-dir install cmake==3.22.* + +RUN wget -qO- https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu/dists/jammy/615d762f.gpg.key | tee 
/etc/apt/trusted.gpg.d/615d762f.asc && \ + echo "deb [signed-by=/etc/apt/trusted.gpg.d/615d762f.asc] https://public.dhe.ibm.com/software/server/POWER/Linux/toolchain/at/ubuntu jammy at17.0" >> /etc/apt/sources.list && \ + cat /etc/apt/sources.list && \ + cat /etc/apt/trusted.gpg.d/615d762f.asc && \ + apt update && \ + cat /etc/apt/sources.list && \ + apt install -y advance-toolchain-at17.0-runtime advance-toolchain-at17.0-devel advance-toolchain-at17.0-perf advance-toolchain-at17.0-mcore-libs + +ENV SLEEF_VERSION=3.6.1 +RUN wget -q https://github.com/shibatch/sleef/archive/refs/tags/${SLEEF_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd sleef* && \ + mkdir build && \ + cd build && \ + cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release .. && \ + cd .. && \ + cmake --build build -j --clean-first && \ + cmake --install build --prefix=/usr/ + + +ENV ONEDNN_VERSION=3.1.1 +RUN wget -q https://github.com/oneapi-src/oneDNN/archive/refs/tags/v${ONEDNN_VERSION}.tar.gz && \ + tar xf *.tar.gz && \ + rm *.tar.gz && \ + cd oneDNN-* && \ + cmake -DCMAKE_BUILD_TYPE=Release -DONEDNN_LIBRARY_TYPE=STATIC -DONEDNN_BUILD_EXAMPLES=OFF -DONEDNN_BUILD_TESTS=OFF -DONEDNN_ENABLE_WORKLOAD=INFERENCE -DONEDNN_ENABLE_PRIMITIVE="CONVOLUTION;REORDER" -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP . && \ + make -j$(nproc) install && \ + cd .. && \ + rm -r oneDNN-* + +COPY third_party third_party +COPY cli cli +COPY include include +COPY src src +COPY cmake cmake +COPY python python +COPY CMakeLists.txt . 
+ +ARG CXX_FLAGS +ENV CXX_FLAGS=${CXX_FLAGS:-"-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off"} + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 + +RUN mkdir build && \ + cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=${CTRANSLATE2_ROOT} \ + -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF \ + -DWITH_DNNL=ON -DOPENMP_RUNTIME=COMP \ + -DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \ + -DCMAKE_BUILD_TYPE=Release \ + .. && \ + VERBOSE=1 make -j$(nproc) install + +ENV LANG=en_US.UTF-8 +COPY README.md . + +RUN cd python && \ + python3 -m pip --no-cache-dir install -r install_requirements.txt && \ + python3 setup-ppc64le.py bdist_wheel --dist-dir $CTRANSLATE2_ROOT + + +ENV CTRANSLATE2_ROOT=/opt/ctranslate2 +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CTRANSLATE2_ROOT/lib + +#COPY --from=builder $CTRANSLATE2_ROOT $CTRANSLATE2_ROOT +RUN pip3 install --force-reinstall ninja + + +RUN python3 -m pip --no-cache-dir install $CTRANSLATE2_ROOT/*.whl && \ + rm $CTRANSLATE2_ROOT/*.whl + +ENTRYPOINT ["/opt/ctranslate2/bin/ct2-translator"] diff --git a/python/setup-ppc64le.py b/python/setup-ppc64le.py new file mode 100644 index 000000000..51a21fd43 --- /dev/null +++ b/python/setup-ppc64le.py @@ -0,0 +1,126 @@ +import glob +import os +import sys + +import pybind11 + +from pybind11.setup_helpers import ParallelCompile +from setuptools import Extension, find_packages, setup + +base_dir = os.path.dirname(os.path.abspath(__file__)) +include_dirs = [pybind11.get_include()] +library_dirs = [] + + +def _get_long_description(): + readme_path = os.path.join(base_dir, "README.md") + if not os.path.exists(readme_path): + return "" + with open(readme_path, encoding="utf-8") as readme_file: + return readme_file.read() + + +def _get_project_version(): + version_path = os.path.join(base_dir, "ctranslate2", "version.py") + version = {} + with open(version_path, encoding="utf-8") as fp: + exec(fp.read(), version) + return version["__version__"] + + +def _maybe_add_library_root(lib_name): + if "%s_ROOT" % lib_name in 
os.environ: + root = os.environ["%s_ROOT" % lib_name] + include_dirs.append("%s/include" % root) + for lib_dir in ("lib", "lib64"): + path = "%s/%s" % (root, lib_dir) + if os.path.exists(path): + library_dirs.append(path) + break + + +_maybe_add_library_root("CTRANSLATE2") + +cflags = ["-std=c++17", "-fvisibility=hidden"] +ldflags = [] +package_data = {} +if sys.platform == "darwin": + # std::visit requires macOS 10.14 + cflags.append("-mmacosx-version-min=10.14") + ldflags.append("-Wl,-rpath,/usr/local/lib") +elif sys.platform == "win32": + cflags = ["/std:c++17", "/d2FH4-"] + package_data["ctranslate2"] = ["*.dll"] + +ctranslate2_module = Extension( + "ctranslate2._ext", + sources=glob.glob(os.path.join("cpp", "*.cc")), + extra_compile_args=cflags, + extra_link_args=ldflags, + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=["ctranslate2"], +) + +ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install() + +setup( + name="ctranslate2", + version=_get_project_version(), + license="MIT", + description="Fast inference engine for Transformer models", + long_description=_get_long_description(), + long_description_content_type="text/markdown", + author="OpenNMT", + url="https://opennmt.net", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: GPU :: NVIDIA CUDA :: 11.0", + "Environment :: GPU :: NVIDIA CUDA :: 11.1", + "Environment :: GPU :: NVIDIA CUDA :: 11.2", + "Environment :: GPU :: NVIDIA CUDA :: 11.3", + "Environment :: GPU :: NVIDIA CUDA :: 11.4", + "Environment :: GPU :: NVIDIA CUDA :: 11.5", + "Environment :: GPU :: NVIDIA CUDA :: 11.6", + "Environment :: GPU :: NVIDIA CUDA :: 11.7", + "Environment :: GPU :: NVIDIA CUDA :: 11.8", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: 
Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + project_urls={ + "Documentation": "https://opennmt.net/CTranslate2", + "Forum": "https://forum.opennmt.net", + "Gitter": "https://gitter.im/OpenNMT/CTranslate2", + "Source": "https://github.com/OpenNMT/CTranslate2", + }, + keywords="opennmt nmt neural machine translation cuda mkl inference quantization", + packages=find_packages(exclude=["bin"]), + package_data=package_data, + ext_modules=[ctranslate2_module], + python_requires=">=3.8", + install_requires=[ + "setuptools", + "numpy==1.25.2", + "pyyaml>=5.3,<7", + ], + entry_points={ + "console_scripts": [ + "ct2-fairseq-converter=ctranslate2.converters.fairseq:main", + "ct2-marian-converter=ctranslate2.converters.marian:main", + "ct2-openai-gpt2-converter=ctranslate2.converters.openai_gpt2:main", + "ct2-opennmt-py-converter=ctranslate2.converters.opennmt_py:main", + "ct2-opennmt-tf-converter=ctranslate2.converters.opennmt_tf:main", + "ct2-opus-mt-converter=ctranslate2.converters.opus_mt:main", + "ct2-transformers-converter=ctranslate2.converters.transformers:main", + ], + }, +) From acc9c85ccd27ba17ccbd3441d22c1fdcc51920c1 Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 16:42:15 +0200 Subject: [PATCH 09/10] Build instructions & fixes --- docs/ppc64le.md | 56 ++++++++++++++++++++++++++++++++++++++++++ src/cpu/vec_power10.h | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 docs/ppc64le.md diff --git a/docs/ppc64le.md b/docs/ppc64le.md new file mode 100644 index 000000000..4828aad2b --- /dev/null +++ b/docs/ppc64le.md @@ -0,0 +1,56 @@ +# IBM Power10 -ppc64le + +CTranslate2 fully supports IBM Power10 MMA and VSX extensions. Each Power10 core has 4 Matrix Math Accelerator units. 
For optimum performance use at least SMT4, in some cases SMT8 seems to perform better, but it is advisable to try out both. A simple way to test this is to use the --intra_threads parameter to control the number of threads CTranslate2 is executing. At maximum this should be 8*number of physical cores (SMT-8).
+
+Based on preliminary testing, a Power10 core offers 27-42% higher tokens/s compared to an Intel Gold core.
+
+It should be possible to build for Power9, but the missing MMA units will have a significant impact on performance.
+
+OneDNN is used for int8 matrix math that is fully utilizing the MMA units; it should be possible to build with OpenBLAS for 16-bit MMA usage.
+
+## Build docker / podman container
+
+This is the easy way:
+```git clone --recursive https://github.com/OpenNMT/CTranslate2/
+cd CTranslate2/docker
+podman build -t elinar.ai/ct2-ppc64le -f Dockerfile.ppc64le ..
+
+```
+
+Then run the CTranslate2 container (substitute the mount point, MODEL_LOCATION and SRC_FILE):
+```podman run --security-opt=label=disable --ipc=host --ulimit=host -it --rm -v /tmp:/tmp elinar.ai/ct2-ppc64le --model MODEL_LOCATION --src SRC_FILE --intra_threads 16```
+
+## Install from sources
+This build has been tested on RHEL 9 / ppc64le and requires IBM Advance Toolchain 17.0 ( https://www.ibm.com/support/pages/advance-toolchain-linux-power )
+```
+#sleef:
+git clone -b 3.6.1 https://github.com/shibatch/sleef
+
+cd sleef
+mkdir build && cd build
+cmake -DSLEEF_BUILD_INLINE_HEADERS=TRUE -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -std=gnu++11 -maltivec -mabi=altivec -mstrict-align ' -DCMAKE_C_COMPILER=/opt/at17.0/bin/gcc -DCMAKE_CXX_COMPILER=/opt/at17.0/bin/g++ -DAT_PATH=/opt/at17.0/ -DBUILD_SHARED_LIBS=FALSE -DBUILD_TESTS=FALSE -DENFORCE_VSX3=TRUE -DSLEEF_SHOW_CONFIG=1 -DCMAKE_BUILD_TYPE=Release ..
+ +cmake --build build -j --clean-first +sudo cmake --install build --prefix=/usr/ + + +#OneDNN; +git clone -b v3.2 --recursive https://github.com/oneapi-src/oneDNN +cd oneDNN +mkdir build && cd build +cmake -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -maltivec' -DOPENMP_RUNTIME=COMP .. +make -j16 +sudo make install + + +git clone --recursive https://github.com/Dagamies/CTranslate2 +cd CTranslate2 +mkdir build +cd build +cmake -DWITH_CUDA=OFF -DWITH_MKL=OFF -DWITH_OPENBLAS=OFF -DWITH_DNNL=ON -DCMAKE_CXX_FLAGS='-mcpu=power10 -mtune=power10 -O3 -ffp-contract=off' -DOPENMP_RUNTIME=COMP .. +make -j16 +sudo make install +sudo ldconfig -v +export LD_LIBRARY_PATH=/usr/local/lib64/ + +``` \ No newline at end of file diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index c87242184..de8ec6955 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -206,6 +206,63 @@ namespace ctranslate2 { float t1 = a[2] > a[3] ? a[2] : a[3]; return t0 > t1 ? t0 : t1; } + + static inline value_type round(value_type a) { + return vec_round(a); + } + + template + static inline void convert_and_store(value_type v, U* a, dim_t count) { + *a = v; + } + + static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { + auto i32 = vec_cts(v,0); + + int8_t tmp[4]; + tmp[0]=i32[0]; + tmp[1]=i32[1]; + tmp[2]=i32[2]; + tmp[3]=i32[3]; + std::copy(tmp, tmp + count, a); + + } + + static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { + auto u32 = vec_ctu(v,0); + uint8_t tmp[4]; + tmp[0]=u32[0]; + tmp[1]=u32[1]; + tmp[2]=u32[2]; + tmp[3]=u32[3]; + std::copy(tmp, tmp + count, a); + + + } + + /* static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { + //convert float32x4_t to int32x4_t + auto i32x4 = vcvtq_s32_f32(v); + //then convert to int16x4_t + auto i16x4 = vqmovn_s32(i32x4); + //finally convert to int8x4_t + auto i8x8 = vqmovn_s16(vcombine_s16(i16x4, vdup_n_s16(0))); + int8_t tmp[8]; + vst1_s8(tmp, i8x8); + 
std::copy(tmp, tmp + count, a); + } + + static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { + //convert float32x4_t to uint32x4_t + auto u32x4 = vcvtq_u32_f32(v); + //then convert to uint16x4_t + auto u16x4 = vqmovn_u32(u32x4); + //finally convert to uint8x8_t + auto u8x8 = vqmovn_u16(vcombine_u16(u16x4, vdup_n_u16(0))); + uint8_t tmp[8]; + vst1_u8(tmp, u8x8); + std::copy(tmp, tmp + count, a); + }*/ }; } } From 0651b8c167c7276ebdc72a1daeb95edf5a55fa0a Mon Sep 17 00:00:00 2001 From: Dagamies Date: Thu, 25 Jul 2024 16:57:30 +0200 Subject: [PATCH 10/10] Remove unnecessary code --- src/cpu/vec_power10.h | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/cpu/vec_power10.h b/src/cpu/vec_power10.h index de8ec6955..c9c71abe3 100644 --- a/src/cpu/vec_power10.h +++ b/src/cpu/vec_power10.h @@ -211,11 +211,6 @@ namespace ctranslate2 { return vec_round(a); } - template - static inline void convert_and_store(value_type v, U* a, dim_t count) { - *a = v; - } - static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { auto i32 = vec_cts(v,0); @@ -225,7 +220,6 @@ namespace ctranslate2 { tmp[2]=i32[2]; tmp[3]=i32[3]; std::copy(tmp, tmp + count, a); - } static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { @@ -235,34 +229,8 @@ namespace ctranslate2 { tmp[1]=u32[1]; tmp[2]=u32[2]; tmp[3]=u32[3]; - std::copy(tmp, tmp + count, a); - - + std::copy(tmp, tmp + count, a); } - - /* static inline void convert_and_store(value_type v, int8_t *a, dim_t count) { - //convert float32x4_t to int32x4_t - auto i32x4 = vcvtq_s32_f32(v); - //then convert to int16x4_t - auto i16x4 = vqmovn_s32(i32x4); - //finally convert to int8x4_t - auto i8x8 = vqmovn_s16(vcombine_s16(i16x4, vdup_n_s16(0))); - int8_t tmp[8]; - vst1_s8(tmp, i8x8); - std::copy(tmp, tmp + count, a); - } - - static inline void convert_and_store(value_type v, uint8_t *a, dim_t count) { - //convert float32x4_t to 
uint32x4_t - auto u32x4 = vcvtq_u32_f32(v); - //then convert to uint16x4_t - auto u16x4 = vqmovn_u32(u32x4); - //finally convert to uint8x8_t - auto u8x8 = vqmovn_u16(vcombine_u16(u16x4, vdup_n_u16(0))); - uint8_t tmp[8]; - vst1_u8(tmp, u8x8); - std::copy(tmp, tmp + count, a); - }*/ }; } }