Skip to content

Commit 50eb9aa

Browse files
sse4.2: added the implementation for mm_cmpestra
1 parent 15a47fc commit 50eb9aa

File tree

2 files changed

+635
-0
lines changed

2 files changed

+635
-0
lines changed

simde/x86/sse4.2.h

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,231 @@ SIMDE_BEGIN_DECLS_
9292
#define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
9393
#endif
9494

95+
SIMDE_FUNCTION_ATTRIBUTES
96+
int
97+
simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
98+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
99+
const int cmp_op = imm8 & 0x06;
100+
const int polarity = imm8 & 0x30;
101+
simde__m128i_private
102+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
103+
a_ = simde__m128i_to_private(a),
104+
b_ = simde__m128i_to_private(b);
105+
const int upper_bound = (128 / 8) - 1;
106+
int a_invalid = 0;
107+
int b_invalid = 0;
108+
for(int i = 0 ; i < upper_bound ; i++) {
109+
for(int j = 0; j< upper_bound ; j++){
110+
int bitvalue = ((a_.i8[i] == b_.i8[j]) ? 1 : 0);
111+
if(i == la)
112+
a_invalid = 1;
113+
if(j == lb)
114+
b_invalid = 1;
115+
switch(cmp_op){
116+
case SIMDE_SIDD_CMP_EQUAL_ANY:
117+
bitvalue = 0;
118+
break;
119+
case SIMDE_SIDD_CMP_RANGES:
120+
bitvalue = 0;
121+
break;
122+
case SIMDE_SIDD_CMP_EQUAL_EACH:
123+
if(a_invalid && b_invalid)
124+
bitvalue = 1;
125+
else
126+
bitvalue = 0;
127+
break;
128+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
129+
if(a_invalid && !b_invalid)
130+
bitvalue = 1;
131+
else if(a_invalid && b_invalid)
132+
bitvalue = 1;
133+
else
134+
bitvalue = 0;
135+
break;
136+
}
137+
bool_res_.i8[i] |= (bitvalue << j);
138+
}
139+
}
140+
int32_t int_res_1 = 0;
141+
int32_t int_res_2 = 0;
142+
switch(cmp_op) {
143+
case SIMDE_SIDD_CMP_EQUAL_ANY:
144+
for(int i = 0 ; i < upper_bound ; i++){
145+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
146+
for(int j = 0 ; j < upper_bound ; j++){
147+
int_res_1 |= (((bool_res_.i8[i] >> j) & 1) << i);
148+
}
149+
}
150+
break;
151+
case SIMDE_SIDD_CMP_RANGES:
152+
for(int i = 0 ; i < upper_bound ; i++){
153+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
154+
for(int j = 0 ; j < upper_bound ; j++){
155+
int_res_1 |= ((((bool_res_.i8[i] >> j) & 1) & ((bool_res_.i8[i] >> (j + 1)) & 1)) << i);
156+
j += 2;
157+
}
158+
}
159+
break;
160+
case SIMDE_SIDD_CMP_EQUAL_EACH:
161+
for(int i = 0 ; i < upper_bound ; i++){
162+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
163+
for(int j = 0 ; j < upper_bound ; j++){
164+
int_res_1 |= (((bool_res_.i8[i] >> i) & 1) << i);
165+
}
166+
}
167+
break;
168+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
169+
int_res_1 = 0xff;
170+
for(int i = 0 ; i < upper_bound ; i++){
171+
int k = i;
172+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
173+
for(int j = 0 ; j < (upper_bound-i) ; j++){
174+
int_res_1 &= (((bool_res_.i8[k] >> j) & 1 ) << i) ;
175+
k += 1;
176+
}
177+
}
178+
break;
179+
}
180+
for(int i = 0; i < upper_bound ; i++){
181+
if(polarity & 1){
182+
if((polarity >> 1) & 1) {
183+
if (i >= lb) {
184+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
185+
}
186+
else {
187+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
188+
}
189+
}
190+
else{
191+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
192+
}
193+
}
194+
else{
195+
int_res_2 |= ( ((int_res_1 >> i) & 1) << i);
196+
}
197+
}
198+
return !int_res_2 & (lb > upper_bound);
199+
}
200+
201+
SIMDE_FUNCTION_ATTRIBUTES
202+
int
203+
simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
204+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
205+
const int cmp_op = imm8 & 0x06;
206+
const int polarity = imm8 & 0x30;
207+
simde__m128i_private
208+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
209+
a_ = simde__m128i_to_private(a),
210+
b_ = simde__m128i_to_private(b);
211+
const int upper_bound = (128 / 16) - 1;
212+
int a_invalid = 0;
213+
int b_invalid = 0;
214+
for(int i = 0 ; i < upper_bound ; i++) {
215+
for(int j = 0; j< upper_bound ; j++)
216+
{
217+
int bitvalue = ((a_.i16[i] == b_.i16[j]) ? 1 : 0);
218+
if(i == la)
219+
a_invalid = 1;
220+
if(j == lb)
221+
b_invalid = 1;
222+
switch(cmp_op){
223+
case SIMDE_SIDD_CMP_EQUAL_ANY:
224+
bitvalue = 0;
225+
break;
226+
case SIMDE_SIDD_CMP_RANGES:
227+
bitvalue = 0;
228+
break;
229+
case SIMDE_SIDD_CMP_EQUAL_EACH:
230+
if(a_invalid && b_invalid)
231+
bitvalue = 1;
232+
else
233+
bitvalue = 0;
234+
break;
235+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
236+
if(a_invalid && !b_invalid)
237+
bitvalue = 1;
238+
else if(a_invalid && b_invalid)
239+
bitvalue = 1;
240+
else
241+
bitvalue = 0;
242+
break;
243+
}
244+
bool_res_.i16[i] |= (bitvalue << j);
245+
}
246+
}
247+
int32_t int_res_1 = 0;
248+
int32_t int_res_2 = 0;
249+
switch(cmp_op) {
250+
case SIMDE_SIDD_CMP_EQUAL_ANY:
251+
for(int i = 0 ; i < upper_bound ; i++){
252+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
253+
for (int j = 0 ; j < upper_bound ; j++){
254+
int_res_1 |= (((bool_res_.i16[i] >> j) & 1) << i) ;
255+
}
256+
}
257+
break;
258+
case SIMDE_SIDD_CMP_RANGES:
259+
for(int i = 0 ; i < upper_bound ; i++){
260+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
261+
for(int j = 0 ; j < upper_bound ; j++){
262+
int_res_1 |= ((((bool_res_.i16[i] >> j) & 1) & ((bool_res_.i16[i] >> (j + 1)) & 1)) << i);
263+
j += 2;
264+
}
265+
}
266+
break;
267+
case SIMDE_SIDD_CMP_EQUAL_EACH:
268+
for(int i = 0 ; i < upper_bound ; i++){
269+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
270+
for(int j = 0 ; j < upper_bound ; j++){
271+
int_res_1 |= (((bool_res_.i16[i] >> i) & 1) << i);
272+
}
273+
}
274+
break;
275+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
276+
int_res_1 = 0xffff;
277+
for(int i = 0 ; i < upper_bound ; i++){
278+
int k = i;
279+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
280+
for(int j = 0 ; j < (upper_bound-i) ; j++){
281+
int_res_1 &= (((bool_res_.i16[k] >> j) & 1) << i) ;
282+
k += 1;
283+
}
284+
}
285+
break;
286+
}
287+
for(int i = 0; i < upper_bound ; i++){
288+
if(polarity & 1){
289+
if((polarity >> 1) & 1) {
290+
if (i >= lb) {
291+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
292+
}
293+
else {
294+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
295+
}
296+
}
297+
else{
298+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
299+
}
300+
}
301+
else{
302+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
303+
}
304+
}
305+
return !int_res_2 & (lb > upper_bound);
306+
}
307+
308+
/* simde_mm_cmpestra: forwards to the native _mm_cmpestra when
 * SIMDE_X86_SSE4_2_NATIVE is defined; otherwise selects the 16-bit or
 * 8-bit portable implementation depending on whether imm8 has the
 * SIMDE_SIDD_UWORD_OPS bit set. */
#if defined(SIMDE_X86_SSE4_2_NATIVE)
  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
#else
  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
    (((imm8) & SIMDE_SIDD_UWORD_OPS) \
      ? simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)) \
      : simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)))
#endif
#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
  /* Drop any earlier macro of the same name so the alias below does not
   * trigger a macro-redefinition diagnostic. */
  #undef _mm_cmpestra
  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
#endif
319+
95320
SIMDE_FUNCTION_ATTRIBUTES
96321
int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
97322
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {

0 commit comments

Comments
 (0)