Skip to content

Commit 8d7f40b

Browse files
sse4.2: added the implementation for mm_cmpestra
1 parent 1cec0b4 commit 8d7f40b

File tree

2 files changed

+272
-0
lines changed

2 files changed

+272
-0
lines changed

simde/x86/sse4.2.h

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,243 @@ SIMDE_BEGIN_DECLS_
9393
#define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
9494
#endif
9595

96+
SIMDE_FUNCTION_ATTRIBUTES
97+
int
98+
simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
99+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
100+
const int cmp_op = imm8 & 0x0c;
101+
const int polarity = imm8 & 0x30;
102+
simde__m128i_private
103+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
104+
a_ = simde__m128i_to_private(a),
105+
b_ = simde__m128i_to_private(b);
106+
const int upper_bound = (128 / 8) - 1;
107+
int a_invalid = 0;
108+
int b_invalid = 0;
109+
for(int i = 0 ; i <= upper_bound ; i++) {
110+
for(int j = 0; j <= upper_bound ; j++){
111+
int bitvalue = ((a_.i8[i] == b_.i8[j]) ? 1 : 0);
112+
if(i == la)
113+
a_invalid = 1;
114+
if(j == lb)
115+
b_invalid = 1;
116+
switch(cmp_op){
117+
case SIMDE_SIDD_CMP_EQUAL_ANY:
118+
case SIMDE_SIDD_CMP_RANGES:
119+
if(!a_invalid && !b_invalid);
120+
else
121+
bitvalue = 0;
122+
break;
123+
case SIMDE_SIDD_CMP_EQUAL_EACH:
124+
if(!a_invalid && !b_invalid);
125+
else if(a_invalid && b_invalid)
126+
bitvalue = 1;
127+
else
128+
bitvalue = 0;
129+
break;
130+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
131+
if(!a_invalid && !b_invalid);
132+
else if(a_invalid && !b_invalid)
133+
bitvalue = 1;
134+
else if(a_invalid && b_invalid)
135+
bitvalue = 1;
136+
else
137+
bitvalue = 0;
138+
break;
139+
}
140+
bool_res_.i8[i] |= (bitvalue << j);
141+
}
142+
}
143+
int32_t int_res_1 = 0;
144+
int32_t int_res_2 = 0;
145+
switch(cmp_op) {
146+
case SIMDE_SIDD_CMP_EQUAL_ANY:
147+
for(int i = 0 ; i <= upper_bound ; i++){
148+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
149+
for(int j = 0 ; j <= upper_bound ; j++){
150+
int_res_1 |= (((bool_res_.i8[i] >> j) & 1) << i);
151+
}
152+
}
153+
break;
154+
case SIMDE_SIDD_CMP_RANGES:
155+
for(int i = 0 ; i <= upper_bound ; i++){
156+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
157+
for(int j = 0 ; j <= upper_bound ; j++){
158+
int_res_1 |= ((((bool_res_.i8[i] >> j) & 1) & ((bool_res_.i8[i] >> (j + 1)) & 1)) << i);
159+
j += 2;
160+
}
161+
}
162+
break;
163+
case SIMDE_SIDD_CMP_EQUAL_EACH:
164+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
165+
for(int i = 0 ; i <= upper_bound ; i++){
166+
int_res_1 |= (((bool_res_.i8[i] >> i) & 1) << i);
167+
}
168+
break;
169+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
170+
int_res_1 = 0xff;
171+
for(int i = 0 ; i <= upper_bound ; i++){
172+
int k = i;
173+
HEDLEY_DIAGNOSTIC_PUSH
174+
#if defined(SIMDE_BUG_CLANG_45959)
175+
#pragma clang diagnostic ignored "-Wsign-conversion"
176+
#endif
177+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
178+
for(int j = 0 ; j <= (upper_bound-i) ; j++){
179+
int_res_1 &= (((bool_res_.i8[k] >> j) & 1 ) << i) ;
180+
k += 1;
181+
}
182+
HEDLEY_DIAGNOSTIC_POP
183+
}
184+
break;
185+
}
186+
for(int i = 0; i <= upper_bound ; i++){
187+
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
188+
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
189+
if (i >= lb) {
190+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
191+
}
192+
else {
193+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
194+
}
195+
}
196+
else{
197+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
198+
}
199+
}
200+
else{
201+
int_res_2 |= ( ((int_res_1 >> i) & 1) << i);
202+
}
203+
}
204+
return !int_res_2 & (lb > upper_bound);
205+
}
206+
207+
SIMDE_FUNCTION_ATTRIBUTES
208+
int
209+
simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
210+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
211+
const int cmp_op = imm8 & 0x0c;
212+
const int polarity = imm8 & 0x30;
213+
simde__m128i_private
214+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
215+
a_ = simde__m128i_to_private(a),
216+
b_ = simde__m128i_to_private(b);
217+
const int upper_bound = (128 / 16) - 1;
218+
int a_invalid = 0;
219+
int b_invalid = 0;
220+
for(int i = 0 ; i <= upper_bound ; i++) {
221+
for(int j = 0; j <= upper_bound ; j++)
222+
{
223+
int bitvalue = ((a_.i16[i] == b_.i16[j]) ? 1 : 0);
224+
a_invalid = 0;
225+
b_invalid = 0;
226+
if(i >= la)
227+
a_invalid = 1;
228+
if(j >= lb)
229+
b_invalid = 1;
230+
switch(cmp_op){
231+
case SIMDE_SIDD_CMP_EQUAL_ANY:
232+
case SIMDE_SIDD_CMP_RANGES:
233+
if(!a_invalid && !b_invalid);
234+
else
235+
bitvalue = 0;
236+
break;
237+
case SIMDE_SIDD_CMP_EQUAL_EACH:
238+
if(!a_invalid && !b_invalid);
239+
else if(a_invalid && b_invalid)
240+
bitvalue = 1;
241+
else
242+
bitvalue = 0;
243+
break;
244+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
245+
if(!a_invalid && !b_invalid);
246+
else if(a_invalid && !b_invalid)
247+
bitvalue = 1;
248+
else if(a_invalid && b_invalid)
249+
bitvalue = 1;
250+
else
251+
bitvalue = 0;
252+
break;
253+
}
254+
bool_res_.i16[i] |= (bitvalue << j);
255+
}
256+
}
257+
int32_t int_res_1 = 0;
258+
int32_t int_res_2 = 0;
259+
switch(cmp_op) {
260+
case SIMDE_SIDD_CMP_EQUAL_ANY:
261+
for(int i = 0 ; i <= upper_bound ; i++){
262+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
263+
for (int j = 0 ; j <= upper_bound ; j++){
264+
int_res_1 |= (((bool_res_.i16[i] >> j) & 1) << i) ;
265+
}
266+
}
267+
break;
268+
case SIMDE_SIDD_CMP_RANGES:
269+
for(int i = 0 ; i <= upper_bound ; i++){
270+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
271+
for(int j = 0 ; j <= upper_bound ; j++){
272+
int_res_1 |= ((((bool_res_.i16[i] >> j) & 1) & ((bool_res_.i16[i] >> (j + 1)) & 1)) << i);
273+
j += 2;
274+
}
275+
}
276+
break;
277+
case SIMDE_SIDD_CMP_EQUAL_EACH:
278+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
279+
for(int i = 0 ; i <= upper_bound ; i++){
280+
int_res_1 |= (((bool_res_.i16[i] >> i) & 1) << i);
281+
}
282+
break;
283+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
284+
int_res_1 = 0xffff;
285+
for(int i = 0 ; i <= upper_bound ; i++){
286+
int k = i;
287+
HEDLEY_DIAGNOSTIC_PUSH
288+
#if defined(SIMDE_BUG_CLANG_45959)
289+
#pragma clang diagnostic ignored "-Wsign-conversion"
290+
#endif
291+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
292+
for(int j = 0 ; j <= (upper_bound-i) ; j++){
293+
int_res_1 &= (((bool_res_.i16[k] >> j) & 1) << i) ;
294+
k += 1;
295+
}
296+
HEDLEY_DIAGNOSTIC_POP
297+
}
298+
break;
299+
}
300+
for(int i = 0; i <= upper_bound ; i++){
301+
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
302+
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
303+
if (i >= lb) {
304+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
305+
}
306+
else {
307+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
308+
}
309+
}
310+
else{
311+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
312+
}
313+
}
314+
else{
315+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
316+
}
317+
}
318+
return !int_res_2 & (lb > upper_bound);
319+
}
320+
321+
/* simde_mm_cmpestra: forward to the native intrinsic when SSE4.2 is
 * available natively; otherwise dispatch on the SIMDE_SIDD_UWORD_OPS bit of
 * imm8 to the 8-bit or the 16-bit portable implementation. */
#if !defined(SIMDE_X86_SSE4_2_NATIVE)
  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
    ((((imm8) & SIMDE_SIDD_UWORD_OPS) == 0) \
      ? simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)) \
      : simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)))
#else
  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
#endif
/* Optionally expose the implementation under the native intrinsic's name. */
#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
#endif
332+
96333
SIMDE_FUNCTION_ATTRIBUTES
97334
int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
98335
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {

test/x86/sse4.2.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,40 @@
2525
#include <test/x86/test-sse2.h>
2626
#include <simde/x86/sse4.2.h>
2727

28+
static MunitResult
29+
test_simde_mm_cmpestra_ranges_8(const MunitParameter params[], void* data) {
30+
(void) params;
31+
(void) data;
32+
33+
const struct {
34+
simde__m128i a;
35+
int la;
36+
simde__m128i b;
37+
int lb;
38+
int r;
39+
} test_vec[] = {
40+
{ simde_mm_set_epi8(INT8_C( 45), INT8_C( -94), INT8_C( 38), INT8_C( -11),
41+
INT8_C( 84), INT8_C(-123), INT8_C( -43), INT8_C( -49),
42+
INT8_C( 25), INT8_C( -55), INT8_C(-121), INT8_C( -6),
43+
INT8_C( 57), INT8_C( 108), INT8_C( -55), INT8_C( 69)),
44+
23 ,
45+
simde_mm_set_epi8(INT8_C( -26), INT8_C( -61), INT8_C( -21), INT8_C( -96),
46+
INT8_C( 48), INT8_C(-112), INT8_C( 95), INT8_C( -56),
47+
INT8_C( 29), INT8_C( -55), INT8_C(-121), INT8_C( -6),
48+
INT8_C( 57), INT8_C( 108), INT8_C( -55), INT8_C( 69)),
49+
28 ,
50+
0 }
51+
};
52+
53+
for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
54+
int r;
55+
r = simde_mm_cmpestra(test_vec[i].a, test_vec[i].la, test_vec[i].b, test_vec[i].lb, 36);
56+
munit_assert_int(r, ==, test_vec[i].r);
57+
}
58+
59+
return MUNIT_OK;
60+
}
61+
2862
static int
2963
test_simde_mm_cmpestrs_8(SIMDE_MUNIT_TEST_ARGS) {
3064
const struct {
@@ -1064,6 +1098,7 @@ test_simde_mm_crc32_u64 (SIMDE_MUNIT_TEST_ARGS) {
10641098
}
10651099

10661100
SIMDE_TEST_FUNC_LIST_BEGIN
1101+
SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestra_ranges_8)
10671102
SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrs_8)
10681103
SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrs_16)
10691104
SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrz_8)

0 commit comments

Comments
 (0)