Skip to content

Commit b73630c

Browse files
sse4.2: added the implementation for mm_cmpestra
1 parent 635a43f commit b73630c

File tree

2 files changed

+1608
-0
lines changed

2 files changed

+1608
-0
lines changed

simde/x86/sse4.2.h

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,227 @@ SIMDE_BEGIN_DECLS_
9292
#define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
9393
#endif
9494

95+
SIMDE_FUNCTION_ATTRIBUTES
96+
int
97+
simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
98+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
99+
const int cmp_op = imm8 & 0x0c;
100+
const int polarity = imm8 & 0x30;
101+
simde__m128i_private
102+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
103+
a_ = simde__m128i_to_private(a),
104+
b_ = simde__m128i_to_private(b);
105+
const int upper_bound = (128 / 8) - 1;
106+
int a_invalid = 0;
107+
int b_invalid = 0;
108+
for(int i = 0 ; i < upper_bound ; i++) {
109+
for(int j = 0; j< upper_bound ; j++){
110+
int bitvalue = ((a_.i8[i] == b_.i8[j]) ? 1 : 0);
111+
if(i == la)
112+
a_invalid = 1;
113+
if(j == lb)
114+
b_invalid = 1;
115+
switch(cmp_op){
116+
case SIMDE_SIDD_CMP_EQUAL_ANY:
117+
case SIMDE_SIDD_CMP_RANGES:
118+
bitvalue = 0;
119+
break;
120+
case SIMDE_SIDD_CMP_EQUAL_EACH:
121+
if(a_invalid && b_invalid)
122+
bitvalue = 1;
123+
else
124+
bitvalue = 0;
125+
break;
126+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
127+
if(a_invalid && !b_invalid)
128+
bitvalue = 1;
129+
else if(a_invalid && b_invalid)
130+
bitvalue = 1;
131+
else
132+
bitvalue = 0;
133+
break;
134+
}
135+
bool_res_.i8[i] |= (bitvalue << j);
136+
}
137+
}
138+
int32_t int_res_1 = 0;
139+
int32_t int_res_2 = 0;
140+
switch(cmp_op) {
141+
case SIMDE_SIDD_CMP_EQUAL_ANY:
142+
for(int i = 0 ; i < upper_bound ; i++){
143+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
144+
for(int j = 0 ; j < upper_bound ; j++){
145+
int_res_1 |= (((bool_res_.i8[i] >> j) & 1) << i);
146+
}
147+
}
148+
break;
149+
case SIMDE_SIDD_CMP_RANGES:
150+
for(int i = 0 ; i < upper_bound ; i++){
151+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
152+
for(int j = 0 ; j < upper_bound ; j++){
153+
int_res_1 |= ((((bool_res_.i8[i] >> j) & 1) & ((bool_res_.i8[i] >> (j + 1)) & 1)) << i);
154+
j += 2;
155+
}
156+
}
157+
break;
158+
case SIMDE_SIDD_CMP_EQUAL_EACH:
159+
for(int i = 0 ; i < upper_bound ; i++){
160+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
161+
for(int j = 0 ; j < upper_bound ; j++){
162+
int_res_1 |= (((bool_res_.i8[i] >> i) & 1) << i);
163+
}
164+
}
165+
break;
166+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
167+
int_res_1 = 0xff;
168+
for(int i = 0 ; i < upper_bound ; i++){
169+
int k = i;
170+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
171+
for(int j = 0 ; j < (upper_bound-i) ; j++){
172+
int_res_1 &= (((bool_res_.i8[k] >> j) & 1 ) << i) ;
173+
k += 1;
174+
}
175+
}
176+
break;
177+
}
178+
for(int i = 0; i < upper_bound ; i++){
179+
if(polarity & 1){
180+
if((polarity >> 1) & 1) {
181+
if (i >= lb) {
182+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
183+
}
184+
else {
185+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
186+
}
187+
}
188+
else{
189+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
190+
}
191+
}
192+
else{
193+
int_res_2 |= ( ((int_res_1 >> i) & 1) << i);
194+
}
195+
}
196+
return !int_res_2 & (lb > upper_bound);
197+
}
198+
199+
SIMDE_FUNCTION_ATTRIBUTES
200+
int
201+
simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
202+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
203+
const int cmp_op = imm8 & 0x0c;
204+
const int polarity = imm8 & 0x30;
205+
simde__m128i_private
206+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
207+
a_ = simde__m128i_to_private(a),
208+
b_ = simde__m128i_to_private(b);
209+
const int upper_bound = (128 / 16) - 1;
210+
int a_invalid = 0;
211+
int b_invalid = 0;
212+
for(int i = 0 ; i < upper_bound ; i++) {
213+
for(int j = 0; j< upper_bound ; j++)
214+
{
215+
int bitvalue = ((a_.i16[i] == b_.i16[j]) ? 1 : 0);
216+
if(i == la)
217+
a_invalid = 1;
218+
if(j == lb)
219+
b_invalid = 1;
220+
switch(cmp_op){
221+
case SIMDE_SIDD_CMP_EQUAL_ANY:
222+
case SIMDE_SIDD_CMP_RANGES:
223+
bitvalue = 0;
224+
break;
225+
case SIMDE_SIDD_CMP_EQUAL_EACH:
226+
if(a_invalid && b_invalid)
227+
bitvalue = 1;
228+
else
229+
bitvalue = 0;
230+
break;
231+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
232+
if(a_invalid && !b_invalid)
233+
bitvalue = 1;
234+
else if(a_invalid && b_invalid)
235+
bitvalue = 1;
236+
else
237+
bitvalue = 0;
238+
break;
239+
}
240+
bool_res_.i16[i] |= (bitvalue << j);
241+
}
242+
}
243+
int32_t int_res_1 = 0;
244+
int32_t int_res_2 = 0;
245+
switch(cmp_op) {
246+
case SIMDE_SIDD_CMP_EQUAL_ANY:
247+
for(int i = 0 ; i < upper_bound ; i++){
248+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
249+
for (int j = 0 ; j < upper_bound ; j++){
250+
int_res_1 |= (((bool_res_.i16[i] >> j) & 1) << i) ;
251+
}
252+
}
253+
break;
254+
case SIMDE_SIDD_CMP_RANGES:
255+
for(int i = 0 ; i < upper_bound ; i++){
256+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
257+
for(int j = 0 ; j < upper_bound ; j++){
258+
int_res_1 |= ((((bool_res_.i16[i] >> j) & 1) & ((bool_res_.i16[i] >> (j + 1)) & 1)) << i);
259+
j += 2;
260+
}
261+
}
262+
break;
263+
case SIMDE_SIDD_CMP_EQUAL_EACH:
264+
for(int i = 0 ; i < upper_bound ; i++){
265+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
266+
for(int j = 0 ; j < upper_bound ; j++){
267+
int_res_1 |= (((bool_res_.i16[i] >> i) & 1) << i);
268+
}
269+
}
270+
break;
271+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
272+
int_res_1 = 0xffff;
273+
for(int i = 0 ; i < upper_bound ; i++){
274+
int k = i;
275+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
276+
for(int j = 0 ; j < (upper_bound-i) ; j++){
277+
int_res_1 &= (((bool_res_.i16[k] >> j) & 1) << i) ;
278+
k += 1;
279+
}
280+
}
281+
break;
282+
}
283+
for(int i = 0; i < upper_bound ; i++){
284+
if(polarity & 1){
285+
if((polarity >> 1) & 1) {
286+
if (i >= lb) {
287+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
288+
}
289+
else {
290+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
291+
}
292+
}
293+
else{
294+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
295+
}
296+
}
297+
else{
298+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
299+
}
300+
}
301+
return !int_res_2 & (lb > upper_bound);
302+
}
303+
304+
/* simde_mm_cmpestra: expands to the native _mm_cmpestra when SSE4.2 is
 * available; otherwise dispatches on the SIMDE_SIDD_UWORD_OPS bit of imm8 to
 * the 16-bit or the 8-bit portable implementation. */
#if !defined(SIMDE_X86_SSE4_2_NATIVE)
  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
    ((((imm8) & SIMDE_SIDD_UWORD_OPS) != 0) \
      ? simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)) \
      : simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)))
#else
  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
#endif
/* Expose the Intel-named alias when native aliases are requested. */
#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
#endif
315+
95316
SIMDE_FUNCTION_ATTRIBUTES
96317
int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
97318
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {

0 commit comments

Comments
 (0)