Skip to content

Commit 6282de3

Browse files
authored
Merge pull request #12 from haskell-works/fix-simd-vector-type
Use correct type __m256i instead of __m256
2 parents 2df95b4 + 1b28b9c commit 6282de3

File tree

2 files changed: 24 additions (+24) and 24 deletions (-24).

2 files changed: 24 additions (+24) and 24 deletions (-24).

cbits/simd_avx2.c

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,15 @@ void avx2_cmpeq8(
3939
#if defined(AVX2_ENABLED)
4040
uint32_t *target32 = (uint32_t *)target;
4141

42-
__m256 v_comparand = _mm256_set1_epi8(byte);
42+
__m256i v_comparand = _mm256_set1_epi8(byte);
4343

4444
uint32_t *out_mask = (uint32_t*)target;
4545

4646
size_t i;
4747

4848
for (i = 0; i < target_length * 2; ++i) {
49-
__m256 v_data_a = *(__m256*)(source + (i * 32));
50-
__m256 v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand);
49+
__m256i v_data_a = *(__m256i *)(source + (i * 32));
50+
__m256i v_results_a = _mm256_cmpeq_epi8(v_data_a, v_comparand);
5151
uint32_t mask = (uint32_t)_mm256_movemask_epi8(v_results_a);
5252
target32[i] = mask;
5353
}
@@ -63,10 +63,10 @@ void avx2_and_bits(
6363
size_t i;
6464

6565
for (i = 0; i < target_length; i += 32) {
66-
__m256 v_data_a = *(__m256*)(source_a + i);
67-
__m256 v_data_b = *(__m256*)(source_b + i);
68-
__m256 v_results = _mm256_and_si256(v_data_a, v_data_b);
69-
*(__m256*)(target + i) = v_results;
66+
__m256i v_data_a = *(__m256i *)(source_a + i);
67+
__m256i v_data_b = *(__m256i *)(source_b + i);
68+
__m256i v_results = _mm256_and_si256(v_data_a, v_data_b);
69+
*(__m256i *)(target + i) = v_results;
7070
}
7171
#endif
7272
}
@@ -80,10 +80,10 @@ void avx2_and_not_bits(
8080
size_t i;
8181

8282
for (i = 0; i < target_length; i += 32) {
83-
__m256 v_data_a = *(__m256*)(source_a + i);
84-
__m256 v_data_b = *(__m256*)(source_b + i);
85-
__m256 v_results = _mm256_andnot_si256(v_data_a, v_data_b);
86-
*(__m256*)(target + i) = v_results;
83+
__m256i v_data_a = *(__m256i *)(source_a + i);
84+
__m256i v_data_b = *(__m256i *)(source_b + i);
85+
__m256i v_results = _mm256_andnot_si256(v_data_a, v_data_b);
86+
*(__m256i *)(target + i) = v_results;
8787
}
8888
#endif
8989
}
@@ -93,14 +93,14 @@ void avx2_not_bits(
9393
size_t target_length,
9494
uint8_t *source) {
9595
#if defined(AVX2_ENABLED)
96-
__m256 ones = _mm256_set1_epi8(0xff);
96+
__m256i ones = _mm256_set1_epi8(0xff);
9797

9898
size_t i;
9999

100100
for (i = 0; i < target_length; i += 32) {
101-
__m256 v_data = *(__m256*)(source + i);
102-
__m256 v_results = _mm256_xor_si256(v_data, ones);
103-
*(__m256*)(target + i) = v_results;
101+
__m256i v_data = *(__m256i *)(source + i);
102+
__m256i v_results = _mm256_xor_si256(v_data, ones);
103+
*(__m256i *)(target + i) = v_results;
104104
}
105105
#endif
106106
}
@@ -114,10 +114,10 @@ void avx2_or_bits(
114114
size_t i;
115115

116116
for (i = 0; i < target_length; i += 32) {
117-
__m256 v_data_a = *(__m256*)(source_a + i);
118-
__m256 v_data_b = *(__m256*)(source_b + i);
119-
__m256 v_results = _mm256_or_si256(v_data_a, v_data_b);
120-
*(__m256*)(target + i) = v_results;
117+
__m256i v_data_a = *(__m256i *)(source_a + i);
118+
__m256i v_data_b = *(__m256i *)(source_b + i);
119+
__m256i v_results = _mm256_or_si256(v_data_a, v_data_b);
120+
*(__m256i *)(target + i) = v_results;
121121
}
122122
#endif
123123
}
@@ -131,10 +131,10 @@ void avx2_xor_bits(
131131
size_t i;
132132

133133
for (i = 0; i < target_length; i += 32) {
134-
__m256 v_data_a = *(__m256*)(source_a + i);
135-
__m256 v_data_b = *(__m256*)(source_b + i);
136-
__m256 v_results = _mm256_xor_si256(v_data_a, v_data_b);
137-
*(__m256*)(target + i) = v_results;
134+
__m256i v_data_a = *(__m256i *)(source_a + i);
135+
__m256i v_data_b = *(__m256i *)(source_b + i);
136+
__m256i v_results = _mm256_xor_si256(v_data_a, v_data_b);
137+
*(__m256i *)(target + i) = v_results;
138138
}
139139
#endif
140140
}

stack.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ packages:
55

66
extra-deps:
77
- hw-hedgehog-0.1.0.2
8-
- hw-prim-0.6.2.12
8+
- hw-prim-0.6.2.14
99
- hw-rankselect-0.12.0.3
1010
- lazy-csv-0.5.1
1111
- sv-0.1

0 commit comments

Comments
 (0)