@@ -31,6 +31,14 @@ namespace xsimd {
31
31
// Assembles a 256-bit integer register from two 128-bit registers:
// `low` is widened to 256 bits and `high` is inserted at 128-bit position 1.
inline __m256i merge_sse(__m128i low, __m128i high) {
    const __m256i widened_low = _mm256_castsi128_si256(low);
    return _mm256_insertf128_si256(widened_low, high, 1);
}
34
+ template <class F >
35
+ __m256i fwd_to_sse (F f, __m256i self) {
36
+ __m128i self_low, self_high;
37
+ split_avx (self, self_low, self_high);
38
+ __m128i res_low = f (self_low);
39
+ __m128i res_high = f (self_high);
40
+ return merge_sse (res_low, res_high);
41
+ }
34
42
template <class F >
35
43
__m256i fwd_to_sse (F f, __m256i self, __m256i other) {
36
44
__m128i self_low, self_high, other_low, other_high;
@@ -63,13 +71,7 @@ namespace xsimd {
63
71
// add
64
72
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
65
73
batch<T, A> add (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
66
- switch (sizeof (T)) {
67
- case 1 : return _mm256_add_epi8 (self, other);
68
- case 2 : return _mm256_add_epi16 (self, other);
69
- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
70
- case 8 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
71
- default : assert (false && " unsupported arch/op combination" ); return {};
72
- }
74
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return add (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
73
75
}
74
76
template <class A > batch<float , A> add (batch<float , A> const & self, batch<float , A> const & other, requires_arch<avx>) {
75
77
return _mm256_add_ps (self, other);
@@ -153,24 +155,17 @@ namespace xsimd {
153
155
// bitwise_lshift
154
156
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
155
157
batch<T, A> bitwise_lshift (batch<T, A> const & self, int32_t other, requires_arch<avx>) {
156
- switch (sizeof (T)) {
157
- case 1 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, _mm256_set1_epi8 (0xFF << other), _mm256_slli_epi32 (self, other));
158
-
159
- case 2 : return _mm256_slli_epi16 (self, other);
160
- case 4 : return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_lshift (batch<T, sse4_2>(s), o, sse4_2{}); },self, other);
161
- case 8 : return _mm256_slli_epi64 (self, other);
162
- default : assert (false && " unsupported arch/op combination" ); return {};
163
- }
158
+ return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_lshift (batch<T, sse4_2>(s), o, sse4_2{}); },self, other);
164
159
}
165
160
166
161
// bitwise_not
167
162
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
168
163
batch<T, A> bitwise_not (batch<T, A> const & self, requires_arch<avx>) {
169
- return _mm256_xor_si256 (self, _mm256_set1_epi32 (- 1 ) );
164
+ return detail::fwd_to_sse ([](__m128i s) { return bitwise_not (batch<T, sse4_2>(s), sse4_2{}); }, self );
170
165
}
171
166
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
172
167
batch_bool<T, A> bitwise_not (batch_bool<T, A> const & self, requires_arch<avx>) {
173
- return _mm256_xor_si256 (self, _mm256_set1_epi32 (- 1 ) );
168
+ return detail::fwd_to_sse ([](__m128i s) { return bitwise_not (batch_bool<T, sse4_2>(s), sse4_2{}); }, self );
174
169
}
175
170
176
171
// bitwise_or
@@ -188,48 +183,17 @@ namespace xsimd {
188
183
}
189
184
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
190
185
batch<T, A> bitwise_or (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
191
- return _mm256_or_si256 ( self, other);
186
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_or (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
192
187
}
193
188
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
194
189
batch_bool<T, A> bitwise_or (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
195
- return _mm256_or_si256 ( self, other);
190
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_or (batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); }, self, other);
196
191
}
197
192
198
193
// bitwise_rshift
199
194
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
200
195
batch<T, A> bitwise_rshift (batch<T, A> const & self, int32_t other, requires_arch<avx>) {
201
- if (std::is_signed<T>::value) {
202
- switch (sizeof (T)) {
203
- case 1 : {
204
- __m256i sign_mask = _mm256_set1_epi16 ((0xFF00 >> other) & 0x00FF );
205
- __m256i cmp_is_negative = _mm256_cmpgt_epi8 (_mm256_setzero_si256 (), self);
206
- __m256i res = _mm256_srai_epi16 (self, other);
207
- return _mm256_or_si256 (
208
- detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, sign_mask, cmp_is_negative),
209
- _mm256_andnot_si256 (sign_mask, res));
210
- }
211
- case 2 : return _mm256_srai_epi16 (self, other);
212
- case 4 : return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_rshift (batch<T, sse4_2>(s), o, sse4_2{}); }, self, other);
213
- case 8 : {
214
- // from https://github.com/samyvilar/vect/blob/master/vect_128.h
215
- return _mm256_or_si256 (
216
- _mm256_srli_epi64 (self, other),
217
- _mm256_slli_epi64 (
218
- detail::fwd_to_sse ([](__m128i s, int32_t o) { return _mm_srai_epi32 (s, o); }, _mm256_shuffle_epi32 (self, _MM_SHUFFLE (3 , 3 , 1 , 1 )), 32 ),
219
- 64 - other));
220
- }
221
- default : assert (false && " unsupported arch/op combination" ); return {};
222
- }
223
- }
224
- else {
225
- switch (sizeof (T)) {
226
- case 1 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, _mm256_set1_epi8 (0xFF >> other), _mm256_srli_epi32 (self, other));
227
- case 2 : return _mm256_srli_epi16 (self, other);
228
- case 4 : return _mm256_srli_epi32 (self, other);
229
- case 8 : return _mm256_srli_epi64 (self, other);
230
- default : assert (false && " unsupported arch/op combination" ); return {};
231
- }
232
- }
196
+ return detail::fwd_to_sse ([](__m128i s, int32_t o) { return bitwise_rshift (batch<T, sse4_2>(s), o, sse4_2{}); }, self, other);
233
197
}
234
198
235
199
// bitwise_xor
@@ -247,8 +211,15 @@ namespace xsimd {
247
211
}
248
212
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
249
213
batch<T, A> bitwise_xor (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
250
- return _mm256_xor_si256 (self, other);
214
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_xor (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },
215
+ self, other);
251
216
}
217
+ template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
218
+ batch<T, A> bitwise_xor (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
219
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_xor (batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2{}); },
220
+ self, other);
221
+ }
222
+
252
223
// bitwise_cast
253
224
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
254
225
batch<float , A> bitwise_cast (batch<T, A> const & self, batch<float , A> const &, requires_arch<avx>) {
@@ -414,20 +385,9 @@ namespace xsimd {
414
385
}
415
386
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
416
387
batch_bool<T, A> eq (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
417
- switch (sizeof (T)) {
418
- case 1 : return _mm256_cmpeq_epi8 (self, other);
419
- case 2 : return _mm256_cmpeq_epi16 (self, other);
420
- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },self, other);
421
- case 8 : {
422
- __m256i tmp1 = detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },self, other);
423
- __m256i tmp2 = _mm256_shuffle_epi32 (tmp1, 0xB1 );
424
- __m256i tmp3 = detail::fwd_to_sse ([](__m128i s, __m128i o) { return bitwise_and (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, tmp1, tmp2);
425
- __m256i tmp4 = detail::fwd_to_sse ([](__m128i s, uint32_t o) { return _mm_srai_epi32 (s, o); }, tmp3, 31 );
426
- return _mm256_shuffle_epi32 (tmp4, 0xF5 );
427
- }
428
- default : assert (false && " unsupported arch/op combination" ); return {};
429
- }
388
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return eq (batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2{}); },self, other);
430
389
}
390
+
431
391
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
432
392
batch_bool<T, A> eq (batch_bool<T, A> const & self, batch_bool<T, A> const & other, requires_arch<avx>) {
433
393
return eq (batch<T, A>(self.data ), batch<T, A>(other.data ));
@@ -868,13 +828,7 @@ namespace xsimd {
868
828
// sub
869
829
template <class A , class T , class =typename std::enable_if<std::is_integral<T>::value, void >::type>
870
830
batch<T, A> sub (batch<T, A> const & self, batch<T, A> const & other, requires_arch<avx>) {
871
- switch (sizeof (T)) {
872
- case 1 : return _mm256_sub_epi8 (self, other);
873
- case 2 : return _mm256_sub_epi16 (self, other);
874
- case 4 : return detail::fwd_to_sse ([](__m128i s, __m128i o) { return sub (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
875
- case 8 : return _mm256_sub_epi64 (self, other);
876
- default : assert (false && " unsupported arch/op combination" ); return {};
877
- }
831
+ return detail::fwd_to_sse ([](__m128i s, __m128i o) { return sub (batch<T, sse4_2>(s), batch<T, sse4_2>(o)); }, self, other);
878
832
}
879
833
template <class A > batch<float , A> sub (batch<float , A> const & self, batch<float , A> const & other, requires_arch<avx>) {
880
834
return _mm256_sub_ps (self, other);
0 commit comments