From a70dfcf9b97eb22f6ccd2625bfc1d37000f6adf4 Mon Sep 17 00:00:00 2001 From: Ting Chou Date: Fri, 16 Aug 2024 15:22:25 +0800 Subject: [PATCH 1/5] Add RISC-V implementation from https://github.com/pattonkan/sse2rvv commit ca0e0b4e. --- common/simd/riscv/emulation.h | 152 ++ common/simd/riscv/sse2rvv.h | 3679 +++++++++++++++++++++++++++++++++ 2 files changed, 3831 insertions(+) create mode 100644 common/simd/riscv/emulation.h create mode 100644 common/simd/riscv/sse2rvv.h diff --git a/common/simd/riscv/emulation.h b/common/simd/riscv/emulation.h new file mode 100644 index 0000000000..bb7418b83e --- /dev/null +++ b/common/simd/riscv/emulation.h @@ -0,0 +1,152 @@ +#pragma once + +#include "sse2rvv.h" + +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 + +enum _mm_hint { + _MM_HINT_NTA = 0, + _MM_HINT_T0 = 1, + _MM_HINT_T1 = 2, + _MM_HINT_T2 = 3, +}; + +__forceinline __m128i _mm_cvtps_epi32(__m128 a) { + return __riscv_vfcvt_x_f_v_i32m1(a, 4); +} + +__forceinline int _mm_cvtsi128_si32(__m128i a) { + return __riscv_vmv_x_s_i32m1_i32(a); +} + +__forceinline float _mm_cvtss_f32 (__m128 a) { + return __riscv_vfmv_f_s_f32m1_f32(a); +} + +__forceinline __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm8) { + vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0, 4); + vbool32_t high = __riscv_vreinterpret_v_i32m1_b32(__riscv_vmv_s_x_i32m1(imm8 >> 4, 1)); + vbool32_t low = __riscv_vreinterpret_v_i32m1_b32(__riscv_vmv_s_x_i32m1(imm8 & 0xf, 1)); + vfloat32m1_t sum = __riscv_vfredusum_vs_f32m1_f32m1_m(high, __riscv_vfmul(a, b, 4), zeros, 4); + return vreinterpretq_f32_m128(__riscv_vrgather_vx_f32m1_mu(low, zeros, sum, 0, 4)); +} + +__forceinline __int64 _mm_cvtsi128_si64 (__m128i a) { + return __riscv_vmv_x_s_i64m1_i64(__riscv_vreinterpret_v_i32m1_i64m1(a)); +} + +__forceinline unsigned int _mm_getcsr(void) { + return 0; +} + +__forceinline void _mm_setcsr(unsigned int a) { + int rm; + + switch (a) { + case _MM_ROUND_TOWARD_ZERO: + // FIXME: I can't find the straightforward mapping of this. 
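+    // Note: vxrm is the fixed-point rounding-mode CSR and only encodes
+    // RNU (0b00), RNE (0b01), RDN (0b10) and ROD (0b11); there is no
+    // toward-zero mode, and floating-point rounding is governed by the
+    // separate frm field of fcsr, so the mapping below is only approximate.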
+    rm = 0b01;
+    break;
+  case _MM_ROUND_DOWN:
+    rm = 0b10;
+    break;
+  case _MM_ROUND_UP:
+    rm = 0b00;
+    break;
+  default: //_MM_ROUND_NEAREST
+    rm = 0b01;
+  }
+
+  asm volatile("csrw vxrm,%0" :: "r"(rm));
+}
+
+__forceinline void _mm_mfence (void) {
+  __sync_synchronize();
+}
+
+__forceinline void _mm_pause (void) {
+  __asm__ __volatile__("fence.i\n\t"
+                       "fence r, r\n\t");
+}
+
+__forceinline void _mm_prefetch (char const* p, int i) {
+  (void)i;
+  __builtin_prefetch(p);
+}
+
+__forceinline __m128 _mm_round_ps(__m128 a, int rounding) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  switch (rounding) {
+  case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(_a, 0, 4), 4);
+  case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+    return _mm_floor_ps(a);
+  case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+    return _mm_ceil_ps(a);
+  case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(_a, 1, 4), 4);
+  default: //_MM_FROUND_CUR_DIRECTION
+    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(_a, 4), 4);
+  }
+}
+
+__forceinline int _mm_popcnt_u32(unsigned int a) {
+  return __builtin_popcount(a);
+}
+
+__forceinline int64_t _mm_popcnt_u64(uint64_t a) {
+  return __builtin_popcountll(a);
+}
+
+__forceinline __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  vfloat32m1_t _c = vreinterpretq_m128_f32(c);
+  return vreinterpretq_f32_m128(__riscv_vfmacc_vv_f32m1(_c, _a, _b, 4));
+}
+
+__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  vfloat32m1_t _c = vreinterpretq_m128_f32(c);
+  return vreinterpretq_f32_m128(__riscv_vfmsac_vv_f32m1(_c, _a, _b, 4));
+}
+
+__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  vfloat32m1_t _c = vreinterpretq_m128_f32(c);
+  return vreinterpretq_f32_m128(__riscv_vfnmsac_vv_f32m1(_c, _a, _b, 4));
+}
+
+__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  vfloat32m1_t _c = vreinterpretq_m128_f32(c);
+  return vreinterpretq_f32_m128(__riscv_vfnmacc_vv_f32m1(_c, _a, _b, 4));
+}
+
+/* Dummy defines for floating point control */
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_DIV_ZERO 0x200
+// #define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_MASK_DENORM 0x100
+#define _MM_SET_EXCEPTION_MASK(x)
+// #define _MM_SET_FLUSH_ZERO_MODE(x)
diff --git a/common/simd/riscv/sse2rvv.h b/common/simd/riscv/sse2rvv.h
new file mode 100644
index 0000000000..ca70f3af7b
--- /dev/null
+++ b/common/simd/riscv/sse2rvv.h
@@ -0,0 +1,3679 @@
+#ifndef SSE2RVV_H
+#define SSE2RVV_H
+
+// This header file provides a simple API translation layer
+// between SSE intrinsics and their corresponding RVV versions
+
+/*
+ * sse2rvv is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* compiler specific definitions */
+#if defined(__GNUC__) || defined(__clang__)
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+#define _sse2rvv_likely(x) __builtin_expect(!!(x), 1)
+#define _sse2rvv_unlikely(x) __builtin_expect(!!(x), 0)
+#else
+#pragma message("Macro name collisions may happen with unsupported compilers.")
+#endif
+
+/* C language does not allow initializing a variable with a function call. */
+#ifdef __cplusplus
+#define _sse2rvv_const static const
+#else
+#define _sse2rvv_const const
+#endif
+
+#include <math.h>
+#include <riscv_vector.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* A few intrinsics accept traditional data types like ints or floats, but
+ * most operate on data types that are specific to SSE.
+ * If a vector type ends in d, it contains doubles, and if it does not have
+ * a suffix, it contains floats. An integer vector type can contain any type
+ * of integer, from chars to shorts to unsigned long longs.
+ */
+typedef vint32m1_t __m64 __attribute__((riscv_rvv_vector_bits(128)));
+typedef vfloat32m1_t __m128 __attribute__((riscv_rvv_vector_bits(128)));
+typedef vfloat64m1_t __m128d __attribute__((riscv_rvv_vector_bits(128)));
+typedef vint32m1_t __m128i __attribute__((riscv_rvv_vector_bits(128)));
+
+// A struct is defined in this header file called 'SIMDVec' which can be used
+// by applications which attempt to access the contents of an __m128 struct
+// directly. It is important to note that accessing the __m128 struct directly
+// is bad coding practice by Microsoft: @see:
+// https://learn.microsoft.com/en-us/cpp/cpp/m128
+//
+// However, some legacy source code may try to access the contents of an __m128
+// struct directly so the developer can use the SIMDVec as an alias for it. Any
+// casting must be done manually by the developer, as you cannot cast or
+// otherwise alias the base RVV data type for intrinsic operations.
+//
+// union intended to allow direct access to an __m128 variable using the names
+// that the MSVC compiler provides. This union should really only be used when
+// trying to access the members of the vector as integer values. GCC/clang
+// allow native access to the float members through a simple array access
+// operator (in C since 4.6, in C++ since 4.8).
+// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. +} SIMDVec; + +#define vreinterpretq_m128_u8(x) \ + __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(x)) +#define vreinterpretq_m128_u16(x) \ + __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(x)) +#define vreinterpretq_m128_u32(x) __riscv_vreinterpret_v_f32m1_u32m1(x) +#define vreinterpretq_m128_u64(x) \ + __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(x)) +#define vreinterpretq_m128_i8(x) \ + __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(x)) +#define vreinterpretq_m128_i16(x) \ + __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(x)) +#define vreinterpretq_m128_i32(x) __riscv_vreinterpret_v_f32m1_i32m1(x) +#define vreinterpretq_m128_i64(x) \ + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(x)) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) \ + __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1( \ + __riscv_vreinterpret_v_f32m1_u32m1(x))) + +#define vreinterpretq_u8_m128(x) \ + __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(x)) +#define vreinterpretq_u16_m128(x) \ + __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(x)) +#define vreinterpretq_u32_m128(x) __riscv_vreinterpret_v_u32m1_f32m1(x) +#define vreinterpretq_u64_m128(x) \ + __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(x)) +#define vreinterpretq_i8_m128(x) \ + __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(x)) +#define vreinterpretq_i16_m128(x) \ + __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(x)) +#define vreinterpretq_i32_m128(x) __riscv_vreinterpret_v_i32m1_f32m1(x) +#define vreinterpretq_i64_m128(x) \ + __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(x)) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) \ + __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1( \ + __riscv_vreinterpret_v_f64m1_u64m1(x))) + +#define vreinterpretq_m128d_u8(x) \ + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(x)) +#define vreinterpretq_m128d_u16(x) \ + __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(x)) +#define vreinterpretq_m128d_u32(x) \ + __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(x)) +#define vreinterpretq_m128d_u64(x) __riscv_vreinterpret_v_f64m1_u64m1(x) +#define vreinterpretq_m128d_i8(x) \ + __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_f64m1_i64m1(x)) 
+#define vreinterpretq_m128d_i16(x) \ + __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_f64m1_i64m1(x)) +#define vreinterpretq_m128d_i32(x) \ + __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x)) +#define vreinterpretq_m128d_i64(x) __riscv_vreinterpret_v_f64m1_i64m1(x) +#define vreinterpretq_m128d_f32(x) \ + __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1( \ + __riscv_vreinterpret_v_f64m1_i64m1(x))) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_u8_m128d(x) \ + __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u8m1_u64m1(x)) +#define vreinterpretq_u16_m128d(x) \ + __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(x)) +#define vreinterpretq_u32_m128d(x) \ + __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(x)) +#define vreinterpretq_u64_m128d(x) __riscv_vreinterpret_v_u64m1_f64m1(x) +#define vreinterpretq_i8_m128d(x) \ + __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i8m1_i64m1(x)) +#define vreinterpretq_i16_m128d(x) \ + __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i16m1_i64m1(x)) +#define vreinterpretq_i32_m128d(x) \ + __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(x)) +#define vreinterpretq_i64_m128d(x) __riscv_vreinterpret_v_i64m1_f64m1(x) +#define vreinterpretq_f32_m128d(x) \ + __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1( \ + __riscv_vreinterpret_v_f32m1_i32m1(x))) +#define vreinterpretq_f64_m128d(x) (x) + +#define vreinterpretq_m128i_u8(x) \ + __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(x)) +#define vreinterpretq_m128i_u16(x) \ + __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(x)) +#define vreinterpretq_m128i_u32(x) __riscv_vreinterpret_v_i32m1_u32m1(x) +#define vreinterpretq_m128i_u64(x) \ + __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(x)) +#define vreinterpretq_m128i_i8(x) __riscv_vreinterpret_v_i32m1_i8m1(x) +#define vreinterpretq_m128i_i16(x) __riscv_vreinterpret_v_i32m1_i16m1(x) +#define vreinterpretq_m128i_i32(x) (x) +#define vreinterpretq_m128i_i64(x) __riscv_vreinterpret_v_i32m1_i64m1(x) +#define vreinterpretq_m128i_f32(x) __riscv_vreinterpret_v_i32m1_f32m1(x) +#define vreinterpretq_m128i_f64(x) \ + __riscv_vreinterpret_v_f32m1_f64m1(__riscv_vreinterpret_v_i32m1_f32m1(x)) + +#define vreinterpretq_u8_m128i(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u8m1_u32m1(x)) +#define vreinterpretq_u16_m128i(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u16m1_u32m1(x)) +#define vreinterpretq_u32_m128i(x) __riscv_vreinterpret_v_u32m1_i32m1(x) +#define vreinterpretq_u64_m128i(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u64m1_u32m1(x)) +#define vreinterpretq_i8_m128i(x) __riscv_vreinterpret_v_i8m1_i32m1(x) +#define vreinterpretq_i16_m128i(x) __riscv_vreinterpret_v_i16m1_i32m1(x) +#define vreinterpretq_i32_m128i(x) (x) +#define vreinterpretq_i64_m128i(x) __riscv_vreinterpret_v_i64m1_i32m1(x) +#define vreinterpretq_f32_m128i(x) __riscv_vreinterpret_v_f32m1_i32m1(x) +#define vreinterpretq_f64_m128i(x) \ + __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x)) + +#define vreinterpretq_m64_u8(x) \ + __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(x)) +#define vreinterpretq_m64_u16(x) \ + __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(x)) +#define vreinterpretq_m64_u32(x) __riscv_vreinterpret_v_i32m1_u32m1(x) +#define 
vreinterpretq_m64_u64(x) \ + __riscv_vreinterpret_v_f64m1_u64m1(__riscv_vreinterpret_v_f32m1_f64m1(x)) +#define vreinterpretq_m64_i8(x) __riscv_vreinterpret_v_i32m1_i8m1(x) +#define vreinterpretq_m64_i16(x) __riscv_vreinterpret_v_i32m1_i16m1(x) +#define vreinterpretq_m64_i32(x) (x) +#define vreinterpretq_m64_i64(x) __riscv_vreinterpret_v_i32m1_i64m1(x) +#define vreinterpretq_m64_f32(x) __riscv_vreinterpret_v_i32m1_f32m1(x) +#define vreinterpretq_m64_f64(x) \ + __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(x)) + +#define vreinterpretq_u8_m64(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u8m1_u32m1(x)) +#define vreinterpretq_u16_m64(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u16m1_u32m1(x)) +#define vreinterpretq_u32_m64(x) __riscv_vreinterpret_v_u32m1_f32m1(x) +#define vreinterpretq_u64_m64(x) \ + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u64m1_u32m1(x)) +#define vreinterpretq_i8_m64(x) __riscv_vreinterpret_v_i8m1_i32m1(x) +#define vreinterpretq_i16_m64(x) __riscv_vreinterpret_v_i16m1_i32m1(x) +#define vreinterpretq_i32_m64(x) (x) +#define vreinterpretq_i64_m64(x) __riscv_vreinterpret_v_i64m1_i32m1(x) +#define vreinterpretq_f32_m64(x) __riscv_vreinterpret_v_f32m1_i32m1(x) +#define vreinterpretq_f64_m64(x) \ + __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(x)) + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +// forward declaration +FORCE_INLINE int _mm_extract_pi16(__m64 a, int imm8); +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b); +FORCE_INLINE __m64 _mm_shuffle_pi16(__m64 a, int imm8); + +/* SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 */ + +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t mask = __riscv_vsra_vx_i16m1(_a, 15, 8); + vint16m1_t a_xor = __riscv_vxor_vv_i16m1(_a, mask, 8); + return vreinterpretq_i16_m128i(__riscv_vsub_vv_i16m1(a_xor, mask, 8)); +} + +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t mask = __riscv_vsra_vx_i32m1(_a, 31, 4); + vint32m1_t a_xor = __riscv_vxor_vv_i32m1(_a, mask, 4); + return vreinterpretq_i32_m128i(__riscv_vsub_vv_i32m1(a_xor, mask, 4)); +} + +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t mask = __riscv_vsra_vx_i8m1(_a, 7, 16); + vint8m1_t a_xor = __riscv_vxor_vv_i8m1(_a, mask, 16); + return vreinterpretq_i8_m128i(__riscv_vsub_vv_i8m1(a_xor, mask, 16)); +} + +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t mask = __riscv_vsra_vx_i16m1(_a, 15, 4); + vint16m1_t a_xor = __riscv_vxor_vv_i16m1(_a, mask, 4); + return vreinterpretq_i16_m128i(__riscv_vsub_vv_i16m1(a_xor, mask, 4)); +} + +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t mask = __riscv_vsra_vx_i32m1(_a, 31, 2); + vint32m1_t a_xor = __riscv_vxor_vv_i32m1(_a, mask, 2); + return vreinterpretq_i32_m128i(__riscv_vsub_vv_i32m1(a_xor, mask, 2)); +} + +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t mask = __riscv_vsra_vx_i8m1(_a, 7, 8); + vint8m1_t a_xor = __riscv_vxor_vv_i8m1(_a, mask, 8); + return 
vreinterpretq_i8_m128i(__riscv_vsub_vv_i8m1(a_xor, mask, 8)); +} + +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vadd_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return __riscv_vadd_vv_i32m1(_a, _b, 4); +} + +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t _b = vreinterpretq_m128i_i64(b); + return vreinterpretq_i64_m128i(__riscv_vadd_vv_i64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vadd_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vfadd_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfadd_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t add = __riscv_vfadd_vv_f64m1(_a, _b, 2); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, add, 0, 1)); +} + +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { + vint64m1_t _a = vreinterpretq_m64_i64(a); + vint64m1_t _b = vreinterpretq_m64_i64(b); + return vreinterpretq_i64_m64(__riscv_vadd_vv_i64m1(_a, _b, 1)); +} + +FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t add = __riscv_vfadd_vv_f32m1(_a, _b, 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, add, 0, 1)); +} + +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vsadd_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vsadd_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i(__riscv_vsaddu_vv_u16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + return vreinterpretq_u8_m128i(__riscv_vsaddu_vv_u8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t add = __riscv_vfadd_vv_f64m1(_a, _b, 2); + vfloat64m1_t sub = __riscv_vfsub_vv_f64m1(_a, _b, 2); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(add, sub, 0, 1)); +} + +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = 
vreinterpretq_m128_f32(b); + vfloat32m1_t add = __riscv_vfadd_vv_f32m1(_a, _b, 4); + vfloat32m1_t sub = __riscv_vfsub_vv_f32m1(_a, _b, 4); + vbool32_t mask = + __riscv_vreinterpret_v_i32m1_b32(__riscv_vmv_s_x_i32m1(0xa, 4)); + return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(sub, add, mask, 4)); +} + +FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm8) { + vuint8m2_t _a = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(a)); + vuint8m2_t _b = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(b)); + vuint8m2_t ab = __riscv_vslideup_vx_u8m2_tu(_b, _a, 16, 32); + return vreinterpretq_u8_m128i(__riscv_vlmul_trunc_v_u8m2_u8m1( + __riscv_vslidedown_vx_u8m2(ab, imm8, 32))); +} + +FORCE_INLINE __m64 _mm_alignr_pi8(__m64 a, __m64 b, int imm8) { + vuint8m1_t _a = vreinterpretq_m64_u8(a); + vuint8m1_t _b = vreinterpretq_m64_u8(b); + vuint8m1_t ab = __riscv_vslideup_vx_u8m1_tu(_b, _a, 8, 16); + return vreinterpretq_u8_m64(__riscv_vslidedown_vx_u8m1(ab, imm8, 16)); +} + +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { + vint64m1_t _a = vreinterpretq_m128d_i64(a); + vint64m1_t _b = vreinterpretq_m128d_i64(b); + return vreinterpretq_i64_m128d(__riscv_vand_vv_i64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + return vreinterpretq_i32_m128(__riscv_vand_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vand_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { + vint64m1_t _a = vreinterpretq_m128d_i64(a); + vint64m1_t _b = vreinterpretq_m128d_i64(b); + return vreinterpretq_i64_m128d( + __riscv_vand_vv_i64m1(__riscv_vnot_v_i64m1(_a, 2), _b, 2)); +} + +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + return vreinterpretq_i32_m128( + __riscv_vand_vv_i32m1(__riscv_vnot_v_i32m1(_a, 4), _b, 4)); +} + +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i( + __riscv_vand_vv_i32m1(__riscv_vnot_v_i32m1(_a, 4), _b, 4)); +} + +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i( + __riscv_vaaddu_vv_u16m1(_a, _b, __RISCV_VXRM_RNU, 8)); +} + +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + return vreinterpretq_u8_m128i( + __riscv_vaaddu_vv_u8m1(_a, _b, __RISCV_VXRM_RNU, 16)); +} + +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { + // FIXME vreinterpretq_m64_u16 would trigger memory error + vint16m1_t __a = __riscv_vreinterpret_v_i32m1_i16m1(a); + vuint16m1_t _a = __riscv_vreinterpret_v_i16m1_u16m1(__a); + vint16m1_t __b = __riscv_vreinterpret_v_i32m1_i16m1(b); + vuint16m1_t _b = __riscv_vreinterpret_v_i16m1_u16m1(__b); + return vreinterpretq_u16_m64( + __riscv_vaaddu_vv_u16m1(_a, _b, __RISCV_VXRM_RNU, 4)); +} + +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { + vuint8m1_t _a = vreinterpretq_m64_u8(a); + vuint8m1_t _b = vreinterpretq_m64_u8(b); + return vreinterpretq_u8_m64( + 
__riscv_vaaddu_vv_u8m1(_a, _b, __RISCV_VXRM_RNU, 8)); +} + +FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, const int imm8) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vbool16_t _imm8 = + __riscv_vreinterpret_v_i8m1_b16(__riscv_vmv_s_x_i8m1(imm8, 8)); + return vreinterpretq_i16_m128i(__riscv_vmerge_vvm_i16m1(_a, _b, _imm8, 8)); +} + +FORCE_INLINE __m128d _mm_blend_pd(__m128d a, __m128d b, const int imm8) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t _imm8 = + __riscv_vreinterpret_v_i8m1_b64(__riscv_vmv_s_x_i8m1(imm8, 2)); + return vreinterpretq_f64_m128d(__riscv_vmerge_vvm_f64m1(_a, _b, _imm8, 2)); +} + +FORCE_INLINE __m128 _mm_blend_ps(__m128 a, __m128 b, const int imm8) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t _imm8 = + __riscv_vreinterpret_v_i8m1_b32(__riscv_vmv_s_x_i8m1(imm8, 4)); + return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(_a, _b, _imm8, 4)); +} + +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i a, __m128i b, __m128i mask) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + vint8m1_t _mask = vreinterpretq_m128i_i8(mask); + vint8m1_t mask_sra = __riscv_vsra_vx_i8m1(_mask, 7, 16); + vbool8_t mask_b8 = __riscv_vmsne_vx_i8m1_b8(mask_sra, 0, 16); + return vreinterpretq_i8_m128i(__riscv_vmerge_vvm_i8m1(_a, _b, mask_b8, 16)); +} + +FORCE_INLINE __m128d _mm_blendv_pd(__m128d a, __m128d b, __m128d mask) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vint64m1_t _mask = vreinterpretq_m128d_i64(mask); + vint64m1_t mask_sra = __riscv_vsra_vx_i64m1(_mask, 63, 2); + vbool64_t mask_b64 = __riscv_vmsne_vx_i64m1_b64(mask_sra, 0, 2); + return vreinterpretq_f64_m128d(__riscv_vmerge_vvm_f64m1(_a, _b, mask_b64, 2)); +} + +FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vint32m1_t _mask = vreinterpretq_m128_i32(mask); + vint32m1_t mask_sra = __riscv_vsra_vx_i32m1(_mask, 31, 4); + vbool32_t mask_b32 = __riscv_vmsne_vx_i32m1_b32(mask_sra, 0, 4); + return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(_a, _b, mask_b32, 4)); +} + +FORCE_INLINE __m128i _mm_bslli_si128(__m128i a, int imm8) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t zeros = __riscv_vmv_v_x_u8m1(0, 16); + return vreinterpretq_u8_m128i( + __riscv_vslideup_vx_u8m1_tu(zeros, _a, imm8 & 0xff, 16)); +} + +FORCE_INLINE __m128i _mm_bsrli_si128(__m128i a, int imm8) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + return vreinterpretq_u8_m128i( + __riscv_vslidedown_vx_u8m1(_a, imm8 & 0xff, 16)); +} + +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1( + __riscv_vreinterpret_v_f64m1_u64m1(a))); +} + +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) { + return __riscv_vreinterpret_v_i64m1_i32m1( + __riscv_vreinterpret_v_f64m1_i64m1(a)); +} + +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { + return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1( + __riscv_vreinterpret_v_f32m1_i32m1(a))); +} + +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { + return __riscv_vreinterpret_v_f32m1_i32m1(a); +} + +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { + return __riscv_vreinterpret_v_i64m1_f64m1( + __riscv_vreinterpret_v_i32m1_i64m1(a)); +} + 
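+// Illustrative note: like their SSE counterparts, the _mm_cast* helpers only
+// reinterpret the raw 128 bits; no value conversion takes place. A minimal
+// sketch (assuming the usual _mm_set1_epi32 helper provided elsewhere in this
+// layer, plus _mm_cvtss_f32 from emulation.h):
+//
+//   __m128i bits = _mm_set1_epi32(0x3f800000);            // bit pattern of 1.0f
+//   float   one  = _mm_cvtss_f32(_mm_castsi128_ps(bits)); // one == 1.0f
+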
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { + return __riscv_vreinterpret_v_i32m1_f32m1(a); +} + +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { + // FIXME riscv round doesn't work + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + double arr[2]; + const int len = 2; + __riscv_vse64_v_f64m1(arr, _a, len); + for (int i = 0; i < len; i++) { + arr[i] = ceil(arr[i]); + } + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(arr, len)); +} + +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { + // FIXME riscv round doesn't work + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + float arr[4]; + const int len = 4; + __riscv_vse32_v_f32m1(arr, _a, len); + for (int i = 0; i < len; i++) { + arr[i] = ceil(arr[i]); + } + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(arr, len)); +} + +FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) { + // FIXME riscv round doesn't work + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + double arr[2]; + const int len = 2; + __riscv_vse64_v_f64m1(arr, _b, len); + arr[0] = ceil(arr[0]); + vfloat64m1_t _arr = __riscv_vle64_v_f64m1(arr, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, _arr, 0, 1)); +} + +FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { + // FIXME riscv round doesn't work + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + float arr[4]; + const int len = 4; + __riscv_vse32_v_f32m1(arr, _b, len); + arr[0] = ceil(arr[0]); + vfloat32m1_t _arr = __riscv_vle32_v_f32m1(arr, 1); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, _arr, 0, 1)); +} + +// FORCE_INLINE void _mm_clflush (void const* p) {} + +FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vbool16_t cmp_res = __riscv_vmseq_vv_i16m1_b16(_a, _b, 8); + return vreinterpretq_i16_m128i( + __riscv_vmerge_vxm_i16m1(__riscv_vmv_v_x_i16m1(0x0, 8), -1, cmp_res, 8)); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vbool32_t cmp_res = __riscv_vmseq_vv_i32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128i( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t _b = vreinterpretq_m128i_i64(b); + vbool64_t cmp_res = __riscv_vmseq_vv_i64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128i( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + vbool8_t cmp_res = __riscv_vmseq_vv_i8m1_b8(_a, _b, 16); + return vreinterpretq_i8_m128i( + __riscv_vmerge_vxm_i8m1(__riscv_vmv_v_x_i8m1(0x0, 16), -1, cmp_res, 16)); +} + +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(_a, _b, 4); + 
return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +// FORCE_INLINE int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +// FORCE_INLINE int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +// FORCE_INLINE int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +// FORCE_INLINE __m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, +// const int imm8) {} + +// FORCE_INLINE int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +// FORCE_INLINE int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +// FORCE_INLINE int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const +// int imm8) {} + +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vbool16_t cmp_res = __riscv_vmsgt_vv_i16m1_b16(_a, _b, 8); + return vreinterpretq_i16_m128i( + __riscv_vmerge_vxm_i16m1(__riscv_vmv_v_x_i16m1(0x0, 8), -1, 
cmp_res, 8)); +} + +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vbool32_t cmp_res = __riscv_vmsgt_vv_i32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128i( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t _b = vreinterpretq_m128i_i64(b); + vbool64_t cmp_res = __riscv_vmsgt_vv_i64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128i( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + vbool8_t cmp_res = __riscv_vmsgt_vv_i8m1_b8(_a, _b, 16); + return vreinterpretq_i8_m128i( + __riscv_vmerge_vxm_i8m1(__riscv_vmv_v_x_i8m1(0x0, 16), -1, cmp_res, 16)); +} + +FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +// FORCE_INLINE int _mm_cmpistra (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE int _mm_cmpistrc (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE int _mm_cmpistri (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE __m128i _mm_cmpistrm (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE int _mm_cmpistro (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE int _mm_cmpistrs (__m128i a, __m128i b, const int imm8) {} + +// FORCE_INLINE int _mm_cmpistrz (__m128i a, __m128i b, const int imm8) {} + +FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfle_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = 
vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfle_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfle_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfle_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vbool16_t cmp_res = __riscv_vmslt_vv_i16m1_b16(_a, _b, 8); + return vreinterpretq_i16_m128i( + __riscv_vmerge_vxm_i16m1(__riscv_vmv_v_x_i16m1(0x0, 8), -1, cmp_res, 8)); +} + +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vbool32_t cmp_res = __riscv_vmslt_vv_i32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128i( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + vbool8_t cmp_res = __riscv_vmslt_vv_i8m1_b8(_a, _b, 16); + return vreinterpretq_i8_m128i( + __riscv_vmerge_vxm_i8m1(__riscv_vmv_v_x_i8m1(0x0, 16), -1, cmp_res, 16)); +} + +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmflt_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmflt_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmflt_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmflt_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return 
vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfne_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfne_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfne_vv_f64m1_b64(_a, _b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, cmp_res, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfne_vv_f32m1_b32(_a, _b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d(__riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(_a, _b, 2); + vint64m1_t merge = __riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 1); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1( + __riscv_vreinterpret_v_f64m1_i64m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(_a, _b, 4); + vint32m1_t merge = __riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1( + __riscv_vreinterpret_v_f32m1_i32m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d(__riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, 
cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(_a, _b, 2); + vint64m1_t merge = __riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 1); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1( + __riscv_vreinterpret_v_f64m1_i64m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(_a, _b, 4); + vint32m1_t merge = __riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1( + __riscv_vreinterpret_v_f32m1_i32m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfle_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d(__riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfle_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmfle_vv_f64m1_b64(_a, _b, 2); + vint64m1_t merge = __riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 1); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1( + __riscv_vreinterpret_v_f64m1_i64m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmfle_vv_f32m1_b32(_a, _b, 4); + vint32m1_t merge = __riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1( + __riscv_vreinterpret_v_f32m1_i32m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmflt_vv_f64m1_b64(_a, _b, 2); + return vreinterpretq_i64_m128d(__riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 2)); +} + +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmflt_vv_f32m1_b32(_a, _b, 4); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4)); +} + +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = 
vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t cmp_res = __riscv_vmflt_vv_f64m1_b64(_a, _b, 2); + vint64m1_t merge = __riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0x0, cmp_res, 1); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1( + __riscv_vreinterpret_v_f64m1_i64m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t cmp_res = __riscv_vmflt_vv_f32m1_b32(_a, _b, 4); + vint32m1_t merge = __riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0x0, cmp_res, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1( + __riscv_vreinterpret_v_f32m1_i32m1(_a), merge, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t not_nan_a = __riscv_vmfeq_vv_f64m1_b64(_a, _a, 2); + vbool64_t not_nan_b = __riscv_vmfeq_vv_f64m1_b64(_b, _b, 2); + vbool64_t non_nan = __riscv_vmand_mm_b64(not_nan_a, not_nan_b, 2); + return vreinterpretq_i64_m128d( + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, non_nan, 2)); +} + +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(_a, _a, 4); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(_b, _b, 4); + vbool32_t non_nan = __riscv_vmand_mm_b32(not_nan_a, not_nan_b, 4); + return vreinterpretq_i32_m128( + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, non_nan, 4)); +} + +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t not_nan_a = __riscv_vmfeq_vv_f64m1_b64(_a, _a, 2); + vbool64_t not_nan_b = __riscv_vmfeq_vv_f64m1_b64(_b, _b, 2); + vbool64_t non_nan = __riscv_vmand_mm_b64(not_nan_a, not_nan_b, 2); + vint64m1_t cmp_res_i64 = + __riscv_vmerge_vxm_i64m1(__riscv_vmv_v_x_i64m1(0x0, 2), -1, non_nan, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(_a, _a, 4); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(_b, _b, 4); + vbool32_t non_nan = __riscv_vmand_mm_b32(not_nan_a, not_nan_b, 4); + vint32m1_t cmp_res_i32 = + __riscv_vmerge_vxm_i32m1(__riscv_vmv_v_x_i32m1(0x0, 4), -1, non_nan, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t not_nan_a = __riscv_vmfeq_vv_f64m1_b64(_a, _a, 2); + vbool64_t not_nan_b = __riscv_vmfeq_vv_f64m1_b64(_b, _b, 2); + vbool64_t non_nan = __riscv_vmand_mm_b64(not_nan_a, not_nan_b, 2); + return vreinterpretq_i64_m128d(__riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0, non_nan, 2)); +} + +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t not_nan_a = 
__riscv_vmfeq_vv_f32m1_b32(_a, _a, 4); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(_b, _b, 4); + vbool32_t non_nan = __riscv_vmand_mm_b32(not_nan_a, not_nan_b, 4); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0, non_nan, 4)); +} + +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vbool64_t not_nan_a = __riscv_vmfeq_vv_f64m1_b64(_a, _a, 2); + vbool64_t not_nan_b = __riscv_vmfeq_vv_f64m1_b64(_b, _b, 2); + vbool64_t non_nan = __riscv_vmand_mm_b64(not_nan_a, not_nan_b, 2); + vint64m1_t cmp_res_i64 = __riscv_vmerge_vxm_i64m1( + __riscv_vmv_v_x_i64m1(UINT64_MAX, 2), 0, non_nan, 2); + return vreinterpretq_i64_m128d(__riscv_vslideup_vx_i64m1_tu( + __riscv_vreinterpret_v_f64m1_i64m1(_a), cmp_res_i64, 0, 1)); +} + +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(_a, _a, 4); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(_b, _b, 4); + vbool32_t non_nan = __riscv_vmand_mm_b32(not_nan_a, not_nan_b, 4); + vint32m1_t cmp_res_i32 = __riscv_vmerge_vxm_i32m1( + __riscv_vmv_v_x_i32m1(UINT32_MAX, 4), 0, non_nan, 4); + return vreinterpretq_i32_m128(__riscv_vslideup_vx_i32m1_tu( + __riscv_vreinterpret_v_f32m1_i32m1(_a), cmp_res_i32, 0, 1)); +} + +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmpeq_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmpeq_ss(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmpge_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmpge_ss(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmpgt_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmpgt_ss(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmple_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmple_ss(_a, _b))) & + 0x1; +} + 
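+// Note on the _mm_comi*_ss/_sd helpers: each one reuses the corresponding
+// full-width _mm_cmp*_ss/_sd compare, reinterprets the result as integers,
+// and masks the low element with 0x1, turning the all-ones (-1) "true" lane
+// into the 0/1 boolean result that the x86 intrinsics return.
+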
+FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmplt_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmplt_ss(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vreinterpret_v_f64m1_i64m1(_mm_cmpneq_sd(_a, _b))) & + 0x1; +} + +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return __riscv_vmv_x_s_i32m1_i32( + __riscv_vreinterpret_v_f32m1_i32m1(_mm_cmpneq_ss(_a, _b))) & + 0x1; +} + +FORCE_INLINE unsigned int _mm_crc32_u8(unsigned int crc, unsigned char v) { + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } + return crc; +} + +FORCE_INLINE unsigned int _mm_crc32_u16(unsigned int crc, unsigned short v) { + crc = _mm_crc32_u8(crc, v & UINT8_MAX); + crc = _mm_crc32_u8(crc, (v >> 8) & UINT8_MAX); + return crc; +} + +FORCE_INLINE unsigned int _mm_crc32_u32(unsigned int crc, unsigned int v) { + crc = _mm_crc32_u16(crc, v & UINT16_MAX); + crc = _mm_crc32_u16(crc, (v >> 16) & UINT16_MAX); + return crc; +} + +FORCE_INLINE __int64 _mm_crc32_u64(__int64 crc, __int64 v) { + crc = _mm_crc32_u32((uint32_t)(crc), v & UINT32_MAX); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & UINT32_MAX); + return crc; +} + +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + vfloat32m1_t cvt = __riscv_vfcvt_f_x_v_f32m1(_b, 2); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, cvt, 0, 2)); +} + +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + return vreinterpretq_i32_m64(__riscv_vfcvt_x_f_v_i32m1(_a, 2)); +} + +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = __riscv_vfmv_s_f_f32m1(b, 1); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, _b, 0, 1)); +} + +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vint32m1_t a_i32 = __riscv_vfcvt_x_f_v_i32m1_rm(_a, __RISCV_FRM_RNE, 1); + return (int)(__riscv_vmv_x_s_i32m1_i32(a_i32)); +} + +FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + return vreinterpretq_i32_m128i( + __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(_a, 4))); +} + +FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint32m1_t a_ext = + __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(_a, 4)); + return vreinterpretq_i64_m128i( + __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(a_ext, 2))); +} + +FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + return vreinterpretq_i64_m128i( + __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(_a, 2))); +} + +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { + vint32m1_t _a = 
vreinterpretq_m128i_i32(a); + vint64m1_t a_ext = + __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(_a, 2)); + return vreinterpretq_f64_m128d(__riscv_vfcvt_f_x_v_f64m1(a_ext, 2)); +} + +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + return vreinterpretq_f32_m128(__riscv_vfcvt_f_x_v_f32m1(_a, 4)); +} + +FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + return vreinterpretq_i16_m128i( + __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8))); +} + +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint16m1_t a_ext = + __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8)); + return vreinterpretq_i32_m128i( + __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(a_ext, 4))); +} + +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint16m1_t a_ext1 = + __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8)); + vint32m1_t a_ext2 = + __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(a_ext1, 4)); + return vreinterpretq_i64_m128i( + __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(a_ext2, 2))); +} + +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + return vreinterpretq_u32_m128i( + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(_a, 4))); +} + +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint32m1_t a_ext = + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(_a, 4)); + return vreinterpretq_u64_m128i( + __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(a_ext, 2))); +} + +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + return vreinterpretq_u64_m128i( + __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(_a, 2))); +} + +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + return vreinterpretq_u16_m128i( + __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8))); +} + +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint16m1_t a_ext = + __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8)); + return vreinterpretq_u32_m128i( + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(a_ext, 4))); +} + +FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint16m1_t a_ext1 = + __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8)); + vuint32m1_t a_ext2 = + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(a_ext1, 4)); + return vreinterpretq_u64_m128i( + __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(a_ext2, 2))); +} + +// FORCE_INLINE __m128i _mm_cvtpd_epi32 (__m128d a) {} + +// FORCE_INLINE __m64 _mm_cvtpd_pi32 (__m128d a) {} + +// FORCE_INLINE __m128 _mm_cvtpd_ps (__m128d a) {} + +// FORCE_INLINE __m128 _mm_cvtpi16_ps (__m64 a) {} + +// FORCE_INLINE __m128d _mm_cvtpi32_pd (__m64 a) {} + +// FORCE_INLINE __m128 _mm_cvtpi32_ps (__m128 a, __m64 b) {} + +// FORCE_INLINE __m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b) {} + +// FORCE_INLINE __m128 _mm_cvtpi8_ps (__m64 a) {} + +// FORCE_INLINE __m128i _mm_cvtps_epi32 (__m128 a) {} + +// FORCE_INLINE __m128d _mm_cvtps_pd (__m128 a) {} + +// FORCE_INLINE __m64 _mm_cvtps_pi16 (__m128 a) {} + +// FORCE_INLINE __m64 _mm_cvtps_pi32 
(__m128 a) {} + +// FORCE_INLINE __m64 _mm_cvtps_pi8 (__m128 a) {} + +// FORCE_INLINE __m128 _mm_cvtpu16_ps (__m64 a) {} + +// FORCE_INLINE __m128 _mm_cvtpu8_ps (__m64 a) {} + +// FORCE_INLINE double _mm_cvtsd_f64 (__m128d a) {} + +// FORCE_INLINE int _mm_cvtsd_si32 (__m128d a) {} + +// FORCE_INLINE __int64 _mm_cvtsd_si64 (__m128d a) {} + +// FORCE_INLINE __int64 _mm_cvtsd_si64x (__m128d a) {} + +// FORCE_INLINE __m128 _mm_cvtsd_ss (__m128 a, __m128d b) {} + +// FORCE_INLINE int _mm_cvtsi128_si32 (__m128i a) {} + +// FORCE_INLINE __int64 _mm_cvtsi128_si64 (__m128i a) {} + +// FORCE_INLINE __int64 _mm_cvtsi128_si64x (__m128i a) {} + +// FORCE_INLINE __m128d _mm_cvtsi32_sd (__m128d a, int b) {} + +// FORCE_INLINE __m128i _mm_cvtsi32_si128 (int a) {} + +// FORCE_INLINE __m128 _mm_cvtsi32_ss (__m128 a, int b) {} + +// FORCE_INLINE __m128d _mm_cvtsi64_sd (__m128d a, __int64 b) {} + +// FORCE_INLINE __m128i _mm_cvtsi64_si128 (__int64 a) {} + +// FORCE_INLINE __m128 _mm_cvtsi64_ss (__m128 a, __int64 b) {} + +// FORCE_INLINE __m128d _mm_cvtsi64x_sd (__m128d a, __int64 b) {} + +// FORCE_INLINE __m128i _mm_cvtsi64x_si128 (__int64 a) {} + +// FORCE_INLINE float _mm_cvtss_f32 (__m128 a) {} + +// FORCE_INLINE __m128d _mm_cvtss_sd (__m128d a, __m128 b) {} + +// FORCE_INLINE int _mm_cvtss_si32 (__m128 a) {} + +// FORCE_INLINE __int64 _mm_cvtss_si64 (__m128 a) {} + +// FORCE_INLINE __m64 _mm_cvtt_ps2pi (__m128 a) {} + +// FORCE_INLINE int _mm_cvtt_ss2si (__m128 a) {} + +// FORCE_INLINE __m128i _mm_cvttpd_epi32 (__m128d a) {} + +// FORCE_INLINE __m64 _mm_cvttpd_pi32 (__m128d a) {} + +// FORCE_INLINE __m128i _mm_cvttps_epi32 (__m128 a) {} + +// FORCE_INLINE __m64 _mm_cvttps_pi32 (__m128 a) {} + +// FORCE_INLINE int _mm_cvttsd_si32 (__m128d a) {} + +// FORCE_INLINE __int64 _mm_cvttsd_si64 (__m128d a) {} + +// FORCE_INLINE __int64 _mm_cvttsd_si64x (__m128d a) {} + +// FORCE_INLINE int _mm_cvttss_si32 (__m128 a) {} + +// FORCE_INLINE __int64 _mm_cvttss_si64 (__m128 a) {} + +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vfdiv_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfdiv_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t div = __riscv_vfdiv_vv_f64m1(_a, _b, 2); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, div, 0, 1)); +} + +FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t div = __riscv_vfdiv_vv_f32m1(_a, _b, 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, div, 0, 1)); +} + +// FORCE_INLINE __m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8) {} + +// FORCE_INLINE __m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8) {} + +FORCE_INLINE int _mm_extract_epi16(__m128i a, int imm8) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t a_s = __riscv_vslidedown_vx_i16m1(_a, imm8 & 0x7, 8); + return (int)__riscv_vmv_x_s_i16m1_i16(a_s) & UINT16_MAX; +} + +FORCE_INLINE int _mm_extract_epi32(__m128i a, const int imm8) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t a_s = 
__riscv_vslidedown_vx_i32m1(_a, imm8 & 0x3, 4); + return (int)__riscv_vmv_x_s_i32m1_i32(a_s); +} + +FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, const int imm8) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t a_s = __riscv_vslidedown_vx_i64m1(_a, imm8 & 0x1, 2); + return (__int64)__riscv_vmv_x_s_i64m1_i64(a_s); +} + +FORCE_INLINE int _mm_extract_epi8(__m128i a, const int imm8) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t a_s = __riscv_vslidedown_vx_i8m1(_a, imm8 & 0xf, 16); + return (int)__riscv_vmv_x_s_i8m1_i8(a_s) & UINT8_MAX; +} + +FORCE_INLINE int _mm_extract_pi16(__m64 a, int imm8) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t a_s = __riscv_vslidedown_vx_i16m1(_a, imm8 & 0x3, 8); + return (int)__riscv_vmv_x_s_i16m1_i16(a_s) & UINT16_MAX; +} + +FORCE_INLINE int _mm_extract_ps(__m128 a, const int imm8) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t a_s = __riscv_vslidedown_vx_i32m1(_a, imm8 & 0x3, 4); + return (int)__riscv_vmv_x_s_i32m1_i32(a_s); +} + +FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { + // FIXME riscv round doesn't work + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + double arr[2]; + const int len = 2; + __riscv_vse64_v_f64m1(arr, _a, len); + for (int i = 0; i < len; i++) { + arr[i] = floor(arr[i]); + } + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(arr, len)); +} + +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { + // FIXME riscv round doesn't work + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + float arr[4]; + const int len = 4; + __riscv_vse32_v_f32m1(arr, _a, len); + for (int i = 0; i < len; i++) { + arr[i] = floor(arr[i]); + } + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(arr, len)); +} + +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) { + // FIXME riscv round doesn't work + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + double arr[2]; + const int len = 2; + __riscv_vse64_v_f64m1(arr, _b, len); + arr[0] = floor(arr[0]); + vfloat64m1_t _arr = __riscv_vle64_v_f64m1(arr, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, _arr, 0, 1)); +} + +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { + // FIXME riscv round doesn't work + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + float arr[4]; + const int len = 4; + __riscv_vse32_v_f32m1(arr, _b, len); + arr[0] = floor(arr[0]); + vfloat32m1_t _arr = __riscv_vle32_v_f32m1(arr, 1); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, _arr, 0, 1)); +} + +FORCE_INLINE void _mm_free(void *mem_addr) { free(mem_addr); } + +// FORCE_INLINE unsigned int _MM_GET_FLUSH_ZERO_MODE () {} + +// FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE () {} + +// FORCE_INLINE unsigned int _mm_getcsr (void) {} + +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2_tu(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_add = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vadd_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_add, 0, 8)); +} + +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i a, __m128i b) { + vint32m2_t _a = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(a)); + vint32m2_t _b = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(b)); + 
vint32m2_t ab = __riscv_vslideup_vx_i32m2_tu(_a, _b, 4, 8); + vint32m2_t ab_s = __riscv_vslidedown_vx_i32m2(ab, 1, 8); + vint64m2_t ab_add = + __riscv_vreinterpret_v_i32m2_i64m2(__riscv_vadd_vv_i32m2(ab, ab_s, 8)); + return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(ab_add, 0, 4)); +} + +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { + vfloat64m2_t _a = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(a)); + vfloat64m2_t _b = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(b)); + vfloat64m2_t ab = __riscv_vslideup_vx_f64m2_tu(_a, _b, 2, 4); + vfloat64m2_t ab_s = __riscv_vslidedown_vx_f64m2(ab, 1, 4); + vfloat64m2_t ab_add = __riscv_vfadd_vv_f64m2(ab, ab_s, 4); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_s_x_u8m1(85, 2)); + return vreinterpretq_f64_m128d(__riscv_vlmul_trunc_v_f64m2_f64m1( + __riscv_vcompress_vm_f64m2(ab_add, mask, 4))); +} + +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1_tu(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_add = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vadd_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_add, 0, 4))); +} + +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + vint32m1_t ab = __riscv_vslideup_vx_i32m1_tu(_a, _b, 2, 4); + vint32m1_t ab_s = __riscv_vslidedown_vx_i32m1(ab, 1, 4); + vint64m1_t ab_add = + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vadd_vv_i32m1(ab, ab_s, 4)); + return vreinterpretq_i32_m64( + __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vnsra_wx_i32mf2(ab_add, 0, 2))); +} + +FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { + vfloat32m2_t _a = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(a)); + vfloat32m2_t _b = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(b)); + vfloat32m2_t ab = __riscv_vslideup_vx_f32m2_tu(_a, _b, 4, 8); + vfloat32m2_t ab_s = __riscv_vslidedown_vx_f32m2(ab, 1, 8); + vint64m2_t ab_add = __riscv_vreinterpret_v_i32m2_i64m2( + __riscv_vreinterpret_v_f32m2_i32m2(__riscv_vfadd_vv_f32m2(ab, ab_s, 8))); + return vreinterpretq_i32_m128(__riscv_vnsra_wx_i32m1(ab_add, 0, 4)); +} + +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2_tu(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_add = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vsadd_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_add, 0, 8)); +} + +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1_tu(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_add = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vsadd_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_add, 0, 4))); +} + +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = 
__riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2_tu(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_sub = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vsub_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_sub, 0, 8)); +} + +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i a, __m128i b) { + vint32m2_t _a = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(a)); + vint32m2_t _b = __riscv_vlmul_ext_v_i32m1_i32m2(vreinterpretq_m128i_i32(b)); + vint32m2_t ab = __riscv_vslideup_vx_i32m2_tu(_a, _b, 4, 8); + vint32m2_t ab_s = __riscv_vslidedown_vx_i32m2(ab, 1, 8); + vint64m2_t ab_sub = + __riscv_vreinterpret_v_i32m2_i64m2(__riscv_vsub_vv_i32m2(ab, ab_s, 8)); + return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(ab_sub, 0, 4)); +} + +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) { + vfloat64m2_t _a = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(a)); + vfloat64m2_t _b = __riscv_vlmul_ext_v_f64m1_f64m2(vreinterpretq_m128d_f64(b)); + vfloat64m2_t ab = __riscv_vslideup_vx_f64m2_tu(_a, _b, 2, 4); + vfloat64m2_t ab_s = __riscv_vslidedown_vx_f64m2(ab, 1, 4); + vfloat64m2_t ab_sub = __riscv_vfsub_vv_f64m2(ab, ab_s, 4); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_s_x_u8m1(85, 2)); + return vreinterpretq_f64_m128d(__riscv_vlmul_trunc_v_f64m2_f64m1( + __riscv_vcompress_vm_f64m2(ab_sub, mask, 4))); +} + +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1_tu(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_sub = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vsub_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_sub, 0, 4))); +} + +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 a, __m64 b) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + vint32m1_t ab = __riscv_vslideup_vx_i32m1_tu(_a, _b, 2, 4); + vint32m1_t ab_s = __riscv_vslidedown_vx_i32m1(ab, 1, 4); + vint64m1_t ab_sub = + __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vsub_vv_i32m1(ab, ab_s, 4)); + return vreinterpretq_i32_m64( + __riscv_vlmul_ext_v_i32mf2_i32m1(__riscv_vnsra_wx_i32mf2(ab_sub, 0, 2))); +} + +FORCE_INLINE __m128 _mm_hsub_ps(__m128 a, __m128 b) { + vfloat32m2_t _a = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(a)); + vfloat32m2_t _b = __riscv_vlmul_ext_v_f32m1_f32m2(vreinterpretq_m128_f32(b)); + vfloat32m2_t ab = __riscv_vslideup_vx_f32m2_tu(_a, _b, 4, 8); + vfloat32m2_t ab_s = __riscv_vslidedown_vx_f32m2(ab, 1, 8); + vint64m2_t ab_sub = __riscv_vreinterpret_v_i32m2_i64m2( + __riscv_vreinterpret_v_f32m2_i32m2(__riscv_vfsub_vv_f32m2(ab, ab_s, 8))); + return vreinterpretq_i32_m128(__riscv_vnsra_wx_i32m1(ab_sub, 0, 4)); +} + +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i a, __m128i b) { + vint16m2_t _a = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(a)); + vint16m2_t _b = __riscv_vlmul_ext_v_i16m1_i16m2(vreinterpretq_m128i_i16(b)); + vint16m2_t ab = __riscv_vslideup_vx_i16m2_tu(_a, _b, 8, 16); + vint16m2_t ab_s = __riscv_vslidedown_vx_i16m2(ab, 1, 16); + vint32m2_t ab_sub = + __riscv_vreinterpret_v_i16m2_i32m2(__riscv_vssub_vv_i16m2(ab, ab_s, 16)); + return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_sub, 0, 8)); +} + +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 a, __m64 b) { + 
vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint16m1_t ab = __riscv_vslideup_vx_i16m1_tu(_a, _b, 4, 8); + vint16m1_t ab_s = __riscv_vslidedown_vx_i16m1(ab, 1, 8); + vint32m1_t ab_sub = + __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vssub_vv_i16m1(ab, ab_s, 8)); + return vreinterpretq_i16_m64( + __riscv_vlmul_ext_v_i16mf2_i16m1(__riscv_vnsra_wx_i16mf2(ab_sub, 0, 4))); +} + +FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int i, int imm8) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16( + __riscv_vmv_s_x_u8m1(((uint8_t)(1 << (imm8 & 0x7))), 8)); + return vreinterpretq_i16_m128i(__riscv_vmerge_vxm_i16m1(_a, i, mask, 8)); +} + +FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int i, const int imm8) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32( + __riscv_vmv_s_x_u8m1(((uint8_t)(1 << (imm8 & 0x3))), 4)); + return vreinterpretq_i32_m128i(__riscv_vmerge_vxm_i32m1(_a, i, mask, 4)); +} + +FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 i, const int imm8) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64( + __riscv_vmv_s_x_u8m1(((uint8_t)(1 << (imm8 & 0x1))), 2)); + return vreinterpretq_i64_m128i(__riscv_vmerge_vxm_i64m1(_a, i, mask, 2)); +} + +FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int i, const int imm8) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vbool8_t mask = __riscv_vreinterpret_v_u16m1_b8( + __riscv_vmv_s_x_u16m1(((uint16_t)(1 << (imm8 & 0xf))), 16)); + return vreinterpretq_i8_m128i(__riscv_vmerge_vxm_i8m1(_a, i, mask, 16)); +} + +FORCE_INLINE __m64 _mm_insert_pi16(__m64 a, int i, int imm8) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16( + __riscv_vmv_s_x_u8m1(((uint8_t)(1 << imm8)), 8)); + return vreinterpretq_i16_m64(__riscv_vmerge_vxm_i16m1(_a, i, mask, 8)); +} + +FORCE_INLINE __m128 _mm_insert_ps(__m128 a, __m128 b, const int imm8) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + vint32m1_t tmp = + __riscv_vrgather_vx_i32m1(_b, (((uint8_t)imm8) >> 6) & 0x3, 4); + vbool32_t mask1 = __riscv_vreinterpret_v_u32m1_b32( + __riscv_vmv_s_x_u32m1((1 << ((imm8 >> 4) & 0x3)), 4)); + vint32m1_t tmp2 = __riscv_vmerge_vvm_i32m1(_a, tmp, mask1, 4); + vbool32_t mask2 = + __riscv_vreinterpret_v_u32m1_b32(__riscv_vmv_s_x_u32m1(imm8 & 0xf, 4)); + return vreinterpretq_i32_m128(__riscv_vmerge_vxm_i32m1(tmp2, 0, mask2, 4)); +} + +FORCE_INLINE __m128i _mm_lddqu_si128(__m128i const *mem_addr) { + return vreinterpretq_i32_m128i( + __riscv_vle32_v_i32m1((int32_t const *)mem_addr, 4)); +} + +// FORCE_INLINE void _mm_lfence (void) {} + +FORCE_INLINE __m128d _mm_load_pd(double const *mem_addr) { + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(mem_addr, 2)); +} + +FORCE_INLINE __m128d _mm_load_pd1(double const *mem_addr) { + return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(mem_addr[0], 2)); +} + +FORCE_INLINE __m128 _mm_load_ps(float const *mem_addr) { + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(mem_addr, 4)); +} + +FORCE_INLINE __m128 _mm_load_ps1(float const *mem_addr) { + return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(mem_addr[0], 4)); +} + +FORCE_INLINE __m128d _mm_load_sd(double const *mem_addr) { + vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1); + vfloat64m1_t zeros = __riscv_vfmv_v_f_f64m1(0, 2); + return vreinterpretq_f64_m128d( + __riscv_vslideup_vx_f64m1_tu(zeros, 
addr, 0, 1)); +} + +FORCE_INLINE __m128i _mm_load_si128(__m128i const *mem_addr) { + return vreinterpretq_f64_m128i( + __riscv_vle64_v_f64m1((double const *)mem_addr, 2)); +} + +FORCE_INLINE __m128 _mm_load_ss(float const *mem_addr) { + vfloat32m1_t addr = __riscv_vle32_v_f32m1(mem_addr, 1); + vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0, 4); + return vreinterpretq_f32_m128( + __riscv_vslideup_vx_f32m1_tu(zeros, addr, 0, 1)); +} + +FORCE_INLINE __m128d _mm_load1_pd(double const *mem_addr) { + return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(mem_addr[0], 2)); +} + +FORCE_INLINE __m128 _mm_load1_ps(float const *mem_addr) { + return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(mem_addr[0], 4)); +} + +FORCE_INLINE __m128d _mm_loaddup_pd(double const *mem_addr) { + return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(mem_addr[0], 2)); +} + +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, double const *mem_addr) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, addr, 1, 2)); +} + +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *mem_addr) { + vint64m1_t _a = vreinterpretq_m128_i64(a); + vint64m1_t addr = vreinterpretq_m64_i64(*mem_addr); + return vreinterpretq_i64_m128(__riscv_vslideup_vx_i64m1_tu(_a, addr, 1, 2)); +} + +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *mem_addr) { + vint64m1_t addr = vreinterpretq_m128i_i64(*mem_addr); + vint64m1_t zeros = __riscv_vmv_v_x_i64m1(0, 2); + return vreinterpretq_i64_m128i( + __riscv_vslideup_vx_i64m1_tu(addr, zeros, 1, 2)); +} + +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, double const *mem_addr) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, addr, 0, 1)); +} + +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *mem_addr) { + vint64m1_t _a = vreinterpretq_m128_i64(a); + vint64m1_t addr = vreinterpretq_m64_i64(*mem_addr); + return vreinterpretq_i64_m128(__riscv_vslideup_vx_i64m1_tu(_a, addr, 0, 1)); +} + +FORCE_INLINE __m128d _mm_loadr_pd(double const *mem_addr) { + vfloat64m1_t addr = __riscv_vle64_v_f64m1(mem_addr, 2); + vfloat64m1_t addr_high = __riscv_vslidedown_vx_f64m1(addr, 1, 2); + return vreinterpretq_f64_m128d( + __riscv_vslideup_vx_f64m1_tu(addr_high, addr, 1, 2)); +} + +FORCE_INLINE __m128 _mm_loadr_ps(float const *mem_addr) { + vuint32m1_t addr = __riscv_vle32_v_u32m1((uint32_t const *)mem_addr, 4); + vuint32m1_t vid = __riscv_vid_v_u32m1(4); + vuint32m1_t vid_rev = __riscv_vrsub_vx_u32m1(vid, 3, 4); + return vreinterpretq_u32_m128(__riscv_vrgather_vv_u32m1(addr, vid_rev, 4)); +} + +FORCE_INLINE __m128d _mm_loadu_pd(double const *mem_addr) { + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(mem_addr, 2)); +} + +FORCE_INLINE __m128 _mm_loadu_ps(float const *mem_addr) { + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(mem_addr, 4)); +} + +FORCE_INLINE __m128i _mm_loadu_si128(__m128i const *mem_addr) { + return vreinterpretq_i32_m128i( + __riscv_vle32_v_i32m1((int32_t const *)mem_addr, 4)); +} + +FORCE_INLINE __m128i _mm_loadu_si16(void const *mem_addr) { + vint16m1_t ld = __riscv_vle16_v_i16m1((int16_t const *)mem_addr, 1); + vint16m1_t zeros = __riscv_vmv_v_x_i16m1(0, 8); + vbool16_t mask = __riscv_vreinterpret_v_u8m1_b16(__riscv_vmv_v_x_u8m1(1, 8)); + return vreinterpretq_i16_m128i(__riscv_vmerge_vvm_i16m1(zeros, ld, mask, 8)); +} + +FORCE_INLINE __m128i 
_mm_loadu_si32(void const *mem_addr) {
+  vint32m1_t ld = __riscv_vle32_v_i32m1((int32_t const *)mem_addr, 1);
+  vint32m1_t zeros = __riscv_vmv_v_x_i32m1(0, 4);
+  vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1(1, 4));
+  return vreinterpretq_i32_m128i(__riscv_vmerge_vvm_i32m1(zeros, ld, mask, 4));
+}
+
+FORCE_INLINE __m128i _mm_loadu_si64(void const *mem_addr) {
+  vint64m1_t ld = __riscv_vle64_v_i64m1((int64_t const *)mem_addr, 1);
+  vint64m1_t zeros = __riscv_vmv_v_x_i64m1(0, 2);
+  return vreinterpretq_i64_m128i(__riscv_vslideup_vx_i64m1_tu(zeros, ld, 0, 1));
+}
+
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) {
+  vint16m1_t _a = vreinterpretq_m128i_i16(a);
+  vint16m1_t _b = vreinterpretq_m128i_i16(b);
+  vint32m2_t wmul = __riscv_vwmul_vv_i32m2(_a, _b, 8);
+  vint32m2_t wmul_s = __riscv_vslidedown_vx_i32m2(wmul, 1, 8);
+  vint32m2_t wmul_add = __riscv_vadd_vv_i32m2(wmul, wmul_s, 8);
+  return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(
+      __riscv_vreinterpret_v_i32m2_i64m2(wmul_add), 0, 4));
+}
+
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i a, __m128i b) {
+  vint16m2_t _a = __riscv_vreinterpret_v_u16m2_i16m2(
+      __riscv_vzext_vf2_u16m2(vreinterpretq_m128i_u8(a), 16));
+  vint16m2_t _b = __riscv_vsext_vf2_i16m2(vreinterpretq_m128i_i8(b), 16);
+  vint16m2_t mul = __riscv_vmul_vv_i16m2(_a, _b, 16);
+  vint16m2_t mul_s = __riscv_vslidedown_vx_i16m2(mul, 1, 16);
+  vint32m4_t mul_add = __riscv_vwadd_vv_i32m4(mul, mul_s, 16);
+  vint16m2_t sat = __riscv_vnclip_wx_i16m2(mul_add, 0, __RISCV_VXRM_RDN, 16);
+  return vreinterpretq_i16_m128i(
+      __riscv_vnsra_wx_i16m1(__riscv_vreinterpret_v_i16m2_i32m2(sat), 0, 16));
+}
+
+FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 a, __m64 b) {
+  vint16m2_t _a = __riscv_vreinterpret_v_u16m2_i16m2(
+      __riscv_vzext_vf2_u16m2(vreinterpretq_m64_u8(a), 8));
+  vint16m2_t _b = __riscv_vsext_vf2_i16m2(vreinterpretq_m64_i8(b), 8);
+  vint16m2_t mul = __riscv_vmul_vv_i16m2(_a, _b, 8);
+  vint16m2_t mul_s = __riscv_vslidedown_vx_i16m2(mul, 1, 8);
+  vint32m4_t mul_add = __riscv_vwadd_vv_i32m4(mul, mul_s, 8);
+  vint16m2_t sat = __riscv_vnclip_wx_i16m2(mul_add, 0, __RISCV_VXRM_RDN, 8);
+  return vreinterpretq_i16_m64(
+      __riscv_vnsra_wx_i16m1(__riscv_vreinterpret_v_i16m2_i32m2(sat), 0, 8));
+}
+
+FORCE_INLINE void *_mm_malloc(size_t size, size_t align) {
+  void *ptr;
+  if (align == 1) {
+    return malloc(size);
+  }
+  if (align == 2 || (sizeof(void *) == 8 && align == 4)) {
+    align = sizeof(void *);
+  }
+  // aligned_alloc requires size to be a multiple of align, so round it up
+  // (alignments passed to _mm_malloc are powers of two).
+  size = (size + align - 1) & ~(align - 1);
+  ptr = aligned_alloc(align, size);
+  if (ptr) {
+    return ptr;
+  }
+  return NULL;
+}
+
+FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) {
+  vint8m1_t _a = vreinterpretq_m64_i8(a);
+  vint8m1_t _mask = vreinterpretq_m64_i8(mask);
+  vbool8_t lt_mask = __riscv_vmslt_vx_i8m1_b8(_mask, 0, 8);
+  __riscv_vse8_v_i8m1_m(lt_mask, (int8_t *)mem_addr, _a, 8);
+}
+
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) {
+  vint8m1_t _a = vreinterpretq_m128i_i8(a);
+  vint8m1_t _mask = vreinterpretq_m128i_i8(mask);
+  vbool8_t lt_mask = __riscv_vmslt_vx_i8m1_b8(_mask, 0, 16);
+  __riscv_vse8_v_i8m1_m(lt_mask, (int8_t *)mem_addr, _a, 16);
+}
+
+FORCE_INLINE void _m_maskmovq(__m64 a, __m64 mask, char *mem_addr) {
+  return _mm_maskmove_si64(a, mask, mem_addr);
+}
+
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) {
+  vint16m1_t _a = vreinterpretq_m128i_i16(a);
+  vint16m1_t _b = vreinterpretq_m128i_i16(b);
+  return vreinterpretq_i16_m128i(__riscv_vmax_vv_i16m1(_a, _b, 8));
+}
+
+FORCE_INLINE __m128i
_mm_max_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vmax_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vmax_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i(__riscv_vmaxu_vv_u16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + vuint32m1_t _b = vreinterpretq_m128i_u32(b); + return vreinterpretq_u32_m128i(__riscv_vmaxu_vv_u32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + return vreinterpretq_u8_m128i(__riscv_vmaxu_vv_u8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vfmax_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + return vreinterpretq_i16_m64(__riscv_vmax_vv_i16m1(_a, _b, 4)); +} + +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfmax_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { + vuint8m1_t _a = vreinterpretq_m64_u8(a); + vuint8m1_t _b = vreinterpretq_m64_u8(b); + return vreinterpretq_u8_m64(__riscv_vmaxu_vv_u8m1(_a, _b, 8)); +} + +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t max = __riscv_vfmax_vv_f64m1(_a, _b, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, max, 0, 1)); +} + +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t max = __riscv_vfmax_vv_f32m1(_a, _b, 1); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, max, 0, 1)); +} + +// FORCE_INLINE void _mm_mfence (void) {} + +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vmin_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vmin_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vmin_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i(__riscv_vminu_vv_u16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) { + 
vuint32m1_t _a = vreinterpretq_m128i_u32(a); + vuint32m1_t _b = vreinterpretq_m128i_u32(b); + return vreinterpretq_u32_m128i(__riscv_vminu_vv_u32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + return vreinterpretq_u8_m128i(__riscv_vminu_vv_u8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vfmin_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + return vreinterpretq_i16_m64(__riscv_vmin_vv_i16m1(_a, _b, 4)); +} + +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfmin_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { + vuint8m1_t _a = vreinterpretq_m64_u8(a); + vuint8m1_t _b = vreinterpretq_m64_u8(b); + return vreinterpretq_u8_m64(__riscv_vminu_vv_u8m1(_a, _b, 8)); +} + +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t min = __riscv_vfmin_vv_f64m1(_a, _b, 1); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, min, 0, 1)); +} + +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t min = __riscv_vfmin_vv_f32m1(_a, _b, 1); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, min, 0, 1)); +} + +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { + // TODO add macro for ignoring index + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t a_min = __riscv_vredminu_vs_u16m1_u16m1(_a, _a, 8); + vuint16m1_t a_min_dup = __riscv_vrgather_vx_u16m1(a_min, 0, 8); + vuint16m1_t vid = __riscv_vid_v_u16m1(8); + vbool16_t eq_mask = __riscv_vmseq_vv_u16m1_b16(_a, a_min_dup, 8); + vuint16m1_t min_vids = __riscv_vmerge_vvm_u16m1( + __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), vid, eq_mask, 8); + // FIXME sth wrong with __riscv_vredminu_vs_u16m1_u16m1_m() + vuint16m1_t min_vid = __riscv_vredminu_vs_u16m1_u16m1(min_vids, min_vids, 8); + vuint16m1_t min_index = + __riscv_vslideup_vx_u16m1_tu(a_min_dup, min_vid, 1, 2); + vuint16m1_t zeros = __riscv_vmv_v_x_u16m1(0, 8); + return vreinterpretq_u16_m128i( + __riscv_vslideup_vx_u16m1_tu(zeros, min_index, 0, 2)); +} + +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + vuint64m1_t zeros = __riscv_vmv_v_x_u64m1(0, 2); + return vreinterpretq_u64_m128i(__riscv_vslideup_vx_u64m1_tu(zeros, _a, 0, 1)); +} + +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, _b, 0, 1)); +} + +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, _b, 0, 1)); +} + +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + return 
vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, _a, 1, 2));
+}
+
+FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) {
+  // TODO optimize with vmacc
+  vuint64m1_t _a = vreinterpretq_m128_u64(a);
+  vuint64m1_t a_low = __riscv_vsrl_vx_u64m1(_a, 32, 2);
+  vuint64m1_t a_high = __riscv_vsll_vx_u64m1(a_low, 32, 2);
+  return vreinterpretq_u64_m128(__riscv_vor_vv_u64m1(a_high, a_low, 2));
+}
+
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b) {
+  vfloat64m1_t _a = vreinterpretq_m128_f64(a);
+  vfloat64m1_t _b = vreinterpretq_m128_f64(b);
+  vfloat64m1_t b_s = __riscv_vslidedown_vx_f64m1(_b, 1, 2);
+  return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1_tu(_a, b_s, 0, 1));
+}
+
+FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) {
+  // TODO optimize with vmacc
+  vuint64m1_t _a = vreinterpretq_m128_u64(a);
+  vuint64m1_t a_high = __riscv_vsll_vx_u64m1(_a, 32, 2);
+  vuint64m1_t a_low = __riscv_vsrl_vx_u64m1(a_high, 32, 2);
+  return vreinterpretq_u64_m128(__riscv_vor_vv_u64m1(a_high, a_low, 2));
+}
+
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 a, __m128 b) {
+  vfloat64m1_t _a = vreinterpretq_m128_f64(a);
+  vfloat64m1_t _b = vreinterpretq_m128_f64(b);
+  return vreinterpretq_f64_m128(__riscv_vslideup_vx_f64m1_tu(_a, _b, 1, 2));
+}
+
+FORCE_INLINE int _mm_movemask_epi8(__m128i a) {
+  vint8m1_t _a = vreinterpretq_m128i_i8(a);
+  vuint16m1_t nonzeros =
+      __riscv_vreinterpret_v_b8_u16m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 16));
+  return (int)__riscv_vmv_x_s_u16m1_u16(nonzeros);
+}
+
+FORCE_INLINE int _mm_movemask_pd(__m128d a) {
+  vint64m1_t _a = vreinterpretq_m128d_i64(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b64_u8m1(__riscv_vmslt_vx_i64m1_b64(_a, 0, 2));
+  return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0x3);
+}
+
+FORCE_INLINE int _mm_movemask_pi8(__m64 a) {
+  vint8m1_t _a = vreinterpretq_m64_i8(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b8_u8m1(__riscv_vmslt_vx_i8m1_b8(_a, 0, 8));
+  return (int)__riscv_vmv_x_s_u8m1_u8(nonzeros);
+}
+
+FORCE_INLINE int _mm_movemask_ps(__m128 a) {
+  vint32m1_t _a = vreinterpretq_m128_i32(a);
+  vuint8m1_t nonzeros =
+      __riscv_vreinterpret_v_b32_u8m1(__riscv_vmslt_vx_i32m1_b32(_a, 0, 4));
+  return (int)(__riscv_vmv_x_s_u8m1_u8(nonzeros) & 0xf);
+}
+
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) {
+  return vreinterpretq_i32_m64(vreinterpretq_m128i_i32(a));
+}
+
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) {
+  vint32m1_t _a = vreinterpretq_m64_i32(a);
+  vint32m1_t zeros = __riscv_vmv_v_x_i32m1(0, 4);
+  return vreinterpretq_i32_m128i(__riscv_vslideup_vx_i32m1_tu(zeros, _a, 0, 2));
+}
+
+// FORCE_INLINE __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
+// {}
+
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) {
+  vint64m1_t _a = vreinterpretq_m128i_i64(a);
+  vint64m1_t _b = vreinterpretq_m128i_i64(b);
+  vint32mf2_t a_srl = __riscv_vnsra_wx_i32mf2(_a, 0, 2);
+  vint32mf2_t b_srl = __riscv_vnsra_wx_i32mf2(_b, 0, 2);
+  return vreinterpretq_i64_m128i(__riscv_vwmul_vv_i64m1(a_srl, b_srl, 2));
+}
+
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) {
+  vuint64m1_t _a = vreinterpretq_m128i_u64(a);
+  vuint64m1_t _b = vreinterpretq_m128i_u64(b);
+  vuint32mf2_t a_srl = __riscv_vnsrl_wx_u32mf2(_a, 0, 2);
+  vuint32mf2_t b_srl = __riscv_vnsrl_wx_u32mf2(_b, 0, 2);
+  return vreinterpretq_u64_m128i(__riscv_vwmulu_vv_u64m1(a_srl, b_srl, 2));
+}
+
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) {
+  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
+  vfloat64m1_t _b = vreinterpretq_m128d_f64(b);
+  return
vreinterpretq_f64_m128d(__riscv_vfmul_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfmul_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t mul = __riscv_vfmul_vv_f64m1(_a, _b, 2); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, mul, 0, 1)); +} + +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(_a, _b, 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, mul, 0, 1)); +} + +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { + vuint32mf2_t _a = + __riscv_vlmul_trunc_v_u32m1_u32mf2(vreinterpretq_m64_u32(a)); + vuint32mf2_t _b = + __riscv_vlmul_trunc_v_u32m1_u32mf2(vreinterpretq_m64_u32(b)); + return vreinterpretq_u64_m64(__riscv_vwmulu_vv_u64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vmulh_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i(__riscv_vmulhu_vv_u16m1(_a, _b, 8)); +} + +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { + vuint16m1_t _a = vreinterpretq_m64_u16(a); + vuint16m1_t _b = vreinterpretq_m64_u16(b); + return vreinterpretq_u16_m128i(__riscv_vmulhu_vv_u16m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8); + vint32m2_t sra = __riscv_vsra_vx_i32m2(ab_mul, 14, 8); + return vreinterpretq_i16_m128i( + __riscv_vnclip_wx_i16m1(sra, 1, __RISCV_VXRM_RNU, 8)); +} + +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8); + vint32m2_t sra = __riscv_vsra_vx_i32m2(ab_mul, 14, 8); + return vreinterpretq_i16_m64( + __riscv_vnclip_wx_i16m1(sra, 1, __RISCV_VXRM_RNU, 8)); +} + +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vmul_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vmul_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) { + vint64m1_t _a = vreinterpretq_m128d_i64(a); + vint64m1_t _b = vreinterpretq_m128d_i64(b); + return vreinterpretq_i64_m128d(__riscv_vor_vv_i64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + return vreinterpretq_i32_m128(__riscv_vor_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + 
vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vor_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vint8m1_t a_sat = __riscv_vlmul_ext_v_i8mf2_i8m1( + __riscv_vnclip_wx_i8mf2(_a, 0, __RISCV_VXRM_RDN, 8)); + vint8m1_t b_sat = __riscv_vlmul_ext_v_i8mf2_i8m1( + __riscv_vnclip_wx_i8mf2(_b, 0, __RISCV_VXRM_RDN, 8)); + return vreinterpretq_i8_m128i( + __riscv_vslideup_vx_i8m1_tu(a_sat, b_sat, 8, 16)); +} + +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vint16m1_t a_sat = __riscv_vlmul_ext_v_i16mf2_i16m1( + __riscv_vnclip_wx_i16mf2(_a, 0, __RISCV_VXRM_RDN, 4)); + vint16m1_t b_sat = __riscv_vlmul_ext_v_i16mf2_i16m1( + __riscv_vnclip_wx_i16mf2(_b, 0, __RISCV_VXRM_RDN, 4)); + return vreinterpretq_i16_m128i( + __riscv_vslideup_vx_i16m1_tu(a_sat, b_sat, 4, 8)); +} + +FORCE_INLINE __m128i _mm_packus_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + vbool16_t a_neg_mask = __riscv_vmslt_vx_i16m1_b16(_a, 0, 8); + vbool16_t b_neg_mask = __riscv_vmslt_vx_i16m1_b16(_b, 0, 8); + vuint16m1_t a_unsigned = __riscv_vreinterpret_v_i16m1_u16m1( + __riscv_vmerge_vxm_i16m1(_a, 0, a_neg_mask, 8)); + vuint16m1_t b_unsigned = __riscv_vreinterpret_v_i16m1_u16m1( + __riscv_vmerge_vxm_i16m1(_b, 0, b_neg_mask, 8)); + vuint8m1_t a_sat = __riscv_vlmul_ext_v_u8mf2_u8m1( + __riscv_vnclipu_wx_u8mf2(a_unsigned, 0, __RISCV_VXRM_RDN, 8)); + vuint8m1_t b_sat = __riscv_vlmul_ext_v_u8mf2_u8m1( + __riscv_vnclipu_wx_u8mf2(b_unsigned, 0, __RISCV_VXRM_RDN, 8)); + return vreinterpretq_u8_m128i( + __riscv_vslideup_vx_u8m1_tu(a_sat, b_sat, 8, 16)); +} + +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vbool32_t a_neg_mask = __riscv_vmslt_vx_i32m1_b32(_a, 0, 4); + vbool32_t b_neg_mask = __riscv_vmslt_vx_i32m1_b32(_b, 0, 4); + vuint32m1_t a_unsigned = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmerge_vxm_i32m1(_a, 0, a_neg_mask, 4)); + vuint32m1_t b_unsigned = __riscv_vreinterpret_v_i32m1_u32m1( + __riscv_vmerge_vxm_i32m1(_b, 0, b_neg_mask, 4)); + vuint16m1_t a_sat = __riscv_vlmul_ext_v_u16mf2_u16m1( + __riscv_vnclipu_wx_u16mf2(a_unsigned, 0, __RISCV_VXRM_RDN, 4)); + vuint16m1_t b_sat = __riscv_vlmul_ext_v_u16mf2_u16m1( + __riscv_vnclipu_wx_u16mf2(b_unsigned, 0, __RISCV_VXRM_RDN, 4)); + return vreinterpretq_u16_m128i( + __riscv_vslideup_vx_u16m1_tu(a_sat, b_sat, 4, 8)); +} + +// FORCE_INLINE void _mm_pause (void) {} + +FORCE_INLINE __m64 _m_pavgb(__m64 a, __m64 b) { return _mm_avg_pu8(a, b); } + +FORCE_INLINE __m64 _m_pavgw(__m64 a, __m64 b) { return _mm_avg_pu16(a, b); } + +FORCE_INLINE int _m_pextrw(__m64 a, int imm8) { + return _mm_extract_pi16(a, imm8); +} + +FORCE_INLINE __m64 _m_pinsrw(__m64 a, int i, int imm8) { + return _mm_insert_pi16(a, i, imm8); +} + +FORCE_INLINE __m64 _m_pmaxsw(__m64 a, __m64 b) { return _mm_max_pi16(a, b); } + +FORCE_INLINE __m64 _m_pmaxub(__m64 a, __m64 b) { return _mm_max_pu8(a, b); } + +FORCE_INLINE __m64 _m_pminsw(__m64 a, __m64 b) { return _mm_min_pi16(a, b); } + +FORCE_INLINE __m64 _m_pminub(__m64 a, __m64 b) { return _mm_min_pu8(a, b); } + +FORCE_INLINE int _m_pmovmskb(__m64 a) { return _mm_movemask_pi8(a); } + +FORCE_INLINE __m64 _m_pmulhuw(__m64 
a, __m64 b) { return _mm_mulhi_pu16(a, b); } + +// FORCE_INLINE void _mm_prefetch (char const* p, int i) {} + +FORCE_INLINE __m64 _m_psadbw(__m64 a, __m64 b) { return _mm_sad_pu8(a, b); } + +FORCE_INLINE __m64 _m_pshufw(__m64 a, int imm8) { + return _mm_shuffle_pi16(a, imm8); +} + +FORCE_INLINE __m128 _mm_rcp_ps(__m128 a) { + // TODO add high precision mode + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + return vreinterpretq_f32_m128(__riscv_vfrec7_v_f32m1(_a, 4)); +} + +FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { + // TODO add high precision mode + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t recip = __riscv_vfrec7_v_f32m1(_a, 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, recip, 0, 1)); +} + +// FORCE_INLINE __m128d _mm_round_pd (__m128d a, int rounding) {} + +// FORCE_INLINE __m128 _mm_round_ps (__m128 a, int rounding) {} + +// FORCE_INLINE __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) {} + +// FORCE_INLINE __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) {} + +FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 a) { + // TODO add high precision mode + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + return vreinterpretq_f32_m128(__riscv_vfrsqrt7_v_f32m1(_a, 4)); +} + +FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 a) { + // TODO add high precision mode + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t sqrt = __riscv_vfrsqrt7_v_f32m1(_a, 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, sqrt, 0, 1)); +} + +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + vuint8m1_t max = __riscv_vmaxu_vv_u8m1(_a, _b, 16); + vuint8m1_t min = __riscv_vminu_vv_u8m1(_a, _b, 16); + vuint8m1_t diff = __riscv_vsub_vv_u8m1(max, min, 16); + vuint8m1_t zeros = __riscv_vmv_v_x_u8m1(0, 16); + vuint8m1_t high = __riscv_vslidedown_vx_u8m1(diff, 8, 16); + vuint16m1_t redsum_low = __riscv_vwredsumu_vs_u8m1_u16m1( + diff, __riscv_vreinterpret_v_u8m1_u16m1(zeros), 8); + vuint16m1_t redsum_high = __riscv_vwredsumu_vs_u8m1_u16m1( + high, __riscv_vreinterpret_v_u8m1_u16m1(zeros), 16); + return vreinterpretq_u16_m128i( + __riscv_vslideup_vx_u16m1_tu(redsum_low, redsum_high, 4, 8)); +} + +FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { + vuint8m1_t _a = vreinterpretq_m64_u8(a); + vuint8m1_t _b = vreinterpretq_m64_u8(b); + vuint8m1_t max = __riscv_vmaxu_vv_u8m1(_a, _b, 8); + vuint8m1_t min = __riscv_vminu_vv_u8m1(_a, _b, 8); + vuint8m1_t diff = __riscv_vsub_vv_u8m1(max, min, 8); + vuint8m1_t zeros = __riscv_vmv_v_x_u8m1(0, 8); + vuint16m1_t redsum = __riscv_vwredsumu_vs_u8m1_u16m1( + diff, __riscv_vreinterpret_v_u8m1_u16m1(zeros), 8); + return vreinterpretq_u16_m64(__riscv_vslideup_vx_u16m1_tu( + __riscv_vreinterpret_v_u8m1_u16m1(zeros), redsum, 0, 1)); +} + +FORCE_INLINE __m128i _mm_set_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, short e0) { + short arr[8] = {e0, e1, e2, e3, e4, e5, e6, e7}; + return vreinterpretq_i16_m128i(__riscv_vle16_v_i16m1(arr, 8)); +} + +FORCE_INLINE __m128i _mm_set_epi32(int e3, int e2, int e1, int e0) { + int arr[4] = {e0, e1, e2, e3}; + return vreinterpretq_i32_m128i(__riscv_vle32_v_i32m1(arr, 4)); +} + +FORCE_INLINE __m128i _mm_set_epi64(__m64 e1, __m64 e0) { + vint32m1_t _e1 = vreinterpretq_m64_i32(e1); + vint32m1_t _e0 = vreinterpretq_m64_i32(e0); + return vreinterpretq_i32_m128i(__riscv_vslideup_vx_i32m1_tu(_e0, _e1, 2, 4)); +} + +FORCE_INLINE __m128i _mm_set_epi64x(__int64 e1, __int64 e0) { + __int64 
arr[2] = {e0, e1}; + return vreinterpretq_i64_m128i(__riscv_vle64_v_i64m1(arr, 2)); +} + +FORCE_INLINE __m128i _mm_set_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, char e7, + char e6, char e5, char e4, char e3, char e2, + char e1, char e0) { + char arr[16] = {e0, e1, e2, e3, e4, e5, e6, e7, + e8, e9, e10, e11, e12, e13, e14, e15}; + return vreinterpretq_i8_m128i( + __riscv_vle8_v_i8m1((const signed char *)arr, 16)); +} + +// FORCE_INLINE void _MM_SET_FLUSH_ZERO_MODE (unsigned int a) {} + +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { + double arr[2] = {e0, e1}; + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(arr, 2)); +} + +FORCE_INLINE __m128d _mm_set_pd1(double a) { + return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(a, 2)); +} + +FORCE_INLINE __m128 _mm_set_ps(float e3, float e2, float e1, float e0) { + float arr[4] = {e0, e1, e2, e3}; + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(arr, 4)); +} + +FORCE_INLINE __m128 _mm_set_ps1(float a) { + return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(a, 4)); +} + +// FORCE_INLINE void _MM_SET_ROUNDING_MODE (unsigned int a) {} + +FORCE_INLINE __m128d _mm_set_sd(double a) { + double arr[2] = {a, 0}; + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(arr, 2)); +} + +FORCE_INLINE __m128 _mm_set_ss(float a) { + float arr[4] = {a, 0, 0, 0}; + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(arr, 4)); +} + +FORCE_INLINE __m128i _mm_set1_epi16(short a) { + return vreinterpretq_i16_m128i(__riscv_vmv_v_x_i16m1(a, 8)); +} + +FORCE_INLINE __m128i _mm_set1_epi32(int a) { + return vreinterpretq_i32_m128i(__riscv_vmv_v_x_i32m1(a, 4)); +} + +FORCE_INLINE __m128i _mm_set1_epi64(__m64 a) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + return vreinterpretq_i32_m128i(__riscv_vslideup_vx_i32m1_tu(_a, _a, 2, 4)); +} + +FORCE_INLINE __m128i _mm_set1_epi64x(__int64 a) { + return vreinterpretq_i64_m128i(__riscv_vmv_v_x_i64m1(a, 2)); +} + +FORCE_INLINE __m128i _mm_set1_epi8(char a) { + return vreinterpretq_i8_m128i(__riscv_vmv_v_x_i8m1(a, 16)); +} + +FORCE_INLINE __m128d _mm_set1_pd(double a) { + return vreinterpretq_f64_m128d(__riscv_vfmv_v_f_f64m1(a, 2)); +} + +FORCE_INLINE __m128 _mm_set1_ps(float a) { + return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(a, 4)); +} + +// FORCE_INLINE void _mm_setcsr (unsigned int a) {} + +FORCE_INLINE __m128i _mm_setr_epi16(short e7, short e6, short e5, short e4, + short e3, short e2, short e1, short e0) { + short arr[8] = {e7, e6, e5, e4, e3, e2, e1, e0}; + return vreinterpretq_i16_m128i(__riscv_vle16_v_i16m1(arr, 8)); +} + +FORCE_INLINE __m128i _mm_setr_epi32(int e3, int e2, int e1, int e0) { + int arr[4] = {e3, e2, e1, e0}; + return vreinterpretq_i32_m128i(__riscv_vle32_v_i32m1(arr, 4)); +} + +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { + vint32m1_t _e1 = vreinterpretq_m64_i32(e1); + vint32m1_t _e0 = vreinterpretq_m64_i32(e0); + return vreinterpretq_i32_m128i(__riscv_vslideup_vx_i32m1_tu(_e1, _e0, 2, 4)); +} + +FORCE_INLINE __m128i _mm_setr_epi8(char e15, char e14, char e13, char e12, + char e11, char e10, char e9, char e8, + char e7, char e6, char e5, char e4, char e3, + char e2, char e1, char e0) { + char arr[16] = {e15, e14, e13, e12, e11, e10, e9, e8, + e7, e6, e5, e4, e3, e2, e1, e0}; + return vreinterpretq_i8_m128i( + __riscv_vle8_v_i8m1((const signed char *)arr, 16)); +} + +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { + double arr[2] = {e1, e0}; + return vreinterpretq_f64_m128d(__riscv_vle64_v_f64m1(arr, 2)); +} + 
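+// Lane-order note for the _mm_set*/_mm_setr* helpers: _mm_set_* takes its
+// arguments from the highest lane down to lane 0, while _mm_setr_* takes
+// them in lane (memory) order. Most of these helpers simply spill the
+// arguments to a small stack array in the desired order and reload it with a
+// unit-stride vle. Illustrative example (not part of the header):
+//
+//   __m128i a = _mm_set_epi32(3, 2, 1, 0);   // lane 0 == 0, lane 3 == 3
+//   __m128i b = _mm_setr_epi32(3, 2, 1, 0);  // lane 0 == 3, lane 3 == 0
+//
+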
+FORCE_INLINE __m128 _mm_setr_ps(float e3, float e2, float e1, float e0) { + float arr[4] = {e3, e2, e1, e0}; + return vreinterpretq_f32_m128(__riscv_vle32_v_f32m1(arr, 4)); +} + +FORCE_INLINE __m128d _mm_setzero_pd(void) { + return vreinterpretq_f32_m128d(__riscv_vfmv_v_f_f32m1(0, 4)); +} + +FORCE_INLINE __m128 _mm_setzero_ps(void) { + return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(0, 4)); +} + +FORCE_INLINE __m128i _mm_setzero_si128() { + return vreinterpretq_f32_m128i(__riscv_vfmv_v_f_f32m1(0, 4)); +} + +// FORCE_INLINE void _mm_sfence (void) {} + +FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, int imm8) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + vuint32m1_t imm8_dup = __riscv_vmv_v_x_u32m1(imm8, 4); + vuint32m1_t vid = __riscv_vsll_vx_u32m1(__riscv_vid_v_u32m1(4), 1, 4); + vuint32m1_t idxs = + __riscv_vand_vx_u32m1(__riscv_vsrl_vv_u32m1(imm8_dup, vid, 4), 0x3, 4); + return vreinterpretq_u32_m128i(__riscv_vrgather_vv_u32m1(_a, idxs, 4)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + vbool8_t mask_lt_zero = __riscv_vmslt_vx_i8m1_b8(_b, 0, 16); + vuint8m1_t idxs = + __riscv_vreinterpret_v_i8m1_u8m1(__riscv_vand_vx_i8m1(_b, 0xf, 16)); + vint8m1_t shuffle = __riscv_vrgather_vv_i8m1(_a, idxs, 16); + return vreinterpretq_i8_m128i( + __riscv_vmerge_vxm_i8m1(shuffle, 0, mask_lt_zero, 16)); +} + +FORCE_INLINE __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm8) { + vuint64m1_t _a = vreinterpretq_m128d_u64(a); + vuint64m1_t _b = vreinterpretq_m128d_u64(b); + vuint64m1_t a_s = __riscv_vslidedown_vx_u64m1(_a, imm8 & 0x1, 2); + vuint64m1_t b_s = __riscv_vslidedown_vx_u64m1(_b, (imm8 >> 1) & 0x1, 2); + return vreinterpretq_u64_m128d(__riscv_vslideup_vx_u64m1_tu(a_s, b_s, 1, 2)); +} + +FORCE_INLINE __m64 _mm_shuffle_pi16(__m64 a, int imm8) { + vuint16m1_t _a = vreinterpretq_m64_u16(a); + vuint16m1_t imm8_dup = __riscv_vmv_v_x_u16m1(imm8, 4); + vuint16m1_t vid = __riscv_vsll_vx_u16m1(__riscv_vid_v_u16m1(4), 1, 4); + vuint16m1_t idxs = + __riscv_vand_vx_u16m1(__riscv_vsrl_vv_u16m1(imm8_dup, vid, 4), 0x3, 4); + return vreinterpretq_u16_m64(__riscv_vrgather_vv_u16m1(_a, idxs, 4)); +} + +FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { + vint8m1_t _a = vreinterpretq_m64_i8(a); + vint8m1_t _b = vreinterpretq_m64_i8(b); + vbool8_t mask_lt_zero = __riscv_vmslt_vx_i8m1_b8(_b, 0, 8); + vuint8m1_t idxs = + __riscv_vreinterpret_v_i8m1_u8m1(__riscv_vand_vx_i8m1(_b, 0x7, 8)); + vint8m1_t shuffle = __riscv_vrgather_vv_i8m1(_a, idxs, 8); + return vreinterpretq_i8_m64( + __riscv_vmerge_vxm_i8m1(shuffle, 0, mask_lt_zero, 8)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8) { + vuint32m1_t _a = vreinterpretq_m128_u32(a); + vuint32m1_t _b = vreinterpretq_m128_u32(b); + vuint32m1_t imm8_dup = __riscv_vmv_v_x_u32m1(imm8, 4); + vuint32m1_t vid = __riscv_vsll_vx_u32m1(__riscv_vid_v_u32m1(4), 1, 4); + vuint32m1_t idxs = + __riscv_vand_vx_u32m1(__riscv_vsrl_vv_u32m1(imm8_dup, vid, 4), 0x3, 4); + vuint32m1_t a_shuffle = __riscv_vrgather_vv_u32m1(_a, idxs, 2); + vuint32m1_t b_shuffle = __riscv_vrgather_vv_u32m1(_b, idxs, 4); + return vreinterpretq_u32_m128( + __riscv_vslideup_vx_u32m1_tu(b_shuffle, a_shuffle, 0, 2)); +} + +FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, int imm8) { + vuint16m1_t _a = vreinterpretq_m64_u16(a); + vuint16m1_t imm8_dup = __riscv_vmv_v_x_u16m1(imm8, 4); + vuint16m1_t vid = __riscv_vsll_vx_u16m1(__riscv_vid_v_u16m1(4), 1, 4); + 
vuint16m1_t idxs = __riscv_vadd_vx_u16m1( + __riscv_vand_vx_u16m1(__riscv_vsrl_vv_u16m1(imm8_dup, vid, 4), 0x3, 4), 4, + 4); + vuint16m1_t shuffle = __riscv_vrgather_vv_u16m1(_a, idxs, 4); + return vreinterpretq_u16_m64(__riscv_vslideup_vx_u16m1_tu(_a, shuffle, 4, 8)); +} + +FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, int imm8) { + vuint16m1_t _a = vreinterpretq_m64_u16(a); + vuint16m1_t imm8_dup = __riscv_vmv_v_x_u16m1(imm8, 4); + vuint16m1_t vid = __riscv_vsll_vx_u16m1(__riscv_vid_v_u16m1(4), 1, 4); + vuint16m1_t idxs = + __riscv_vand_vx_u16m1(__riscv_vsrl_vv_u16m1(imm8_dup, vid, 4), 0x3, 4); + vuint16m1_t shuffle = __riscv_vrgather_vv_u16m1(_a, idxs, 4); + return vreinterpretq_u16_m64(__riscv_vslideup_vx_u16m1_tu(_a, shuffle, 0, 4)); +} + +FORCE_INLINE __m128i _mm_sign_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + + vbool16_t lt_mask = __riscv_vmslt_vx_i16m1_b16(_b, 0, 8); + vbool16_t zero_mask = __riscv_vmseq_vx_i16m1_b16(_b, 0, 8); + vint16m1_t a_neg = __riscv_vneg_v_i16m1(_a, 8); + vint16m1_t res_lt = __riscv_vmerge_vvm_i16m1(_a, a_neg, lt_mask, 8); + return vreinterpretq_i16_m128i( + __riscv_vmerge_vxm_i16m1(res_lt, 0, zero_mask, 8)); +} + +FORCE_INLINE __m128i _mm_sign_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + + vbool32_t lt_mask = __riscv_vmslt_vx_i32m1_b32(_b, 0, 4); + vbool32_t zero_mask = __riscv_vmseq_vx_i32m1_b32(_b, 0, 4); + vint32m1_t a_neg = __riscv_vneg_v_i32m1(_a, 4); + vint32m1_t res_lt = __riscv_vmerge_vvm_i32m1(_a, a_neg, lt_mask, 4); + return vreinterpretq_i32_m128i( + __riscv_vmerge_vxm_i32m1(res_lt, 0, zero_mask, 4)); +} + +FORCE_INLINE __m128i _mm_sign_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + + vbool8_t lt_mask = __riscv_vmslt_vx_i8m1_b8(_b, 0, 16); + vbool8_t zero_mask = __riscv_vmseq_vx_i8m1_b8(_b, 0, 16); + vint8m1_t a_neg = __riscv_vneg_v_i8m1(_a, 16); + vint8m1_t res_lt = __riscv_vmerge_vvm_i8m1(_a, a_neg, lt_mask, 16); + return vreinterpretq_i8_m128i( + __riscv_vmerge_vxm_i8m1(res_lt, 0, zero_mask, 16)); +} + +FORCE_INLINE __m64 _mm_sign_pi16(__m64 a, __m64 b) { + vint16m1_t _a = vreinterpretq_m64_i16(a); + vint16m1_t _b = vreinterpretq_m64_i16(b); + + vbool16_t lt_mask = __riscv_vmslt_vx_i16m1_b16(_b, 0, 4); + vbool16_t zero_mask = __riscv_vmseq_vx_i16m1_b16(_b, 0, 4); + vint16m1_t a_neg = __riscv_vneg_v_i16m1(_a, 4); + vint16m1_t res_lt = __riscv_vmerge_vvm_i16m1(_a, a_neg, lt_mask, 4); + return vreinterpretq_i16_m64( + __riscv_vmerge_vxm_i16m1(res_lt, 0, zero_mask, 4)); +} + +FORCE_INLINE __m64 _mm_sign_pi32(__m64 a, __m64 b) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + vint32m1_t _b = vreinterpretq_m64_i32(b); + + vbool32_t lt_mask = __riscv_vmslt_vx_i32m1_b32(_b, 0, 2); + vbool32_t zero_mask = __riscv_vmseq_vx_i32m1_b32(_b, 0, 2); + vint32m1_t a_neg = __riscv_vneg_v_i32m1(_a, 2); + vint32m1_t res_lt = __riscv_vmerge_vvm_i32m1(_a, a_neg, lt_mask, 2); + return vreinterpretq_i32_m64( + __riscv_vmerge_vxm_i32m1(res_lt, 0, zero_mask, 2)); +} + +FORCE_INLINE __m64 _mm_sign_pi8(__m64 a, __m64 b) { + vint8m1_t _a = vreinterpretq_m64_i8(a); + vint8m1_t _b = vreinterpretq_m64_i8(b); + + vbool8_t lt_mask = __riscv_vmslt_vx_i8m1_b8(_b, 0, 8); + vbool8_t zero_mask = __riscv_vmseq_vx_i8m1_b8(_b, 0, 8); + vint8m1_t a_neg = __riscv_vneg_v_i8m1(_a, 8); + vint8m1_t res_lt = __riscv_vmerge_vvm_i8m1(_a, a_neg, lt_mask, 8); + 
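+  // _mm_sign_* semantics: negate each lane of a where the matching lane of
+  // b is negative, zero it where b is zero, and pass it through otherwise.
+  // The merge above picks a or -a under lt_mask; the merge in the return
+  // below then forces zero wherever b == 0.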
return vreinterpretq_i8_m64(__riscv_vmerge_vxm_i8m1(res_lt, 0, zero_mask, 8)); +} + +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 15) { + return vreinterpretq_i16_m128i(__riscv_vmv_v_x_i16m1(0, 8)); + } + return vreinterpretq_i16_m128i(__riscv_vsll_vx_i16m1(_a, shift, 8)); +} + +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 31) { + return vreinterpretq_i32_m128i(__riscv_vmv_v_x_i32m1(0, 4)); + } + return vreinterpretq_i32_m128i(__riscv_vsll_vx_i32m1(_a, shift, 4)); +} + +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 63) { + return vreinterpretq_i64_m128i(__riscv_vmv_v_x_i64m1(0, 2)); + } + return vreinterpretq_i64_m128i(__riscv_vsll_vx_i64m1(_a, shift, 2)); +} + +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm8) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 15) { + return vreinterpretq_i16_m128i(__riscv_vmv_v_x_i16m1(0, 8)); + } + return vreinterpretq_i16_m128i(__riscv_vsll_vx_i16m1(_a, _imm8, 8)); +} + +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm8) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 31) { + return vreinterpretq_i32_m128i(__riscv_vmv_v_x_i32m1(0, 4)); + } + return vreinterpretq_i32_m128i(__riscv_vsll_vx_i32m1(_a, _imm8, 4)); +} + +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm8) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 63) { + return vreinterpretq_i64_m128i(__riscv_vmv_v_x_i64m1(0, 2)); + } + return vreinterpretq_i64_m128i(__riscv_vsll_vx_i64m1(_a, _imm8, 2)); +} + +FORCE_INLINE __m128i _mm_slli_si128(__m128i a, int imm8) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t zeros = __riscv_vmv_v_x_u8m1(0, 16); + return vreinterpretq_u8_m128i( + __riscv_vslideup_vx_u8m1_tu(zeros, _a, imm8 & 0xff, 16)); +} + +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + return vreinterpretq_f64_m128d( + __riscv_vfrec7_v_f64m1(__riscv_vfrsqrt7_v_f64m1(_a, 2), 2)); +} + +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + return vreinterpretq_f32_m128( + __riscv_vfrec7_v_f32m1(__riscv_vfrsqrt7_v_f32m1(_a, 4), 4)); +} + +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t b_rnd = + __riscv_vfrec7_v_f64m1(__riscv_vfrsqrt7_v_f64m1(_b, 2), 2); + return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1_tu(_a, b_rnd, 0, 1)); +} + +FORCE_INLINE __m128 _mm_sqrt_ss(__m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t rnd = __riscv_vfrec7_v_f32m1(__riscv_vfrsqrt7_v_f32m1(_a, 4), 4); + return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, rnd, 0, 1)); +} + +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t count_non_vec = 
__riscv_vmv_x_s_i64m1_i64(_count); + int64_t count_non_vec_shift = count_non_vec >> 1; + vint16m1_t a_s = __riscv_vsra_vx_i16m1(_a, count_non_vec_shift, 8); + return vreinterpretq_i16_m128i( + __riscv_vsra_vx_i16m1(a_s, count_non_vec - count_non_vec_shift, 8)); +} + +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t count_non_vec = __riscv_vmv_x_s_i64m1_i64(_count); + int64_t count_non_vec_shift = count_non_vec >> 1; + vint32m1_t a_s = __riscv_vsra_vx_i32m1(_a, count_non_vec_shift, 4); + return vreinterpretq_i32_m128i( + __riscv_vsra_vx_i32m1(a_s, count_non_vec - count_non_vec_shift, 4)); +} + +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm8) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + int64_t imm8_shift = imm8 >> 1; + vint16m1_t a_s = __riscv_vsra_vx_i16m1(_a, imm8_shift, 8); + return vreinterpretq_i16_m128i( + __riscv_vsra_vx_i16m1(a_s, imm8 - imm8_shift, 8)); +} + +FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int imm8) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + int64_t imm8_shift = imm8 >> 1; + vint32m1_t a_s = __riscv_vsra_vx_i32m1(_a, imm8_shift, 4); + return vreinterpretq_i32_m128i( + __riscv_vsra_vx_i32m1(a_s, imm8 - imm8_shift, 4)); +} + +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 15) { + return vreinterpretq_u16_m128i(__riscv_vmv_v_x_u16m1(0, 8)); + } + return vreinterpretq_u16_m128i(__riscv_vsrl_vx_u16m1(_a, shift, 8)); +} + +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 31) { + return vreinterpretq_u32_m128i(__riscv_vmv_v_x_u32m1(0, 4)); + } + return vreinterpretq_u32_m128i(__riscv_vsrl_vx_u32m1(_a, shift, 4)); +} + +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + vint64m1_t _count = vreinterpretq_m128i_i64(count); + int64_t shift = __riscv_vmv_x_s_i64m1_i64(_count); + if (shift > 63) { + return vreinterpretq_u64_m128i(__riscv_vmv_v_x_u64m1(0, 2)); + } + return vreinterpretq_u64_m128i(__riscv_vsrl_vx_u64m1(_a, shift, 2)); +} + +FORCE_INLINE __m128i _mm_srli_epi16(__m128i a, int imm8) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 15) { + return vreinterpretq_u16_m128i(__riscv_vmv_v_x_u16m1(0, 8)); + } + return vreinterpretq_u16_m128i(__riscv_vsrl_vx_u16m1(_a, _imm8, 8)); +} + +FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, int imm8) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 31) { + return vreinterpretq_u32_m128i(__riscv_vmv_v_x_u32m1(0, 4)); + } + return vreinterpretq_u32_m128i(__riscv_vsrl_vx_u32m1(_a, _imm8, 4)); +} + +FORCE_INLINE __m128i _mm_srli_epi64(__m128i a, int imm8) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + const int _imm8 = imm8 & 0xff; + if (_imm8 > 63) { + return vreinterpretq_u64_m128i(__riscv_vmv_v_x_u64m1(0, 2)); + } + return vreinterpretq_u64_m128i(__riscv_vsrl_vx_u64m1(_a, _imm8, 2)); +} + +FORCE_INLINE __m128i _mm_srli_si128(__m128i a, int imm8) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + return vreinterpretq_u8_m128i( + __riscv_vslidedown_vx_u8m1(_a, imm8 & 0xff, 16)); +} 
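+// Usage sketch for the whole-register byte shifts above (illustrative only):
+// _mm_slli_si128 maps to a vslideup over a zeroed destination and
+// _mm_srli_si128 to a vslidedown, so with the 128-bit vectors this port
+// assumes, the vacated byte lanes read as zero, matching SSE behaviour.
+//
+//   __m128i x  = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+//                             7, 6, 5, 4, 3, 2, 1, 0);    // byte i == i
+//   __m128i lo = _mm_slli_si128(x, 4);  // bytes 0..3 become 0, rest move up
+//   __m128i hi = _mm_srli_si128(x, 4);  // bytes 12..15 become 0, rest move down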
+ +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + __riscv_vse64_v_f64m1(mem_addr, _a, 2); +} + +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t a_arranged = __riscv_vrgather_vx_f64m1(_a, 0, 2); + __riscv_vse64_v_f64m1(mem_addr, a_arranged, 2); +} + +FORCE_INLINE void _mm_store_ps(float *mem_addr, __m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + __riscv_vse32_v_f32m1(mem_addr, _a, 4); +} + +FORCE_INLINE void _mm_store_ps1(float *mem_addr, __m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t a_arranged = __riscv_vrgather_vx_f32m1(_a, 0, 4); + __riscv_vse32_v_f32m1(mem_addr, a_arranged, 4); +} + +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + __riscv_vse64_v_f64m1(mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_store_si128(__m128i *mem_addr, __m128i a) { + *mem_addr = a; +} + +FORCE_INLINE void _mm_store_ss(float *mem_addr, __m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + __riscv_vse32_v_f32m1(mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_store1_pd(double *mem_addr, __m128d a) { + return _mm_store_pd1(mem_addr, a); +} + +FORCE_INLINE void _mm_store1_ps(float *mem_addr, __m128 a) { + return _mm_store_ps1(mem_addr, a); +} + +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + _a = __riscv_vslidedown_vx_f64m1(_a, 1, 2); + __riscv_vse64_v_f64m1(mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_storeh_pi(__m64 *mem_addr, __m128 a) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t addr = vreinterpretq_m64_i32(*mem_addr); + _a = __riscv_vslidedown_vx_i32m1(_a, 2, 2); + *mem_addr = __riscv_vslideup_vx_i32m1_tu(addr, _a, 0, 2); +} + +FORCE_INLINE void _mm_storel_epi64(__m128i *mem_addr, __m128i a) { + *mem_addr = a; +} + +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + __riscv_vse64_v_f64m1(mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_storel_pi(__m64 *mem_addr, __m128 a) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t addr = vreinterpretq_m64_i32(*mem_addr); + *mem_addr = __riscv_vslideup_vx_i32m1_tu(addr, _a, 0, 2); +} + +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t a_down = __riscv_vslidedown_vx_f64m1(_a, 1, 2); + _a = __riscv_vslideup_vx_f64m1_tu(a_down, _a, 1, 2); + __riscv_vse64_v_f64m1(mem_addr, _a, 2); +} + +FORCE_INLINE void _mm_storer_ps(float *mem_addr, __m128 a) { + vuint32m1_t _a = vreinterpretq_m128_u32(a); + vuint32m1_t vid = __riscv_vid_v_u32m1(4); + vuint32m1_t threes = vreinterpretq_m128i_u32(_mm_set1_epi32(3)); + vuint32m1_t idxs = __riscv_vsub_vv_u32m1(threes, vid, 4); + vuint32m1_t a_rev = __riscv_vrgather_vv_u32m1(_a, idxs, 4); + __riscv_vse32_v_f32m1(mem_addr, __riscv_vreinterpret_v_u32m1_f32m1(a_rev), 4); +} + +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + __riscv_vse64_v_f64m1(mem_addr, _a, 2); +} + +FORCE_INLINE void _mm_storeu_ps(float *mem_addr, __m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + __riscv_vse32_v_f32m1(mem_addr, _a, 4); +} + +FORCE_INLINE void _mm_storeu_si128(__m128i *mem_addr, __m128i a) { + *mem_addr = a; +} + +FORCE_INLINE void _mm_storeu_si16(void *mem_addr, __m128i a) { + vint16m1_t _a = 
vreinterpretq_m128i_i16(a); + __riscv_vse16_v_i16m1((int16_t *)mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_storeu_si32(void *mem_addr, __m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + __riscv_vse32_v_i32m1((int32_t *)mem_addr, _a, 1); +} + +FORCE_INLINE void _mm_storeu_si64(void *mem_addr, __m128i a) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + __riscv_vse64_v_u64m1((uint64_t *)mem_addr, _a, 1); +} + +FORCE_INLINE __m128i _mm_stream_load_si128(void *mem_addr) { + return vreinterpretq_i32_m128i( + __riscv_vle32_v_i32m1((int32_t const *)mem_addr, 4)); +} + +FORCE_INLINE void _mm_stream_pd(void *mem_addr, __m128d a) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + __riscv_vse64_v_f64m1((double *)mem_addr, _a, 2); +} + +FORCE_INLINE void _mm_stream_pi(void *mem_addr, __m64 a) { + vint32m1_t _a = vreinterpretq_m64_i32(a); + __riscv_vse32_v_i32m1((int32_t *)mem_addr, _a, 4); +} + +FORCE_INLINE void _mm_stream_ps(void *mem_addr, __m128 a) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + __riscv_vse32_v_f32m1((float *)mem_addr, _a, 4); +} + +FORCE_INLINE void _mm_stream_si128(void *mem_addr, __m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + __riscv_vse32_v_i32m1((int32_t *)mem_addr, _a, 4); +} + +FORCE_INLINE void _mm_stream_si32(void *mem_addr, int a) { + ((int *)mem_addr)[0] = a; +} + +FORCE_INLINE void _mm_stream_si64(void *mem_addr, __int64 a) { + ((__int64 *)mem_addr)[0] = a; +} + +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vsub_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vsub_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { + vint64m1_t _a = vreinterpretq_m128i_i64(a); + vint64m1_t _b = vreinterpretq_m128i_i64(b); + return vreinterpretq_i64_m128i(__riscv_vsub_vv_i64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vsub_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + return vreinterpretq_f64_m128d(__riscv_vfsub_vv_f64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + return vreinterpretq_f32_m128(__riscv_vfsub_vv_f32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) { + vfloat64m1_t _a = vreinterpretq_m128d_f64(a); + vfloat64m1_t _b = vreinterpretq_m128d_f64(b); + vfloat64m1_t sub = __riscv_vfsub_vv_f64m1(_a, _b, 2); + vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(1, 8)); + return vreinterpretq_f64_m128d(__riscv_vmerge_vvm_f64m1(_a, sub, mask, 2)); +} + +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { + vint64m1_t _a = vreinterpretq_m64_i64(a); + vint64m1_t _b = vreinterpretq_m64_i64(b); + return vreinterpretq_i64_m64(__riscv_vsub_vv_i64m1(_a, _b, 1)); +} + +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { + vfloat32m1_t _a = vreinterpretq_m128_f32(a); + vfloat32m1_t _b = vreinterpretq_m128_f32(b); + vfloat32m1_t sub = 
__riscv_vfsub_vv_f32m1(_a, _b, 4); + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1(1, 8)); + return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(_a, sub, mask, 4)); +} + +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { + vint16m1_t _a = vreinterpretq_m128i_i16(a); + vint16m1_t _b = vreinterpretq_m128i_i16(b); + return vreinterpretq_i16_m128i(__riscv_vssub_vv_i16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { + vint8m1_t _a = vreinterpretq_m128i_i8(a); + vint8m1_t _b = vreinterpretq_m128i_i8(b); + return vreinterpretq_i8_m128i(__riscv_vssub_vv_i8m1(_a, _b, 16)); +} + +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + return vreinterpretq_u16_m128i(__riscv_vssubu_vv_u16m1(_a, _b, 8)); +} + +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + return vreinterpretq_u8_m128i(__riscv_vssubu_vv_u8m1(_a, _b, 16)); +} + +FORCE_INLINE int _mm_test_all_ones(__m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _mask = __riscv_vmv_v_x_i32m1(UINT32_MAX, 4); + vint32m1_t a_not = __riscv_vnot_v_i32m1(_a, 4); + vint32m1_t _and = __riscv_vand_vv_i32m1(a_not, _mask, 4); + vint32m1_t redsum = + __riscv_vredsum_vs_i32m1_i32m1(__riscv_vmv_v_x_i32m1(0, 4), _and, 4); + return !(int)__riscv_vmv_x_s_i32m1_i32(redsum); +} + +FORCE_INLINE int _mm_test_all_zeros(__m128i mask, __m128i a) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _mask = vreinterpretq_m128i_i32(mask); + vint32m1_t _and = __riscv_vand_vv_i32m1(_a, _mask, 4); + vint32m1_t redor = + __riscv_vredor_vs_i32m1_i32m1(_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + return !(int)__riscv_vmv_x_s_i32m1_i32(redor); +} + +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i mask, __m128i a) { + vint32m1_t _mask = vreinterpretq_m128i_i32(mask); + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t zf_and = __riscv_vand_vv_i32m1(_a, _mask, 4); + vint32m1_t zf_redor = + __riscv_vredor_vs_i32m1_i32m1(zf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + int zf_neg = (int)__riscv_vmv_x_s_i32m1_i32(zf_redor); + + vint32m1_t a_not = __riscv_vnot_v_i32m1(_a, 4); + vint32m1_t cf_and = __riscv_vand_vv_i32m1(a_not, _mask, 4); + vint32m1_t cf_redor = + __riscv_vredor_vs_i32m1_i32m1(cf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + int cf_neg = (int)__riscv_vmv_x_s_i32m1_i32(cf_redor); + return !!(zf_neg | cf_neg); +} + +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vint32m1_t a_not = __riscv_vnot_v_i32m1(_a, 4); + vint32m1_t cf_and = __riscv_vand_vv_i32m1(a_not, _b, 4); + vint32m1_t cf_redor = + __riscv_vredor_vs_i32m1_i32m1(cf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + return !(int)__riscv_vmv_x_s_i32m1_i32(cf_redor); +} + +FORCE_INLINE int _mm_testnzc_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _mask = vreinterpretq_m128i_i32(b); + vint32m1_t zf_and = __riscv_vand_vv_i32m1(_a, _mask, 4); + vint32m1_t zf_redor = + __riscv_vredor_vs_i32m1_i32m1(zf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + int zf_neg = !!(int)__riscv_vmv_x_s_i32m1_i32(zf_redor); + + vint32m1_t a_not = __riscv_vnot_v_i32m1(_a, 4); + vint32m1_t cf_and = __riscv_vand_vv_i32m1(a_not, _mask, 4); + vint32m1_t cf_redor = + __riscv_vredor_vs_i32m1_i32m1(cf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); 
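+  // PTEST-style flag semantics: ZF would be set when (a & b) is all zeros
+  // and CF when (~a & b) is all zeros; _mm_testnzc returns 1 only when both
+  // flags are clear, i.e. both OR-reductions above are non-zero.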
+ int cf_neg = !!(int)__riscv_vmv_x_s_i32m1_i32(cf_redor); + return zf_neg & cf_neg; +} + +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + vint32m1_t zf_and = __riscv_vand_vv_i32m1(_a, _b, 4); + vint32m1_t zf_redor = + __riscv_vredor_vs_i32m1_i32m1(zf_and, __riscv_vmv_v_x_i32m1(0, 4), 4); + return !(int)__riscv_vmv_x_s_i32m1_i32(zf_redor); +} + +// FORCE_INLINE void _MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, +// __m128 row3) {} + +FORCE_INLINE int _mm_ucomieq_sd(__m128d a, __m128d b) { + return _mm_comieq_sd(a, b); +} + +FORCE_INLINE int _mm_ucomieq_ss(__m128 a, __m128 b) { + return _mm_comieq_ss(a, b); +} + +FORCE_INLINE int _mm_ucomige_sd(__m128d a, __m128d b) { + return _mm_comige_sd(a, b); +} + +FORCE_INLINE int _mm_ucomige_ss(__m128 a, __m128 b) { + return _mm_comige_ss(a, b); +} + +FORCE_INLINE int _mm_ucomigt_sd(__m128d a, __m128d b) { + return _mm_comigt_sd(a, b); +} + +FORCE_INLINE int _mm_ucomigt_ss(__m128 a, __m128 b) { + return _mm_comigt_ss(a, b); +} + +FORCE_INLINE int _mm_ucomile_sd(__m128d a, __m128d b) { + return _mm_comile_sd(a, b); +} + +FORCE_INLINE int _mm_ucomile_ss(__m128 a, __m128 b) { + return _mm_comile_ss(a, b); +} + +FORCE_INLINE int _mm_ucomilt_sd(__m128d a, __m128d b) { + return _mm_comilt_sd(a, b); +} + +FORCE_INLINE int _mm_ucomilt_ss(__m128 a, __m128 b) { + return _mm_comilt_ss(a, b); +} + +FORCE_INLINE int _mm_ucomineq_sd(__m128d a, __m128d b) { + return _mm_comineq_sd(a, b); +} + +FORCE_INLINE int _mm_ucomineq_ss(__m128 a, __m128 b) { + return _mm_comineq_ss(a, b); +} + +FORCE_INLINE __m128d _mm_undefined_pd(void) { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +FORCE_INLINE __m128 _mm_undefined_ps(void) { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +FORCE_INLINE __m128i _mm_undefined_si128(void) { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { + vuint16m2_t _a = __riscv_vlmul_ext_v_u16m1_u16m2(vreinterpretq_m128i_u16(a)); + vuint16m2_t _b = __riscv_vlmul_ext_v_u16m1_u16m2(vreinterpretq_m128i_u16(b)); + vuint16m2_t ab = __riscv_vslideup_vx_u16m2_tu(_a, _b, 8, 16); + uint16_t arr[16] = {4, 12, 5, 13, 6, 14, 7, 15}; + vuint16m2_t idx = __riscv_vle16_v_u16m2(arr, 16); + return vreinterpretq_u16_m128i( + __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vrgather_vv_u16m2(ab, idx, 8))); +} + +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { + vuint32m2_t _a = __riscv_vlmul_ext_v_u32m1_u32m2(vreinterpretq_m128i_u32(a)); + vuint32m2_t _b = __riscv_vlmul_ext_v_u32m1_u32m2(vreinterpretq_m128i_u32(b)); + vuint32m2_t ab = __riscv_vslideup_vx_u32m2_tu(_a, _b, 4, 8); + uint32_t arr[8] = {2, 6, 3, 7, 0, 0, 0, 0}; + vuint32m2_t idx = 
__riscv_vle32_v_u32m2(arr, 8); + return vreinterpretq_u32_m128i( + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vrgather_vv_u32m2(ab, idx, 4))); +} + +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + vuint64m1_t _b = vreinterpretq_m128i_u64(b); + vuint64m1_t a_s = __riscv_vslidedown_vx_u64m1(_a, 1, 2); + return vreinterpretq_u64_m128i(__riscv_vslideup_vx_u64m1_tu(_b, a_s, 0, 1)); +} + +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { + vuint8m2_t _a = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(a)); + vuint8m2_t _b = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(b)); + vuint8m2_t ab = __riscv_vslideup_vx_u8m2_tu(_a, _b, 16, 32); + uint8_t arr[32] = {8, 24, 9, 25, 10, 26, 11, 27, + 12, 28, 13, 29, 14, 30, 15, 31}; + vuint8m2_t idx = __riscv_vle8_v_u8m2(arr, 32); + return vreinterpretq_u8_m128i( + __riscv_vlmul_trunc_v_u8m2_u8m1(__riscv_vrgather_vv_u8m2(ab, idx, 16))); +} + +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { + vuint64m1_t _a = vreinterpretq_m128d_u64(a); + vuint64m1_t _b = vreinterpretq_m128d_u64(b); + vuint64m1_t a_s = __riscv_vslidedown_vx_u64m1(_a, 1, 2); + return vreinterpretq_u64_m128d(__riscv_vslideup_vx_u64m1_tu(_b, a_s, 0, 1)); +} + +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { + vuint32m2_t _a = __riscv_vlmul_ext_v_u32m1_u32m2(vreinterpretq_m128_u32(a)); + vuint32m2_t _b = __riscv_vlmul_ext_v_u32m1_u32m2(vreinterpretq_m128_u32(b)); + vuint32m2_t ab = __riscv_vslideup_vx_u32m2_tu(_a, _b, 4, 8); + uint32_t arr[8] = {2, 6, 3, 7, 0, 0, 0, 0}; + vuint32m2_t idx = __riscv_vle32_v_u32m2(arr, 8); + return vreinterpretq_u32_m128( + __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vrgather_vv_u32m2(ab, idx, 4))); +} + +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { + vuint16m1_t _a = vreinterpretq_m128i_u16(a); + vuint16m1_t _b = vreinterpretq_m128i_u16(b); + vuint16m1_t ab = __riscv_vslideup_vx_u16m1_tu(_a, _b, 4, 8); + uint16_t arr[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + vuint16m1_t idx = __riscv_vle16_v_u16m1(arr, 8); + return vreinterpretq_u16_m128i(__riscv_vrgather_vv_u16m1(ab, idx, 8)); +} + +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { + vuint32m1_t _a = vreinterpretq_m128i_u32(a); + vuint32m1_t _b = vreinterpretq_m128i_u32(b); + vuint32m1_t ab = __riscv_vslideup_vx_u32m1_tu(_a, _b, 2, 4); + uint32_t arr[4] = {0, 2, 1, 3}; + vuint32m1_t idx = __riscv_vle32_v_u32m1(arr, 4); + return vreinterpretq_u32_m128i(__riscv_vrgather_vv_u32m1(ab, idx, 4)); +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { + vuint64m1_t _a = vreinterpretq_m128i_u64(a); + vuint64m1_t _b = vreinterpretq_m128i_u64(b); + return vreinterpretq_u64_m128i(__riscv_vslideup_vx_u64m1_tu(_a, _b, 1, 2)); +} + +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { + vuint8m1_t _a = vreinterpretq_m128i_u8(a); + vuint8m1_t _b = vreinterpretq_m128i_u8(b); + vuint8m1_t ab = __riscv_vslideup_vx_u8m1_tu(_a, _b, 8, 16); + uint8_t arr[16] = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; + vuint8m1_t idx = __riscv_vle8_v_u8m1(arr, 16); + return vreinterpretq_u8_m128i(__riscv_vrgather_vv_u8m1(ab, idx, 16)); +} + +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { + vuint64m1_t _a = vreinterpretq_m128d_u64(a); + vuint64m1_t _b = vreinterpretq_m128d_u64(b); + return vreinterpretq_u64_m128d(__riscv_vslideup_vx_u64m1_tu(_a, _b, 1, 2)); +} + +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { + vuint32m1_t _a = vreinterpretq_m128_u32(a); + 
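+  // Same interleave pattern as the other unpacklo helpers: concatenate the
+  // low halves with a tail-undisturbed slideup, then vrgather with index
+  // vector {0, 2, 1, 3} to produce {a0, b0, a1, b1}.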
vuint32m1_t _b = vreinterpretq_m128_u32(b); + vuint32m1_t ab = __riscv_vslideup_vx_u32m1_tu(_a, _b, 2, 4); + uint32_t arr[4] = {0, 2, 1, 3}; + vuint32m1_t idx = __riscv_vle32_v_u32m1(arr, 4); + return vreinterpretq_u32_m128(__riscv_vrgather_vv_u32m1(ab, idx, 4)); +} + +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { + vint64m1_t _a = vreinterpretq_m128d_i64(a); + vint64m1_t _b = vreinterpretq_m128d_i64(b); + return vreinterpretq_i64_m128d(__riscv_vxor_vv_i64m1(_a, _b, 2)); +} + +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) { + vint32m1_t _a = vreinterpretq_m128_i32(a); + vint32m1_t _b = vreinterpretq_m128_i32(b); + return vreinterpretq_i32_m128(__riscv_vxor_vv_i32m1(_a, _b, 4)); +} + +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { + vint32m1_t _a = vreinterpretq_m128i_i32(a); + vint32m1_t _b = vreinterpretq_m128i_i32(b); + return vreinterpretq_i32_m128i(__riscv_vxor_vv_i32m1(_a, _b, 4)); +} + +/* AES */ + +// In the absence of crypto extensions, implement aesenc using regular NEON +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// for more information. +// FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) {} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +// FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) {} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +// FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) {} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +// FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) {} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +// FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) {} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) {} + +/* Others */ + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +// FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int +// imm) {} + +// FORCE_INLINE unsigned int _sse2rvv_mm_get_denormals_zero_mode(void) {} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +// FORCE_INLINE int _mm_popcnt_u32(unsigned int a) {} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +// FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) {} + +// FORCE_INLINE void _sse2rvv_mm_set_denormals_zero_mode(unsigned int flag) {} + +// Return the current 64-bit value of the processor's time-stamp counter. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc +// FORCE_INLINE uint64_t _rdtsc(void) {} + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +// #if defined(__GNUC__) && !defined(__clang__) +// #pragma GCC pop_options +// #endif + +#endif From 68afeafdd3b520bcab43f5ea50551d88560a09b4 Mon Sep 17 00:00:00 2001 From: Ting Chou Date: Tue, 20 Aug 2024 18:05:42 +0800 Subject: [PATCH 2/5] Integrate riscv64 to the build system. --- CMakeLists.txt | 26 +++++++++++++++++++ common/cmake/clang.cmake | 3 +++ common/math/emath.h | 2 ++ common/math/vec3.h | 2 +- common/math/vec4.h | 2 +- common/simd/riscv/emulation.h | 2 +- common/simd/vfloat4_sse2.h | 2 +- common/sys/intrinsics.h | 2 ++ common/sys/platform.h | 2 +- common/sys/sysinfo.cpp | 10 +++++++ common/sys/thread.cpp | 2 ++ kernels/bvh/node_intersector1.h | 21 ++++++++------- kernels/common/accel.h | 2 +- .../geometry/instance_array_intersector.cpp | 2 +- kernels/geometry/instance_intersector.cpp | 2 +- 15 files changed, 65 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5480385072..1f7d00ef6b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -243,6 +243,12 @@ ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQ SET(EMBREE_ARM ON) ENDIF() +# detect RISC-V compilation +IF (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + MESSAGE(STATUS "Building for RISC-V 64") + SET(EMBREE_RISCV ON) +ENDIF() + SET(EMBREE_TASKING_SYSTEM "TBB" CACHE STRING "Selects tasking system") SET(EMBREE_TBB_COMPONENT "tbb" CACHE STRING "The TBB component/library name.") @@ -385,6 +391,8 @@ ENDIF() IF (EMBREE_ARM) SET_PROPERTY(CACHE EMBREE_MAX_ISA PROPERTY STRINGS NONE NEON NEON2X) +ELSEIF (EMBREE_RISCV) + SET_PROPERTY(CACHE EMBREE_MAX_ISA PROPERTY STRINGS NONE SSE2 SSE4.2 DEFAULT) ELSE() SET_PROPERTY(CACHE EMBREE_MAX_ISA PROPERTY STRINGS NONE SSE2 SSE4.2 AVX AVX2 AVX512 DEFAULT) ENDIF() @@ -399,6 +407,8 @@ IF (EMBREE_MAX_ISA STREQUAL "NONE") OPTION(EMBREE_ISA_NEON "Enables NEON ISA." ON) OPTION(EMBREE_ISA_NEON2X "Enables NEON ISA double pumped." OFF) ENDIF() + ELSEIF (EMBREE_RISCV) + OPTION(EMBREE_ISA_RVV "Enables RVV ISA." 
ON) ELSE() TRY_COMPILE(COMPILER_SUPPORTS_AVX "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/common/cmake/check_isa.cpp" COMPILE_DEFINITIONS ${FLAGS_AVX}) TRY_COMPILE(COMPILER_SUPPORTS_AVX2 "${CMAKE_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/common/cmake/check_isa.cpp" COMPILE_DEFINITIONS ${FLAGS_AVX2}) @@ -416,11 +426,13 @@ IF (EMBREE_MAX_ISA STREQUAL "NONE") # Don't use OPTION, but still set them to OFF, so that embree-config.cmake is consisten with its definitions SET(EMBREE_ISA_NEON OFF) SET(EMBREE_ISA_NEON2X OFF) + SET(EMBREE_ISA_RVV OFF) ENDIF() ELSEIF (EMBREE_MAX_ISA STREQUAL "DEFAULT") UNSET(EMBREE_ISA_NEON CACHE) UNSET(EMBREE_ISA_NEON2X CACHE) + UNSET(EMBREE_ISA_RVV CACHE) UNSET(EMBREE_ISA_SSE2 CACHE) UNSET(EMBREE_ISA_SSE42 CACHE) UNSET(EMBREE_ISA_AVX CACHE) @@ -428,6 +440,7 @@ ELSEIF (EMBREE_MAX_ISA STREQUAL "DEFAULT") UNSET(EMBREE_ISA_AVX512 CACHE) SET(EMBREE_ISA_NEON OFF) SET(EMBREE_ISA_NEON2X OFF) + SET(EMBREE_ISA_RVV OFF) SET(EMBREE_ISA_SSE2 OFF) SET(EMBREE_ISA_SSE42 OFF) SET(EMBREE_ISA_AVX OFF) @@ -442,6 +455,7 @@ ELSEIF (EMBREE_MAX_ISA STREQUAL "DEFAULT") ELSE() UNSET(EMBREE_ISA_NEON CACHE) UNSET(EMBREE_ISA_NEON2X CACHE) + UNSET(EMBREE_ISA_RVV CACHE) UNSET(EMBREE_ISA_SSE2 CACHE) UNSET(EMBREE_ISA_SSE42 CACHE) UNSET(EMBREE_ISA_AVX CACHE) @@ -452,6 +466,8 @@ ELSE() SET(ISA 1) ELSEIF(EMBREE_MAX_ISA STREQUAL "NEON2X") SET(ISA 2) + ELSEIF(EMBREE_MAX_ISA STREQUAL "RVV") + SET(ISA 1) ELSEIF(EMBREE_MAX_ISA STREQUAL "SSE2") SET(ISA 1) ELSEIF(EMBREE_MAX_ISA STREQUAL "SSE4.2") @@ -470,6 +486,7 @@ ELSE() SET(EMBREE_ISA_NEON OFF) SET(EMBREE_ISA_NEON2X OFF) + SET(EMBREE_ISA_RVV OFF) SET(EMBREE_ISA_SSE2 OFF) SET(EMBREE_ISA_SSE42 OFF) SET(EMBREE_ISA_AVX OFF) @@ -483,6 +500,10 @@ ELSE() IF (ISA GREATER 1) SET(EMBREE_ISA_NEON2X ON) ENDIF () + ELSEIF (EMBREE_RISCV) + IF (ISA GREATER 0) + SET(EMBREE_ISA_RVV ON) + ENDIF () ELSE() IF (ISA GREATER 0) SET(EMBREE_ISA_SSE2 ON) @@ -574,6 +595,11 @@ IF (EMBREE_ISA_NEON2X) SET(EMBREE_ISA_AVX2 ON) ENDIF() +IF (EMBREE_ISA_RVV) + SET(EMBREE_ISA_SSE2 ON) + SET(EMBREE_ISA_SSE42 ON) +ENDIF() + IF (EMBREE_ISA_SSE2) ADD_DEFINITIONS(-DEMBREE_TARGET_SSE2) IF (NOT EMBREE_ARM) diff --git a/common/cmake/clang.cmake b/common/cmake/clang.cmake index 2666d1be8b..7b1c9f9dbd 100644 --- a/common/cmake/clang.cmake +++ b/common/cmake/clang.cmake @@ -21,6 +21,9 @@ IF (EMBREE_ARM) SET(FLAGS_AVX "-D__AVX__ -D__SSE4_2__ -D__SSE4_1__ -D__BMI__ -D__BMI2__ -D__LZCNT__") SET(FLAGS_AVX2 "-D__AVX2__ -D__AVX__ -D__SSE4_2__ -D__SSE4_1__ -D__BMI__ -D__BMI2__ -D__LZCNT__") ENDIF () +ELSEIF (EMBREE_RISCV) + SET(FLAGS_SSE2 "-D__SSE__ -D__SSE2__ -march=rv64gcv_zba_zbb_zbs -mrvv-vector-bits=zvl") + SET(FLAGS_SSE42 "-D__SSE4_2__ -D__SSE4_1__ -march=rv64gcv_zba_zbb_zbs -mrvv-vector-bits=zvl") ELSE () # for `thread` keyword _SET_IF_EMPTY(FLAGS_SSE2 "-msse -msse2 -mno-sse4.2") diff --git a/common/math/emath.h b/common/math/emath.h index 22a89a7669..1e17263145 100644 --- a/common/math/emath.h +++ b/common/math/emath.h @@ -14,6 +14,8 @@ #if defined(__ARM_NEON) #include "../simd/arm/emulation.h" +#elif defined(__riscv_v) +#include "../simd/riscv/emulation.h" #else #include #include diff --git a/common/math/vec3.h b/common/math/vec3.h index d5e78befe8..221d353854 100644 --- a/common/math/vec3.h +++ b/common/math/vec3.h @@ -292,7 +292,7 @@ namespace embree template<> __forceinline Vec3::Vec3(const Vec3fa& a) { x = a.x; y = a.y; z = a.z; } -#elif defined(__SSE__) || defined(__ARM_NEON) +#elif defined(__SSE__) || defined(__ARM_NEON) || defined(__riscv_v) template<> __forceinline Vec3::Vec3(const Vec3fa& a) { 
const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); diff --git a/common/math/vec4.h b/common/math/vec4.h index 5647859257..ae2a3d1746 100644 --- a/common/math/vec4.h +++ b/common/math/vec4.h @@ -227,7 +227,7 @@ namespace embree template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { x = a.x; y = a.y; z = a.z; w = a.w; } -#elif defined(__SSE__) || defined(__ARM_NEON) +#elif defined(__SSE__) || defined(__ARM_NEON) || defined(__riscv_v) template<> __forceinline Vec4::Vec4( const Vec3fx& a ) { const vfloat4 v = vfloat4(a.m128); x = shuffle<0,0,0,0>(v); y = shuffle<1,1,1,1>(v); z = shuffle<2,2,2,2>(v); w = shuffle<3,3,3,3>(v); } diff --git a/common/simd/riscv/emulation.h b/common/simd/riscv/emulation.h index bb7418b83e..46a73cb5d9 100644 --- a/common/simd/riscv/emulation.h +++ b/common/simd/riscv/emulation.h @@ -149,4 +149,4 @@ __forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { // #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_MASK_DENORM 0x100 #define _MM_SET_EXCEPTION_MASK(x) -// #define _MM_SET_FLUSH_ZERO_MODE(x) +#define _MM_SET_FLUSH_ZERO_MODE(x) diff --git a/common/simd/vfloat4_sse2.h b/common/simd/vfloat4_sse2.h index fccf11fe0c..9c4450a498 100644 --- a/common/simd/vfloat4_sse2.h +++ b/common/simd/vfloat4_sse2.h @@ -435,7 +435,7 @@ namespace embree /// Ternary Operators //////////////////////////////////////////////////////////////////////////////// -#if defined(__AVX2__) || defined(__ARM_NEON) +#if defined(__AVX2__) || defined(__ARM_NEON) || defined(__riscv_v) __forceinline vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmadd_ps(a,b,c); } __forceinline vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fmsub_ps(a,b,c); } __forceinline vfloat4 nmadd(const vfloat4& a, const vfloat4& b, const vfloat4& c) { return _mm_fnmadd_ps(a,b,c); } diff --git a/common/sys/intrinsics.h b/common/sys/intrinsics.h index f5074bb29d..2b82a6fb3d 100644 --- a/common/sys/intrinsics.h +++ b/common/sys/intrinsics.h @@ -11,6 +11,8 @@ #if defined(__ARM_NEON) #include "../simd/arm/emulation.h" +#elif defined(__riscv_v) +#include "../simd/riscv/emulation.h" #else #include #if defined(__EMSCRIPTEN__) diff --git a/common/sys/platform.h b/common/sys/platform.h index 6dc0cf3318..887d2ad2fe 100644 --- a/common/sys/platform.h +++ b/common/sys/platform.h @@ -58,7 +58,7 @@ #endif /* detect 64 bit platform */ -#if defined(__X86_64__) || defined(__aarch64__) +#if defined(__X86_64__) || defined(__aarch64__) || (defined(__riscv) && (__riscv_xlen == 64)) #define __64BIT__ #endif diff --git a/common/sys/sysinfo.cpp b/common/sys/sysinfo.cpp index 5f375cd95c..266f05701f 100644 --- a/common/sys/sysinfo.cpp +++ b/common/sys/sysinfo.cpp @@ -376,6 +376,16 @@ namespace embree cpu_features |= CPU_FEATURE_NEON_2X; return cpu_features; +#elif defined(__riscv) + + int cpu_features = CPU_FEATURE_SSE|CPU_FEATURE_SSE2; + cpu_features |= CPU_FEATURE_SSE3|CPU_FEATURE_SSSE3|CPU_FEATURE_SSE41|CPU_FEATURE_SSE42; + cpu_features |= CPU_FEATURE_XMM_ENABLED; + cpu_features |= CPU_FEATURE_POPCNT; + cpu_features |= CPU_FEATURE_FMA3; + cpu_features |= CPU_FEATURE_LZCNT; + return cpu_features; + #else /* Unknown CPU. 
*/ return 0; diff --git a/common/sys/thread.cpp b/common/sys/thread.cpp index 8b072067e6..8887d4905a 100644 --- a/common/sys/thread.cpp +++ b/common/sys/thread.cpp @@ -8,6 +8,8 @@ #include #if defined(__ARM_NEON) #include "../simd/arm/emulation.h" +#elif defined(__riscv_v) +#include "../simd/riscv/emulation.h" #else #include #if defined(__EMSCRIPTEN__) diff --git a/kernels/bvh/node_intersector1.h b/kernels/bvh/node_intersector1.h index 17641fa888..a28c53abe2 100644 --- a/kernels/bvh/node_intersector1.h +++ b/kernels/bvh/node_intersector1.h @@ -13,6 +13,9 @@ #define __FMA_X4__ #endif +#if defined(__riscv_v) +#define __FMA_X4__ +#endif namespace embree { @@ -40,7 +43,7 @@ namespace embree rdir = Vec3vf(ray_rdir.x,ray_rdir.y,ray_rdir.z); #if defined(__FMA_X4__) const Vec3fa ray_org_rdir = ray_org*ray_rdir; -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(__riscv_v) org_rdir = Vec3vf(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z); #else //for aarch64, we do not have msub equal instruction, so we negeate orig and use madd @@ -65,7 +68,7 @@ namespace embree dir = Vec3vf(ray_dir.x[k], ray_dir.y[k], ray_dir.z[k]); rdir = Vec3vf(ray_rdir.x[k], ray_rdir.y[k], ray_rdir.z[k]); #if defined(__FMA_X4__) -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(__riscv_v) org_rdir = org*rdir; #else neg_org_rdir = -(org*rdir); @@ -82,7 +85,7 @@ namespace embree Vec3fa org_xyz, dir_xyz; Vec3vf org, dir, rdir; #if defined(__FMA_X4__) -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(__riscv_v) Vec3vf org_rdir; #else //aarch64 version are keeping negation of the org_rdir and use madd @@ -430,7 +433,7 @@ namespace embree __forceinline size_t intersectNode<4>(const typename BVH4::AABBNode* node, const TravRay<4,false>& ray, vfloat4& dist) { #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const vfloat4 tNearX = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearX)), ray.rdir.x, ray.neg_org_rdir.x); const vfloat4 tNearY = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearY)), ray.rdir.y, ray.neg_org_rdir.y); const vfloat4 tNearZ = madd(vfloat4::load((float*)((const char*)&node->lower_x+ray.nearZ)), ray.rdir.z, ray.neg_org_rdir.z); @@ -454,7 +457,7 @@ namespace embree const vfloat4 tFarZ = (vfloat4::load((float*)((const char*)&node->lower_x+ray.farZ )) - ray.org.z) * ray.rdir.z; #endif -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const vfloat4 tNear = maxi(tNearX, tNearY, tNearZ, ray.tnear); const vfloat4 tFar = mini(tFarX, tFarY, tFarZ, ray.tfar); const vbool4 vmask = asInt(tNear) <= asInt(tFar); @@ -567,7 +570,7 @@ namespace embree const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); @@ -652,7 +655,7 @@ namespace embree const vfloat* pFarY = (const vfloat*)((const char*)&node->lower_x+ray.farY); const vfloat* pFarZ = (const vfloat*)((const char*)&node->lower_x+ray.farZ); #if defined (__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const 
vfloat tNearX = madd(madd(time,pNearX[6],vfloat(pNearX[0])), ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(madd(time,pNearY[6],vfloat(pNearY[0])), ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(madd(time,pNearZ[6],vfloat(pNearZ[0])), ray.rdir.z, ray.neg_org_rdir.z); @@ -750,7 +753,7 @@ namespace embree const vfloat4 upper_z = madd(node->dequantize<4>(ray.farZ >> 2),scale_z,start_z); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const vfloat4 tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat4 tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat4 tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); @@ -938,7 +941,7 @@ namespace embree const vfloat lower_z = node->dequantizeLowerZ(time); const vfloat upper_z = node->dequantizeUpperZ(time); #if defined(__FMA_X4__) -#if defined(__aarch64__) +#if defined(__aarch64__) || defined(__riscv_v) const vfloat tNearX = madd(lower_x, ray.rdir.x, ray.neg_org_rdir.x); const vfloat tNearY = madd(lower_y, ray.rdir.y, ray.neg_org_rdir.y); const vfloat tNearZ = madd(lower_z, ray.rdir.z, ray.neg_org_rdir.z); diff --git a/kernels/common/accel.h b/kernels/common/accel.h index 7d959377ae..0de59a371e 100644 --- a/kernels/common/accel.h +++ b/kernels/common/accel.h @@ -367,7 +367,7 @@ namespace embree intersector16.occluded(valid,this,ray,context); } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(__riscv_v) __forceinline void occluded(const vbool4& valid, RayK<4>& ray, RayQueryContext* context) { const vint<4> mask = valid.mask32(); occluded4(&mask,(RTCRay4&)ray,context); diff --git a/kernels/geometry/instance_array_intersector.cpp b/kernels/geometry/instance_array_intersector.cpp index 0cb6f50073..0aabfd145f 100644 --- a/kernels/geometry/instance_array_intersector.cpp +++ b/kernels/geometry/instance_array_intersector.cpp @@ -327,7 +327,7 @@ namespace embree return occluded; } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(__riscv_v) template struct InstanceArrayIntersectorK<4>; template struct InstanceArrayIntersectorKMB<4>; #endif diff --git a/kernels/geometry/instance_intersector.cpp b/kernels/geometry/instance_intersector.cpp index a9209c69c3..d1d5cc17dd 100644 --- a/kernels/geometry/instance_intersector.cpp +++ b/kernels/geometry/instance_intersector.cpp @@ -306,7 +306,7 @@ namespace embree return occluded; } -#if defined(__SSE__) || defined(__ARM_NEON) +#if defined(__SSE__) || defined(__ARM_NEON) || defined(__riscv_v) template struct InstanceIntersectorK<4>; template struct InstanceIntersectorKMB<4>; #endif From bac41d46cd7584a2a11d901cab541ddfe91a53b0 Mon Sep 17 00:00:00 2001 From: Ting Chou Date: Wed, 25 Sep 2024 16:03:47 +0800 Subject: [PATCH 3/5] Improve the precision of _mm_sqrt_ps(), _mm_rsqrt_ps(), and _mm_rcp_ps(). 
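
The RVV estimate instructions vfrec7/vfrsqrt7 only provide roughly 7 bits
of precision.  With SSE2RVV_PRECISE_DIV / SSE2RVV_PRECISE_SQRT enabled, the
diff below refines the estimates with one Newton-Raphson step and lets
_mm_sqrt_ps() use the exact vfsqrt instruction.  Scalar sketch of the
refinement done by the vector code (r0 is the 7-bit estimate of 1/a or
1/sqrt(a), illustrative only):

    float r1 = r0 * (2.0f - a * r0);                      /* reciprocal */
    float s1 = 1.5f * r0 + (-0.5f * a * r0) * (r0 * r0);  /* rsqrt      */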
---
 common/simd/riscv/emulation.h |  3 ++
 common/simd/riscv/sse2rvv.h   | 58 +++++++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/common/simd/riscv/emulation.h b/common/simd/riscv/emulation.h
index 46a73cb5d9..d299034cae 100644
--- a/common/simd/riscv/emulation.h
+++ b/common/simd/riscv/emulation.h
@@ -1,5 +1,8 @@
 #pragma once
 
+#define SSE2RVV_PRECISE_DIV 1
+#define SSE2RVV_PRECISE_SQRT 1
+
 #include "sse2rvv.h"
 
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
diff --git a/common/simd/riscv/sse2rvv.h b/common/simd/riscv/sse2rvv.h
index ca70f3af7b..afd8026ffa 100644
--- a/common/simd/riscv/sse2rvv.h
+++ b/common/simd/riscv/sse2rvv.h
@@ -26,6 +26,15 @@
  * SOFTWARE.
  */
 
+/* _mm_rcp_ps */
+#ifndef SSE2RVV_PRECISE_DIV
+#define SSE2RVV_PRECISE_DIV (0)
+#endif
+/* _mm_sqrt_ps and _mm_rsqrt_ps */
+#ifndef SSE2RVV_PRECISE_SQRT
+#define SSE2RVV_PRECISE_SQRT (0)
+#endif
+
 /* compiler specific definitions */
 #if defined(__GNUC__) || defined(__clang__)
 #pragma push_macro("FORCE_INLINE")
@@ -2533,16 +2542,21 @@ FORCE_INLINE __m64 _m_pshufw(__m64 a, int imm8) {
 }
 
 FORCE_INLINE __m128 _mm_rcp_ps(__m128 a) {
-  // TODO add high precision mode
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  return vreinterpretq_f32_m128(__riscv_vfrec7_v_f32m1(_a, 4));
+  vfloat32m1_t recip = __riscv_vfrec7_v_f32m1(_a, 4);
+#if SSE2RVV_PRECISE_DIV
+  vfloat32m1_t two = __riscv_vfmv_v_f_f32m1(2.0f, 4);
+  // Additional Newton-Raphson iteration for accuracy
+  recip = __riscv_vfmul_vv_f32m1(recip,
+      __riscv_vfsub_vv_f32m1(two, __riscv_vfmul_vv_f32m1(_a, recip, 4), 4), 4);
+#endif
+  return vreinterpretq_f32_m128(recip);
 }
 
 FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) {
-  // TODO add high precision mode
-  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  vfloat32m1_t recip = __riscv_vfrec7_v_f32m1(_a, 4);
-  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, recip, 0, 1));
+  return vreinterpretq_f32_m128(
+      __riscv_vslideup_vx_f32m1(vreinterpretq_m128_f32(a),
+                                vreinterpretq_m128_f32(_mm_rcp_ps(a)), 0, 1));
 }
 
 // FORCE_INLINE __m128d _mm_round_pd (__m128d a, int rounding) {}
@@ -2554,16 +2568,22 @@ FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) {
 // FORCE_INLINE __m128 _mm_round_ss (__m128 a, __m128 b, int rounding) {}
 
 FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 a) {
-  // TODO add high precision mode
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  return vreinterpretq_f32_m128(__riscv_vfrsqrt7_v_f32m1(_a, 4));
+  vfloat32m1_t recip = __riscv_vfrsqrt7_v_f32m1(_a, 4);
+#if SSE2RVV_PRECISE_SQRT
+  // Additional Newton-Raphson iteration for accuracy
+  recip = __riscv_vfadd_vv_f32m1(
+      __riscv_vfmul_vf_f32m1(recip, 1.5f, 4),
+      __riscv_vfmul_vv_f32m1(__riscv_vfmul_vv_f32m1(__riscv_vfmul_vf_f32m1(_a, -0.5f, 4), recip, 4),
+                             __riscv_vfmul_vv_f32m1(recip, recip, 4), 4), 4);
+#endif
+  return vreinterpretq_f32_m128(recip);
 }
 
 FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 a) {
-  // TODO add high precision mode
-  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  vfloat32m1_t sqrt = __riscv_vfrsqrt7_v_f32m1(_a, 4);
-  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, sqrt, 0, 1));
+  return vreinterpretq_f32_m128(
+      __riscv_vslideup_vx_f32m1(vreinterpretq_m128_f32(a),
+                                vreinterpretq_m128_f32(_mm_rsqrt_ps(a)), 0, 1));
 }
 
 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) {
@@ -2966,8 +2986,12 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) {
 
 FORCE_INLINE __m128 _mm_sqrt_ps(__m128 a) {
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  return vreinterpretq_f32_m128(
-      __riscv_vfrec7_v_f32m1(__riscv_vfrsqrt7_v_f32m1(_a, 4), 4));
+#if SSE2RVV_PRECISE_SQRT
+  return vreinterpretq_f32_m128(__riscv_vfsqrt_v_f32m1(_a, 4));
+#else
+  vfloat32m1_t recip = __riscv_vfrsqrt7_v_f32m1(_a, 4);
+  return vreinterpretq_f32_m128(__riscv_vfrec7_v_f32m1(recip, 4));
+#endif
 }
 
 FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) {
@@ -2979,9 +3003,9 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) {
 }
 
 FORCE_INLINE __m128 _mm_sqrt_ss(__m128 a) {
-  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  vfloat32m1_t rnd = __riscv_vfrec7_v_f32m1(__riscv_vfrsqrt7_v_f32m1(_a, 4), 4);
-  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1_tu(_a, rnd, 0, 1));
+  return vreinterpretq_f32_m128(
+      __riscv_vslideup_vx_f32m1_tu(vreinterpretq_m128_f32(a),
+                                   vreinterpretq_m128_f32(_mm_sqrt_ps(a)), 0, 1));
 }
 
 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) {
From 6bedb90e6fa422da95281f03674a9bb94e716ed3 Mon Sep 17 00:00:00 2001
From: Ting Chou
Date: Wed, 16 Oct 2024 15:08:24 +0800
Subject: [PATCH 4/5] Handle NaN in the second operand of _mm_min_ps and _mm_max_ps to pass hair_geometry.

---
 common/simd/riscv/emulation.h |  5 +++--
 common/simd/riscv/sse2rvv.h   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/common/simd/riscv/emulation.h b/common/simd/riscv/emulation.h
index d299034cae..19243caa1e 100644
--- a/common/simd/riscv/emulation.h
+++ b/common/simd/riscv/emulation.h
@@ -1,7 +1,8 @@
 #pragma once
 
-#define SSE2RVV_PRECISE_DIV 1
-#define SSE2RVV_PRECISE_SQRT 1
+#define SSE2RVV_PRECISE_DIV 1
+#define SSE2RVV_PRECISE_SQRT 1
+#define SSE2RVV_PRECISE_MINMAX 1
 
 #include "sse2rvv.h"
 
diff --git a/common/simd/riscv/sse2rvv.h b/common/simd/riscv/sse2rvv.h
index afd8026ffa..47c6d34ce4 100644
--- a/common/simd/riscv/sse2rvv.h
+++ b/common/simd/riscv/sse2rvv.h
@@ -34,6 +34,10 @@
 #ifndef SSE2RVV_PRECISE_SQRT
 #define SSE2RVV_PRECISE_SQRT (0)
 #endif
+/* _mm_min_ps and _mm_max_ps */
+#ifndef SSE2RVV_PRECISE_MINMAX
+#define SSE2RVV_PRECISE_MINMAX (0)
+#endif
 
 /* compiler specific definitions */
 #if defined(__GNUC__) || defined(__clang__)
@@ -2125,7 +2129,14 @@ FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) {
 FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) {
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
   vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+#if SSE2RVV_PRECISE_MINMAX
+  // Return NaN when the second operand is NaN.
+  vbool32_t isnan = __riscv_vmfne_vv_f32m1_b32(b, b, 4);
+  vfloat32m1_t max = __riscv_vfmax_vv_f32m1(_a, _b, 4);
+  return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(max, _b, isnan, 4));
+#else
   return vreinterpretq_f32_m128(__riscv_vfmax_vv_f32m1(_a, _b, 4));
+#endif
 }
 
 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) {
@@ -2201,7 +2212,13 @@ FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) {
 FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) {
   vfloat32m1_t _a = vreinterpretq_m128_f32(a);
   vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+#if SSE2RVV_PRECISE_MINMAX
+  vbool32_t isnan = __riscv_vmfne_vv_f32m1_b32(b, b, 4);
+  vfloat32m1_t min = __riscv_vfmin_vv_f32m1(_a, _b, 4);
+  return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(min, _b, isnan, 4));
+#else
   return vreinterpretq_f32_m128(__riscv_vfmin_vv_f32m1(_a, _b, 4));
+#endif
 }
 
 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) {
From 1d88f078c024192bd7e2e320603411f7636a9424 Mon Sep 17 00:00:00 2001
From: Ting Chou
Date: Thu, 7 Nov 2024 16:09:39 +0800
Subject: [PATCH 5/5] Upgrade to SSE2RVV commit f3a1d7d2.
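
Summary of the upstream changes pulled in below: the rounding-mode and
_MM_FROUND_* macros move from emulation.h into sse2rvv.h; _mm_getcsr() /
_mm_setcsr() are now backed by the scalar fcsr frm field instead of vxrm;
_mm_cvtps_epi32() and _mm_cvtss_f32() get native implementations in
sse2rvv.h; and the widening integer conversions are rewritten to truncate
to a fractional LMUL first and sign/zero-extend once, e.g. for epi8 ->
epi32 (pattern taken from the diff below):

    __riscv_vsext_vf4_i32m1(__riscv_vlmul_trunc_v_i8m1_i8mf4(_a), 4);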
---
 common/simd/riscv/emulation.h |  72 ++++------------
 common/simd/riscv/sse2rvv.h   | 149 ++++++++++++++++++++++++++--------
 2 files changed, 133 insertions(+), 88 deletions(-)

diff --git a/common/simd/riscv/emulation.h b/common/simd/riscv/emulation.h
index 19243caa1e..ca369b8271 100644
--- a/common/simd/riscv/emulation.h
+++ b/common/simd/riscv/emulation.h
@@ -9,17 +9,6 @@
 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-#define _MM_ROUND_NEAREST 0x0000
-#define _MM_ROUND_DOWN 0x2000
-#define _MM_ROUND_UP 0x4000
-#define _MM_ROUND_TOWARD_ZERO 0x6000
 /* Flush zero mode macros. */
 #define _MM_FLUSH_ZERO_MASK 0x8000
 #define _MM_FLUSH_ZERO_ON 0x8000
 #define _MM_FLUSH_ZERO_OFF 0x0000
@@ -32,18 +21,10 @@ enum _mm_hint {
   _MM_HINT_T2 = 3,
 };
 
-__forceinline __m128i _mm_cvtps_epi32(__m128 a) {
-  return __riscv_vfcvt_x_f_v_i32m1(a, 4);
-}
-
 __forceinline int _mm_cvtsi128_si32(__m128i a) {
   return __riscv_vmv_x_s_i32m1_i32(a);
 }
 
-__forceinline float _mm_cvtss_f32 (__m128 a) {
-  return __riscv_vfmv_f_s_f32m1_f32(a);
-}
-
 __forceinline __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm8) {
   vfloat32m1_t zeros = __riscv_vfmv_v_f_f32m1(0, 4);
   vbool32_t high = __riscv_vreinterpret_v_i32m1_b32(__riscv_vmv_s_x_i32m1(imm8 >> 4, 1));
@@ -57,28 +38,27 @@ __forceinline __int64 _mm_cvtsi128_si64 (__m128i a) {
 }
 
 __forceinline unsigned int _mm_getcsr(void) {
-  return 0;
+  union {
+    fcsr_bitfield field;
+    uint32_t value;
+  } r;
+
+  __asm__ volatile("csrr %0, fcsr" : "=r"(r));
+
+  switch (r.field.frm) {
+  case __RISCV_FRM_RTZ:
+    return _MM_ROUND_TOWARD_ZERO;
+  case __RISCV_FRM_RDN:
+    return _MM_ROUND_DOWN;
+  case __RISCV_FRM_RUP:
+    return _MM_ROUND_UP;
+  default:
+    return _MM_ROUND_NEAREST;
+  }
 }
 
 __forceinline void _mm_setcsr(unsigned int a) {
-  int rm;
-
-  switch (a) {
-  case _MM_ROUND_TOWARD_ZERO:
-    // FIXME: I can't find the straightforward mapping of this.
-    rm = 0b01;
-    break;
-  case _MM_ROUND_DOWN:
-    rm = 0b10;
-    break;
-  case _MM_ROUND_UP:
-    rm = 0b00;
-    break;
-  default: //_MM_ROUND_NEAREST
-    rm = 0b01;
-  }
-
-  asm volatile("csrw vxrm,%0" :: "r"(rm));
+  _MM_SET_ROUNDING_MODE(a);
 }
 
 __forceinline void _mm_mfence (void) {
@@ -95,22 +75,6 @@ __forceinline void _mm_prefetch (char const* p, int i) {
   __builtin_prefetch(p);
 }
 
-__forceinline __m128 _mm_round_ps(__m128 a, int rounding) {
-  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
-  switch (rounding) {
-  case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
-    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(_a, 0, 4), 4);
-  case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
-    return _mm_floor_ps(a);
-  case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
-    return _mm_ceil_ps(a);
-  case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
-    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(_a, 1, 4), 4);
-  default: //_MM_FROUND_CUR_DIRECTION
-    return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(_a, 4), 4);
-  }
-}
-
 __forceinline int _mm_popcnt_u32(unsigned int a) {
   return __builtin_popcount(a);
 }
diff --git a/common/simd/riscv/sse2rvv.h b/common/simd/riscv/sse2rvv.h
index 47c6d34ce4..643f1cf259 100644
--- a/common/simd/riscv/sse2rvv.h
+++ b/common/simd/riscv/sse2rvv.h
@@ -7,6 +7,12 @@
 /*
  * sse2rvv is freely redistributable under the MIT License.
  *
+ * Copyright (c) 2023-2024 SSE2RVV Contributors.
+ *
+ * Contributors to this work are:
+ * Yang Hau
+ * Cheng-Hao
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
@@ -255,6 +261,49 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
 #endif
 #endif
 
+// XRM
+// #define __RISCV_VXRM_RNU 0 // round-to-nearest-up (add +0.5 LSB)
+// #define __RISCV_VXRM_RNE 1 // round-to-nearest-even
+// #define __RISCV_VXRM_RDN 2 // round-down (truncate)
+// #define __RISCV_VXRM_ROD 3 // round-to-odd (OR bits into LSB, aka "jam")
+// FRM
+// #define __RISCV_FRM_RNE 0 // round to nearest, ties to even
+// #define __RISCV_FRM_RTZ 1 // round towards zero
+// #define __RISCV_FRM_RDN 2 // round down (towards -infinity)
+// #define __RISCV_FRM_RUP 3 // round up (towards +infinity)
+// #define __RISCV_FRM_RMM 4 // round to nearest, ties to max magnitude
+
+// The bit field mapping to the FCSR (floating-point control and status
+// register)
+typedef struct {
+  uint8_t nx : 1;
+  uint8_t uf : 1;
+  uint8_t of : 1;
+  uint8_t dz : 1;
+  uint8_t nv : 1;
+  uint8_t frm : 3;
+  uint32_t reserved : 24;
+} fcsr_bitfield;
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+#define _MM_FROUND_NO_EXC 0x08
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
 // forward declaration
 FORCE_INLINE int _mm_extract_pi16(__m64 a, int imm8);
 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b);
@@ -1352,21 +1401,19 @@ FORCE_INLINE int _mm_cvt_ss2si(__m128 a) {
 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) {
   vint16m1_t _a = vreinterpretq_m128i_i16(a);
   return vreinterpretq_i32_m128i(
-      __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(_a, 4)));
+      __riscv_vsext_vf2_i32m1(__riscv_vlmul_trunc_v_i16m1_i16mf2(_a), 4));
 }
 
 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) {
   vint16m1_t _a = vreinterpretq_m128i_i16(a);
-  vint32m1_t a_ext =
-      __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(_a, 4));
   return vreinterpretq_i64_m128i(
-      __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(a_ext, 2)));
+      __riscv_vsext_vf4_i64m1(__riscv_vlmul_trunc_v_i16m1_i16mf4(_a), 2));
 }
 
 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) {
   vint32m1_t _a = vreinterpretq_m128i_i32(a);
   return vreinterpretq_i64_m128i(
-      __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(_a, 2)));
+      __riscv_vsext_vf2_i64m1(__riscv_vlmul_trunc_v_i32m1_i32mf2(_a), 2));
 }
 
 FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) {
@@ -1384,69 +1431,55 @@ FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) {
 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) {
   vint8m1_t _a = vreinterpretq_m128i_i8(a);
   return vreinterpretq_i16_m128i(
-      __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8)));
+      __riscv_vsext_vf2_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(_a), 8));
 }
 
 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) {
   vint8m1_t _a = vreinterpretq_m128i_i8(a);
-  vint16m1_t a_ext =
-      __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8));
   return vreinterpretq_i32_m128i(
-      __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(a_ext, 4)));
+      __riscv_vsext_vf4_i32m1(__riscv_vlmul_trunc_v_i8m1_i8mf4(_a), 4));
 }
 
 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) {
   vint8m1_t _a = vreinterpretq_m128i_i8(a);
-  vint16m1_t a_ext1 =
-      __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vsext_vf2_i16m2(_a, 8));
-  vint32m1_t a_ext2 =
-      __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vsext_vf2_i32m2(a_ext1, 4));
   return vreinterpretq_i64_m128i(
-      __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vsext_vf2_i64m2(a_ext2, 2)));
+      __riscv_vsext_vf8_i64m1(__riscv_vlmul_trunc_v_i8m1_i8mf8(_a), 2));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) {
   vuint16m1_t _a = vreinterpretq_m128i_u16(a);
   return vreinterpretq_u32_m128i(
-      __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(_a, 4)));
+      __riscv_vzext_vf2_u32m1(__riscv_vlmul_trunc_v_u16m1_u16mf2(_a), 4));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) {
   vuint16m1_t _a = vreinterpretq_m128i_u16(a);
-  vuint32m1_t a_ext =
-      __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(_a, 4));
   return vreinterpretq_u64_m128i(
-      __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(a_ext, 2)));
+      __riscv_vzext_vf4_u64m1(__riscv_vlmul_trunc_v_u16m1_u16mf4(_a), 2));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) {
   vuint32m1_t _a = vreinterpretq_m128i_u32(a);
   return vreinterpretq_u64_m128i(
-      __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(_a, 2)));
+      __riscv_vzext_vf2_u64m1(__riscv_vlmul_trunc_v_u32m1_u32mf2(_a), 2));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) {
   vuint8m1_t _a = vreinterpretq_m128i_u8(a);
   return vreinterpretq_u16_m128i(
-      __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8)));
+      __riscv_vzext_vf2_u16m1(__riscv_vlmul_trunc_v_u8m1_u8mf2(_a), 8));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) {
   vuint8m1_t _a = vreinterpretq_m128i_u8(a);
-  vuint16m1_t a_ext =
-      __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8));
   return vreinterpretq_u32_m128i(
-      __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(a_ext, 4)));
+      __riscv_vzext_vf4_u32m1(__riscv_vlmul_trunc_v_u8m1_u8mf4(_a), 4));
 }
 
 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) {
   vuint8m1_t _a = vreinterpretq_m128i_u8(a);
-  vuint16m1_t a_ext1 =
-      __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(_a, 8));
-  vuint32m1_t a_ext2 =
-      __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(a_ext1, 4));
   return vreinterpretq_u64_m128i(
-      __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vzext_vf2_u64m2(a_ext2, 2)));
+      __riscv_vzext_vf8_u64m1(__riscv_vlmul_trunc_v_u8m1_u8mf8(_a), 2));
 }
 
 // FORCE_INLINE __m128i _mm_cvtpd_epi32 (__m128d a) {}
@@ -1465,7 +1498,10 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) {
 
 // FORCE_INLINE __m128 _mm_cvtpi8_ps (__m64 a) {}
 
-// FORCE_INLINE __m128i _mm_cvtps_epi32 (__m128 a) {}
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  return vreinterpretq_i32_m128i(__riscv_vfcvt_x_f_v_i32m1(_a, 4));
+}
 
 // FORCE_INLINE __m128d _mm_cvtps_pd (__m128 a) {}
 
@@ -1511,7 +1547,10 @@ FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) {
 
 // FORCE_INLINE __m128i _mm_cvtsi64x_si128 (__int64 a) {}
 
-// FORCE_INLINE float _mm_cvtss_f32 (__m128 a) {}
+FORCE_INLINE float _mm_cvtss_f32(__m128 a) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  return (float)__riscv_vfmv_f_s_f32m1_f32(_a);
+}
 
 // FORCE_INLINE __m128d _mm_cvtss_sd (__m128d a, __m128 b) {}
 
@@ -2578,7 +2617,26 @@ FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) {
 
 // FORCE_INLINE __m128d _mm_round_pd (__m128d a, int rounding) {}
 
-// FORCE_INLINE __m128 _mm_round_ps (__m128 a, int rounding) {}
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  switch (rounding) {
+  case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+    return vreinterpretq_f32_m128(__riscv_vfcvt_f_x_v_f32m1(
+        __riscv_vfcvt_x_f_v_i32m1_rm(_a, __RISCV_FRM_RNE, 4), 4));
+  case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+    return vreinterpretq_f32_m128(__riscv_vfcvt_f_x_v_f32m1(
+        __riscv_vfcvt_x_f_v_i32m1_rm(_a, __RISCV_FRM_RDN, 4), 4));
+  case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+    return vreinterpretq_f32_m128(__riscv_vfcvt_f_x_v_f32m1(
+        __riscv_vfcvt_x_f_v_i32m1_rm(_a, __RISCV_FRM_RUP, 4), 4));
+  case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+    return vreinterpretq_f32_m128(__riscv_vfcvt_f_x_v_f32m1(
+        __riscv_vfcvt_x_f_v_i32m1_rm(_a, __RISCV_FRM_RTZ, 4), 4));
+  default: //_MM_FROUND_CUR_DIRECTION
+    return vreinterpretq_f32_m128(
+        __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(_a, 4), 4));
+  }
+}
 
 // FORCE_INLINE __m128d _mm_round_sd (__m128d a, __m128d b, int rounding) {}
 
@@ -2684,7 +2742,30 @@ FORCE_INLINE __m128 _mm_set_ps1(float a) {
   return vreinterpretq_f32_m128(__riscv_vfmv_v_f_f32m1(a, 4));
 }
 
-// FORCE_INLINE void _MM_SET_ROUNDING_MODE (unsigned int a) {}
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(unsigned int a) {
+  union {
+    fcsr_bitfield field;
+    uint32_t value;
+  } r;
+
+  __asm__ volatile("csrr %0, fcsr" : "=r"(r));
+
+  switch (a) {
+  case _MM_ROUND_TOWARD_ZERO:
+    r.field.frm = __RISCV_FRM_RTZ;
+    break;
+  case _MM_ROUND_DOWN:
+    r.field.frm = __RISCV_FRM_RDN;
+    break;
+  case _MM_ROUND_UP:
+    r.field.frm = __RISCV_FRM_RUP;
+    break;
+  default: //_MM_ROUND_NEAREST
+    r.field.frm = __RISCV_FRM_RNE;
+  }
+
+  __asm__ volatile("csrw fcsr, %0" : : "r"(r));
+}
 
 FORCE_INLINE __m128d _mm_set_sd(double a) {
   double arr[2] = {a, 0};
@@ -3006,8 +3087,8 @@ FORCE_INLINE __m128 _mm_sqrt_ps(__m128 a) {
 #if SSE2RVV_PRECISE_SQRT
   return vreinterpretq_f32_m128(__riscv_vfsqrt_v_f32m1(_a, 4));
 #else
-  vfloat32m1_t recip = __riscv_vfrsqrt7_v_f32m1(_a, 4);
-  return vreinterpretq_f32_m128(__riscv_vfrec7_v_f32m1(recip, 4), 4);
+  return vreinterpretq_f32_m128(
+      __riscv_vfrec7_v_f32m1(__riscv_vfrsqrt7_v_f32m1(_a, 4), 4));
#endif
 }
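
Note (not part of the patch): with _mm_getcsr, _MM_SET_ROUNDING_MODE and the
_MM_FROUND_CUR_DIRECTION path of _mm_round_ps now all driven by the frm field
of fcsr, the new plumbing can be exercised with a small standalone program.
The sketch below is illustrative only and makes assumptions not stated in the
patch: an rv64gcv toolchain, and that emulation.h from this series is
reachable through the include path shown (the path is a placeholder).

// Minimal sketch, not part of this series: checks that the dynamic rounding
// mode set via _MM_SET_ROUNDING_MODE is picked up by _mm_round_ps and
// reported back by _mm_getcsr. Assumes "emulation.h" resolves to
// common/simd/riscv/emulation.h and the target supports RVV.
#include <stdio.h>
#include "emulation.h"

int main(void) {
  __m128 v = _mm_set_ps1(2.5f);

  // Round toward -infinity: 2.5 -> 2.0; _mm_getcsr() should report _MM_ROUND_DOWN.
  _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
  float down = _mm_cvtss_f32(_mm_round_ps(v, _MM_FROUND_CUR_DIRECTION));

  // Round toward +infinity: 2.5 -> 3.0; _mm_getcsr() should report _MM_ROUND_UP.
  _MM_SET_ROUNDING_MODE(_MM_ROUND_UP);
  float up = _mm_cvtss_f32(_mm_round_ps(v, _MM_FROUND_CUR_DIRECTION));

  printf("down=%.1f up=%.1f csr=0x%x\n", down, up, _mm_getcsr());
  return 0;
}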