diff --git a/src/geometry.cpp b/src/geometry.cpp index 90fc203d..b5311f19 100644 --- a/src/geometry.cpp +++ b/src/geometry.cpp @@ -3,22 +3,31 @@ namespace Clockwork::geometry { -// clang-format off // Offset arrangement is AVX2-specific (due to punpck-ordering). constexpr std::array AVX2_OFFSETS{{ - 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, - 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, - 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, - 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, - 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, - 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, - 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, - 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377, + 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, // + 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, // + 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, // + 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, // + 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, // + 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, // + 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, // + 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377, // }}; -// clang-format on +constexpr std::array AVX512_OFFSETS{{ + 0210, 0211, 0212, 0213, 0214, 0215, 0216, 0217, // + 0230, 0231, 0232, 0233, 0234, 0235, 0236, 0237, // + 0250, 0251, 0252, 0253, 0254, 0255, 0256, 0257, // + 0270, 0271, 0272, 0273, 0274, 0275, 0276, 0277, // + 0310, 0311, 0312, 0313, 0314, 0315, 0316, 0317, // + 0330, 0331, 0332, 0333, 0334, 0335, 0336, 0337, // + 0350, 0351, 0352, 0353, 0354, 0355, 0356, 0357, // + 0370, 0371, 0372, 0373, 0374, 0375, 0376, 0377, // +}}; -const std::array SUPERPIECE_INVERSE_RAYS_AVX2_TABLE = []() { +template OFFSETS, u8 RAY_OFFSET> +consteval std::array calc_superpiece_inverse_rays_table() { // clang-format off constexpr u8 NONE = 0x80; constexpr std::array BASE{{ @@ -46,15 +55,25 @@ const std::array SUPERPIECE_INVERSE_RAYS_AVX2_TABLE = []() { u8 esq = internal::expand_sq(Square{sq}); std::array b; for (usize i = 0; i < 64; i++) { - u8 value = BASE[AVX2_OFFSETS[i] - esq]; + u8 value = BASE[OFFSETS[i] - esq]; + value = value != NONE ? (value + RAY_OFFSET) % 64 : NONE; b[i] = value; } table[sq] = u8x64{b}; } return table; -}(); +} + -const std::array PIECE_MOVES_AVX2_TABLE = []() { +const std::array SUPERPIECE_INVERSE_RAYS_AVX2_TABLE = + calc_superpiece_inverse_rays_table(); +const std::array SUPERPIECE_INVERSE_RAYS_AVX512_TABLE = + calc_superpiece_inverse_rays_table(); +const std::array SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE = + calc_superpiece_inverse_rays_table(); + +template OFFSETS> +consteval std::array calc_piece_moves_table() { // clang-format off constexpr u8 K = 1 << static_cast(PieceType::King); constexpr u8 Q = 1 << static_cast(PieceType::Queen); @@ -92,11 +111,14 @@ const std::array PIECE_MOVES_AVX2_TABLE = []() { u8 esq = internal::expand_sq(Square{sq}); std::array b; for (usize i = 0; i < 64; i++) { - b[i] = BASE[AVX2_OFFSETS[i] - esq]; + b[i] = BASE[OFFSETS[i] - esq]; } table[sq] = u8x64{b}; } return table; -}(); +} + +const std::array PIECE_MOVES_AVX2_TABLE = calc_piece_moves_table(); +const std::array PIECE_MOVES_AVX512_TABLE = calc_piece_moves_table(); } // namespace Clockwork::geometry diff --git a/src/geometry.hpp b/src/geometry.hpp index 6af979b6..9aeb82a0 100644 --- a/src/geometry.hpp +++ b/src/geometry.hpp @@ -124,6 +124,18 @@ forceinline u8x64 superpiece_inverse_rays_avx2(Square sq) { return SUPERPIECE_INVERSE_RAYS_AVX2_TABLE[sq.raw]; } +extern const std::array SUPERPIECE_INVERSE_RAYS_AVX512_TABLE; + +forceinline u8x64 superpiece_inverse_rays_avx512(Square sq) { + return SUPERPIECE_INVERSE_RAYS_AVX512_TABLE[sq.raw]; +} + +extern const std::array SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE; + +forceinline u8x64 superpiece_inverse_rays_flipped_avx512(Square sq) { + return SUPERPIECE_INVERSE_RAYS_FLIPPED_AVX512_TABLE[sq.raw]; +} + extern const std::array PIECE_MOVES_AVX2_TABLE; forceinline m8x64 piece_moves_avx2(bool color, PieceType ptype, Square sq) { @@ -134,6 +146,16 @@ forceinline m8x64 piece_moves_avx2(bool color, PieceType ptype, Square sq) { return table.test(bit); } +extern const std::array PIECE_MOVES_AVX512_TABLE; + +forceinline m8x64 piece_moves_avx512(bool color, PieceType ptype, Square sq) { + assert(ptype != PieceType::None); + i32 index = ptype == PieceType::Pawn ? color : static_cast(ptype); + u8x64 bit = u8x64::splat(static_cast(1 << index)); + u8x64 table = PIECE_MOVES_AVX512_TABLE[sq.raw]; + return table.test(bit); +} + forceinline u8x64 slider_broadcast(u8x64 x) { #if LPS_AVX512 constexpr u8 NONE = 0xFF; @@ -172,7 +194,19 @@ forceinline u8x64 slider_broadcast(u8x64 x) { } forceinline u8x64 lane_broadcast(u8x64 x) { -#if LPS_AVX2 +#if LPS_AVX512 + u8x64 EXPAND_IDX{{ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, // + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, // + 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, // + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, // + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, // + 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, // + 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, 0x38, // + }}; + return EXPAND_IDX.swizzle(u8x64{_mm512_sad_epu8(x.raw, _mm512_setzero_si512())}); +#elif LPS_AVX2 u8x64 y; y.raw[0].raw = _mm256_sad_epu8(x.raw[0].raw, _mm256_setzero_si256()); y.raw[1].raw = _mm256_sad_epu8(x.raw[1].raw, _mm256_setzero_si256()); diff --git a/src/movegen.cpp b/src/movegen.cpp index 0a5dc172..ce4a4bed 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -443,10 +443,21 @@ bool MoveGen::is_hside_castling_legal(Bitboard empty, Bitboard danger) const { } void MoveGen::write(MoveList& moves, Square dest, PieceMask piecemask, MoveFlags mf) { +#if LPS_AVX512 + moves.unsafe_append([&](Move* data) { + u16x16 vec = + u16x16::splat(Move{Square{0}, dest, mf}.raw) + | u16x16{_mm256_cvtepu8_epi16(m_position.piece_list_sq(m_active_color).to_vector().raw)}; + vec = m16x16{piecemask.value()}.compress(vec); + std::memcpy(data, &vec, sizeof(vec)); + return piecemask.popcount(); + }); +#else for (PieceId id : piecemask) { Square src = m_position.piece_list_sq(m_active_color)[id]; moves.push_back(Move{src, dest, mf}); } +#endif } void MoveGen::write(MoveList& moves, @@ -460,10 +471,32 @@ void MoveGen::write(MoveList& moves, } void MoveGen::write_pawn(MoveList& moves, Bitboard src_bb, i32 shift, MoveFlags mf) { +#if LPS_AVX512 + u16x32 base = []() consteval { + std::array base; + for (usize i = 0; i < 32; i++) { + Square src{static_cast(i)}; + Square dest{static_cast(i)}; + base[i] = Move{src, dest, static_cast(0)}; + } + return std::bit_cast(base); + }(); + for (int i : {0, 32}) { + moves.unsafe_append([&](Move* data) { + m16x32 mask{static_cast(src_bb.value() >> i)}; + u16x32 vec = + u16x32::splat(static_cast(i + ((i + shift) << 6) + static_cast(mf))) + base; + vec = mask.compress(vec); + std::memcpy(data, &vec, sizeof(vec)); + return static_cast(std::popcount(mask.raw)); + }); + } +#else for (Square src : src_bb) { Square dest{static_cast(src.raw + shift)}; moves.push_back(Move{src, dest, mf}); } +#endif } bool MoveGen::is_ep_clearance_pinned(PieceMask ep_attackers_mask) const { diff --git a/src/position.cpp b/src/position.cpp index 1174d6e4..948c7abf 100644 --- a/src/position.cpp +++ b/src/position.cpp @@ -175,22 +175,44 @@ void Position::incrementally_move_piece( dst_slider_ids = dst_raymask.mask(geometry::flip_rays(dst_slider_ids)); // flip rays dst_slider_ids |= dst_raymask.mask(u8x64::splat(0x20)); // pack information for efficiency +#if LPS_AVX512 + u8x64 src_inv_perm = geometry::superpiece_inverse_rays_avx512(from); + u8x64 dst_inv_perm = geometry::superpiece_inverse_rays_avx512(to); +#else u8x64 src_inv_perm = geometry::superpiece_inverse_rays_avx2(from); u8x64 dst_inv_perm = geometry::superpiece_inverse_rays_avx2(to); +#endif // Transform into board layout src_slider_ids = src_inv_perm.swizzle(src_slider_ids); dst_slider_ids = dst_inv_perm.swizzle(dst_slider_ids); // Recover color information - u8x64 src_col = src_slider_ids.test(u8x64::splat(0x10)).to_vector(); - u8x64 dst_col = dst_slider_ids.test(u8x64::splat(0x10)).to_vector(); + m8x64 src_col = src_slider_ids.test(u8x64::splat(0x10)); + m8x64 dst_col = dst_slider_ids.test(u8x64::splat(0x10)); // Recover ray mask information m8x64 ret = dst_slider_ids.test(u8x64::splat(0x20)); src_slider_ids &= u8x64::splat(0x0F); dst_slider_ids &= u8x64::splat(0x0F); +#if LPS_AVX512 + u16x64 src_slider_ids2 = std::bit_cast( + std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(src_slider_ids.raw)), + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(src_slider_ids.raw, 1))}); + u16x64 dst_slider_ids2 = std::bit_cast( + std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(dst_slider_ids.raw)), + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(dst_slider_ids.raw, 1))}); + + u16x64 src_at = m16x64{src_slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << src_slider_ids2); + u16x64 dst_at = m16x64{dst_slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << dst_slider_ids2); + + m16x64 src_color{src_col.raw}; + m16x64 dst_color{dst_col.raw}; + + m_attack_table[0].raw ^= (~src_color).mask(src_at) ^ (~dst_color).mask(dst_at); + m_attack_table[1].raw ^= src_color.mask(src_at) ^ dst_color.mask(dst_at); +#else // AVX2 doesn't have a variable word shift, so were're doing it this way. // Index zero is invalid here (the king is never a slider), so 0 converts to 0. static const u8x16 BITS_LO{{0x00, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, // @@ -202,10 +224,10 @@ void Position::incrementally_move_piece( u8x64 dst_at_lo = dst_slider_ids.swizzle(BITS_LO); u8x64 dst_at_hi = dst_slider_ids.swizzle(BITS_HI); - u8x64 src_color0 = src_col.zip_low_128lanes(src_col); - u8x64 src_color1 = src_col.zip_high_128lanes(src_col); - u8x64 dst_color0 = dst_col.zip_low_128lanes(dst_col); - u8x64 dst_color1 = dst_col.zip_high_128lanes(dst_col); + u8x64 src_color0 = src_col.to_vector().zip_low_128lanes(src_col.to_vector()); + u8x64 src_color1 = src_col.to_vector().zip_high_128lanes(src_col.to_vector()); + u8x64 dst_color0 = dst_col.to_vector().zip_low_128lanes(dst_col.to_vector()); + u8x64 dst_color1 = dst_col.to_vector().zip_high_128lanes(dst_col.to_vector()); u16x64 src_color = std::bit_cast(std::array{src_color0, src_color1}); u16x64 dst_color = std::bit_cast(std::array{dst_color0, dst_color1}); @@ -220,6 +242,7 @@ void Position::incrementally_move_piece( m_attack_table[0].raw ^= src_at.andnot(src_color) ^ dst_at.andnot(dst_color); m_attack_table[1].raw ^= (src_at & src_color) ^ (dst_at & dst_color); +#endif add_attacks(color, p.id(), to, p.ptype(), ret); } @@ -242,18 +265,34 @@ m8x64 Position::toggle_rays(Square sq) { slider_ids = raymask.mask(geometry::flip_rays(slider_ids)); // flip rays slider_ids |= raymask.mask(u8x64::splat(0x20)); // pack information for efficiency +#if LPS_AVX512 + u8x64 inv_perm = geometry::superpiece_inverse_rays_avx512(sq); +#else u8x64 inv_perm = geometry::superpiece_inverse_rays_avx2(sq); +#endif // Transform into board layout slider_ids = inv_perm.swizzle(slider_ids); // Recover color information - u8x64 col = slider_ids.test(u8x64::splat(0x10)).to_vector(); + m8x64 col = slider_ids.test(u8x64::splat(0x10)); // Recover ray mask information m8x64 ret = slider_ids.test(u8x64::splat(0x20)); slider_ids &= u8x64::splat(0x0F); +#if LPS_AVX512 + u16x64 slider_ids2 = std::bit_cast( + std::array{_mm512_cvtepu8_epi16(_mm512_castsi512_si256(slider_ids.raw)), + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(slider_ids.raw, 1))}); + + u16x64 at = m16x64{slider_ids.nonzeros().raw}.mask(u16x64::splat(1) << slider_ids2); + + m16x64 color{col.raw}; + + m_attack_table[0].raw ^= (~color).mask(at); + m_attack_table[1].raw ^= color.mask(at); +#else // AVX2 doesn't have a variable word shift, so were're doing it this way. // Index zero is invalid here (the king is never a slider), so 0 converts to 0. static const u8x16 BITS_LO{{0x00, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, // @@ -263,8 +302,8 @@ m8x64 Position::toggle_rays(Square sq) { u8x64 at_lo = slider_ids.swizzle(BITS_LO); u8x64 at_hi = slider_ids.swizzle(BITS_HI); - u8x64 color0 = col.zip_low_128lanes(col); - u8x64 color1 = col.zip_high_128lanes(col); + u8x64 color0 = col.to_vector().zip_low_128lanes(col.to_vector()); + u8x64 color1 = col.to_vector().zip_high_128lanes(col.to_vector()); u16x64 color = std::bit_cast(std::array{color0, color1}); u8x64 at0 = at_lo.zip_low_128lanes(at_hi); @@ -273,6 +312,7 @@ m8x64 Position::toggle_rays(Square sq) { m_attack_table[0].raw ^= at.andnot(color); m_attack_table[1].raw ^= at & color; +#endif return ret; } @@ -293,7 +333,11 @@ void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype) { u8x64 ray_places = ray_coords.swizzle(m_board.to_vector()); m8x64 raymask = geometry::superpiece_attacks(ray_places, ray_valid); - u8x64 inv_perm = geometry::superpiece_inverse_rays_avx2(sq); +#if LPS_AVX512 + u8x64 inv_perm = geometry::superpiece_inverse_rays_avx512(sq); +#else + u8x64 inv_perm = geometry::superpiece_inverse_rays_avx2(sq); +#endif m8x64 boardmask = inv_perm.swizzle(raymask); add_attacks(color, id, sq, ptype, boardmask); @@ -303,14 +347,22 @@ void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype) { } void Position::add_attacks(bool color, PieceId id, Square sq, PieceType ptype, m8x64 mask) { - u8x64 moves = (mask & geometry::piece_moves_avx2(color, ptype, sq)).to_vector(); + u16x64 bit = u16x64::splat(id.to_piece_mask().value()); - u8x64 m0 = moves.zip_low_128lanes(moves); - u8x64 m1 = moves.zip_high_128lanes(moves); +#if LPS_AVX512 + m8x64 moves = mask & geometry::piece_moves_avx512(color, ptype, sq); + + m_attack_table[color].raw |= m16x64{moves.raw}.mask(bit); +#else + m8x64 moves = mask & geometry::piece_moves_avx2(color, ptype, sq); + u8x64 moves_vec = moves.to_vector(); + + u8x64 m0 = moves_vec.zip_low_128lanes(moves_vec); + u8x64 m1 = moves_vec.zip_high_128lanes(moves_vec); u16x64 m = std::bit_cast(std::array{m0, m1}); - u16x64 bit = u16x64::splat(id.to_piece_mask().value()); m_attack_table[color].raw |= m & bit; +#endif } template @@ -523,13 +575,14 @@ std::tuple Position::calc_pin_mask() const { // Does this ray have a pinner? #if LPS_AVX512 - m8x64 no_pinner_mask{ - std::bit_cast(std::bit_cast(pinner.to_vector()).zeros().to_vector()) - .to_bits()}; + const m8x16 has_attacker_vecmask = u8x16{_mm_set1_epi64x(pinner.raw)}.nonzeros(); + const m8x64 pinned = m8x64{static_cast( + _mm_cvtsi128_si64(has_attacker_vecmask.mask(u8x16{_mm_set1_epi64x(maybe_pinned.raw)}).raw))}; #else m8x64 no_pinner_mask = std::bit_cast(std::bit_cast(pinner).to_vector().zeros()); + m8x64 pinned = maybe_pinned.andnot(no_pinner_mask); #endif - m8x64 pinned = maybe_pinned.andnot(no_pinner_mask); + u8x64 nonmasked_pinned_ids = geometry::lane_broadcast(pinned.mask(ray_places & u8x64::splat(0xF))); diff --git a/src/util/static_vector.hpp b/src/util/static_vector.hpp index 3d039d0c..6dde8c23 100644 --- a/src/util/static_vector.hpp +++ b/src/util/static_vector.hpp @@ -92,6 +92,12 @@ class StaticVector { return res; } + template + void unsafe_append(F&& f) { + usize sz = f(end()); + m_len += sz; + } + void append(const StaticVector& other) { assert(m_len + other.m_len <= cap); std::uninitialized_copy(other.begin(), other.end(), end());