Skip to content

Commit 4fc5b6d

Browse files
authored
[libc++] Optimize {std,ranges}::for_each for iterating over __trees (#164405)
This patch optimizes how `for_each` iterates over trees by using recursion and storing pointers to the next nodes on the stack. This avoids pointer chasing through the `__parent_` pointer, reducing cache misses. It also takes advantage of the compiler being able to tail-call optimize the recursive function, which removes the back-tracking the iterators otherwise have to do.

```
Benchmark                                            old             new    Difference    % Difference
--------------------------------------------------  --------------  --------------  ------------  --------------
rng::for_each(map<int>)/32                                   35.19           26.67         -8.52         -24.21%
rng::for_each(map<int>)/50                                   64.13           40.68        -23.45         -36.57%
rng::for_each(map<int>)/8                                     5.06            6.49          1.43          28.21%
rng::for_each(map<int>)/8192                              22893.89         9266.68     -13627.21         -59.52%
rng::for_each(map<int>::iterator)/32                         35.51           26.88         -8.63         -24.31%
rng::for_each(map<int>::iterator)/50                         64.39           41.24        -23.15         -35.95%
rng::for_each(map<int>::iterator)/8                           5.12            5.93          0.81          15.80%
rng::for_each(map<int>::iterator)/8192                    21283.14         9736.83     -11546.31         -54.25%
rng::for_each(multimap<int>)/32                              35.22           26.61         -8.61         -24.45%
rng::for_each(multimap<int>)/50                              64.10           40.07        -24.03         -37.49%
rng::for_each(multimap<int>)/8                                5.08            6.69          1.61          31.70%
rng::for_each(multimap<int>)/8192                         23130.44         9026.16     -14104.28         -60.98%
rng::for_each(multimap<int>::iterator)/32                    35.40           25.08        -10.32         -29.15%
rng::for_each(multimap<int>::iterator)/50                    64.19           38.15        -26.04         -40.56%
rng::for_each(multimap<int>::iterator)/8                      5.04            5.25          0.22           4.31%
rng::for_each(multimap<int>::iterator)/8192               22875.97         9392.08     -13483.89         -58.94%
rng::for_each(multiset<int>)/32                              35.82           27.11         -8.72         -24.33%
rng::for_each(multiset<int>)/50                              62.92           41.59        -21.32         -33.89%
rng::for_each(multiset<int>)/8                                4.79            6.79          2.00          41.70%
rng::for_each(multiset<int>)/8192                         22642.68         9280.95     -13361.73         -59.01%
rng::for_each(multiset<int>::iterator)/32                    35.76           26.71         -9.04         -25.28%
rng::for_each(multiset<int>::iterator)/50                    63.44           39.00        -24.44         -38.53%
rng::for_each(multiset<int>::iterator)/8                      4.90            5.21          0.30           6.18%
rng::for_each(multiset<int>::iterator)/8192               19930.45         9867.60     -10062.85         -50.49%
rng::for_each(set<int>)/32                                   35.90           27.30         -8.60         -23.96%
rng::for_each(set<int>)/50                                   63.15           40.75        -22.40         -35.47%
rng::for_each(set<int>)/8                                     4.77            6.83          2.06          43.23%
rng::for_each(set<int>)/8192                              20262.77         9381.57     -10881.20         -53.70%
rng::for_each(set<int>::iterator)/32                         36.02           26.42         -9.60         -26.64%
rng::for_each(set<int>::iterator)/50                         63.29           37.97        -25.32         -40.01%
rng::for_each(set<int>::iterator)/8                           4.72            5.22          0.50          10.50%
rng::for_each(set<int>::iterator)/8192                    20041.91         9831.91     -10210.00         -50.94%
```
1 parent d0d8359 commit 4fc5b6d

File tree

11 files changed

+562
-5
lines changed

11 files changed

+562
-5
lines changed

libcxx/docs/ReleaseNotes/22.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ Improvements and New Features
8686
This is guarded behind the ABI Macro ``_LIBCPP_ABI_ATOMIC_WAIT_NATIVE_BY_SIZE``.
8787
- The performance of ``vector<bool>::reserve()`` has been improved by up to 2x.
8888

89+
- ``std::for_each`` and ``ranges::for_each`` have been optimized to iterate more efficiently over the associative
90+
containers, resulting in performance improvements of up to 2x.
8991
- The ``num_get::do_get`` integral overloads have been optimized, resulting in a performance improvement of up to 2.8x.
9092

9193
Deprecations and Removals

libcxx/include/__algorithm/for_each.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define _LIBCPP___ALGORITHM_FOR_EACH_H
1212

1313
#include <__algorithm/for_each_segment.h>
14+
#include <__algorithm/specialized_algorithms.h>
1415
#include <__config>
1516
#include <__functional/identity.h>
1617
#include <__iterator/segmented_iterator.h>
@@ -27,7 +28,12 @@ template <class _InputIterator, class _Sent, class _Func, class _Proj>
2728
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
2829
__for_each(_InputIterator __first, _Sent __last, _Func& __func, _Proj& __proj) {
2930
#ifndef _LIBCPP_CXX03_LANG
30-
if constexpr (is_same<_InputIterator, _Sent>::value && __is_segmented_iterator_v<_InputIterator>) {
31+
if constexpr (using _SpecialAlg =
32+
__specialized_algorithm<_Algorithm::__for_each, __iterator_pair<_InputIterator, _Sent>>;
33+
_SpecialAlg::__has_algorithm) {
34+
_SpecialAlg()(__first, __last, __func, __proj);
35+
return __last;
36+
} else if constexpr (is_same<_InputIterator, _Sent>::value && __is_segmented_iterator_v<_InputIterator>) {
3137
using __local_iterator_t = typename __segmented_iterator_traits<_InputIterator>::__local_iterator;
3238
std::__for_each_segment(__first, __last, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) {
3339
std::__for_each(__lfirst, __llast, __func, __proj);

libcxx/include/__algorithm/ranges_for_each.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <__algorithm/for_each.h>
1313
#include <__algorithm/for_each_n.h>
1414
#include <__algorithm/in_fun_result.h>
15+
#include <__algorithm/specialized_algorithms.h>
1516
#include <__concepts/assignable.h>
1617
#include <__config>
1718
#include <__functional/identity.h>
@@ -20,6 +21,7 @@
2021
#include <__ranges/access.h>
2122
#include <__ranges/concepts.h>
2223
#include <__ranges/dangling.h>
24+
#include <__type_traits/remove_cvref.h>
2325
#include <__utility/move.h>
2426

2527
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -71,7 +73,13 @@ struct __for_each {
7173
indirectly_unary_invocable<projected<iterator_t<_Range>, _Proj>> _Func>
7274
_LIBCPP_HIDE_FROM_ABI constexpr for_each_result<borrowed_iterator_t<_Range>, _Func>
operator()(_Range&& __range, _Func __func, _Proj __proj = {}) const {
  // Range overload of ranges::for_each.
  //
  // If a specialized whole-range implementation exists for this range type, dispatch to it; otherwise fall back to
  // the generic iterator/sentinel implementation.
  using _SpecialAlg = __specialized_algorithm<_Algorithm::__for_each, __single_range<remove_cvref_t<_Range>>>;
  if constexpr (_SpecialAlg::__has_algorithm) {
    auto [__iter, __func2] = _SpecialAlg()(__range, std::move(__func), std::move(__proj));
    // `__func` was moved into the specialized algorithm above, so it is in a moved-from state here. The function
    // object that was actually applied to the elements is handed back as `__func2`; that is the one the caller must
    // receive so that any state it accumulated is preserved.
    return {std::move(__iter), std::move(__func2)};
  } else {
    return __for_each_impl(ranges::begin(__range), ranges::end(__range), __func, __proj);
  }
}
7684
};
7785

libcxx/include/__algorithm/specialized_algorithms.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD
1919

2020
namespace _Algorithm {
2121
struct __fill_n {};
22+
struct __for_each {};
2223
} // namespace _Algorithm
2324

2425
template <class>
2526
struct __single_iterator;
2627

28+
template <class, class>
29+
struct __iterator_pair;
30+
31+
template <class>
32+
struct __single_range;
33+
2734
// This struct allows specializing algorithms for specific arguments. This is useful when we know a more efficient
2835
// algorithm implementation for e.g. library-defined iterators. _Alg is one of tags defined inside the _Algorithm
2936
// namespace above. _Ranges is an essentially arbitrary subset of the arguments to the algorithm that are used for

libcxx/include/__tree

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define _LIBCPP___TREE
1212

1313
#include <__algorithm/min.h>
14+
#include <__algorithm/specialized_algorithms.h>
1415
#include <__assert>
1516
#include <__config>
1617
#include <__fwd/pair.h>
@@ -36,6 +37,7 @@
3637
#include <__type_traits/is_swappable.h>
3738
#include <__type_traits/make_transparent.h>
3839
#include <__type_traits/remove_const.h>
40+
#include <__type_traits/remove_cvref.h>
3941
#include <__utility/forward.h>
4042
#include <__utility/lazy_synth_three_way_comparator.h>
4143
#include <__utility/move.h>
@@ -656,6 +658,53 @@ struct __generic_container_node_destructor<__tree_node<_Tp, _VoidPtr>, _Alloc> :
656658
};
657659
#endif
658660

661+
// Do an in-order traversal of the tree until `__break` returns true. Takes the root node of the tree.
662+
template <class _Reference, class _Break, class _NodePtr, class _Func, class _Proj>
663+
#ifndef _LIBCPP_COMPILER_GCC // This function is recursive, so GCC complains about always_inline.
664+
_LIBCPP_HIDE_FROM_ABI
665+
#endif
666+
bool __tree_iterate_from_root(_Break __break, _NodePtr __root, _Func& __func, _Proj& __proj) {
667+
if (__root->__left_) {
668+
if (std::__tree_iterate_from_root<_Reference>(__break, static_cast<_NodePtr>(__root->__left_), __func, __proj))
669+
return true;
670+
}
671+
if (__break(__root))
672+
return true;
673+
__func(static_cast<_Reference>(__root->__get_value()));
674+
if (__root->__right_)
675+
return std::__tree_iterate_from_root<_Reference>(__break, static_cast<_NodePtr>(__root->__right_), __func, __proj);
676+
return false;
677+
}
678+
679+
// Do an in-order traversal of the tree from __first to __last.
680+
template <class _NodeIter, class _Func, class _Proj>
681+
_LIBCPP_HIDE_FROM_ABI void
682+
__tree_iterate_subrange(_NodeIter __first_it, _NodeIter __last_it, _Func& __func, _Proj& __proj) {
683+
using _NodePtr = typename _NodeIter::__node_pointer;
684+
using _Reference = typename _NodeIter::reference;
685+
686+
auto __first = __first_it.__ptr_;
687+
auto __last = __last_it.__ptr_;
688+
689+
while (true) {
690+
if (__first == __last)
691+
return;
692+
const auto __nfirst = static_cast<_NodePtr>(__first);
693+
__func(static_cast<_Reference>(__nfirst->__get_value()));
694+
if (__nfirst->__right_) {
695+
if (std::__tree_iterate_from_root<_Reference>(
696+
[&](_NodePtr __node) -> bool { return __node == __last; },
697+
static_cast<_NodePtr>(__nfirst->__right_),
698+
__func,
699+
__proj))
700+
return;
701+
}
702+
while (!std::__tree_is_left_child(static_cast<_NodePtr>(__first)))
703+
__first = static_cast<_NodePtr>(__first)->__parent_;
704+
__first = static_cast<_NodePtr>(__first)->__parent_;
705+
}
706+
}
707+
659708
template <class _Tp, class _NodePtr, class _DiffType>
660709
class __tree_iterator {
661710
using _NodeTypes _LIBCPP_NODEBUG = __tree_node_types<_NodePtr>;
@@ -715,7 +764,27 @@ private:
715764
friend class __tree;
716765
template <class, class, class>
717766
friend class __tree_const_iterator;
767+
768+
template <class _NodeIter, class _Func, class _Proj>
769+
friend void __tree_iterate_subrange(_NodeIter, _NodeIter, _Func&, _Proj&);
770+
};
771+
772+
#ifndef _LIBCPP_CXX03_LANG
773+
// This also handles {multi,}set::iterator, since they're just aliases to __tree::iterator
774+
template <class _Tp, class _NodePtr, class _DiffType>
775+
struct __specialized_algorithm<
776+
_Algorithm::__for_each,
777+
__iterator_pair<__tree_iterator<_Tp, _NodePtr, _DiffType>, __tree_iterator<_Tp, _NodePtr, _DiffType>>> {
778+
static const bool __has_algorithm = true;
779+
780+
using __iterator _LIBCPP_NODEBUG = __tree_iterator<_Tp, _NodePtr, _DiffType>;
781+
782+
template <class _Func, class _Proj>
783+
_LIBCPP_HIDE_FROM_ABI static void operator()(__iterator __first, __iterator __last, _Func& __func, _Proj& __proj) {
784+
std::__tree_iterate_subrange(__first, __last, __func, __proj);
785+
}
718786
};
787+
#endif
719788

720789
template <class _Tp, class _NodePtr, class _DiffType>
721790
class __tree_const_iterator {
@@ -780,8 +849,28 @@ private:
780849

781850
template <class, class, class>
782851
friend class __tree;
852+
853+
template <class _NodeIter, class _Func, class _Proj>
854+
friend void __tree_iterate_subrange(_NodeIter, _NodeIter, _Func&, _Proj&);
783855
};
784856

857+
#ifndef _LIBCPP_CXX03_LANG
858+
// This also handles {multi,}set::const_iterator, since they're just aliases to __tree::const_iterator
859+
template <class _Tp, class _NodePtr, class _DiffType>
860+
struct __specialized_algorithm<
861+
_Algorithm::__for_each,
862+
__iterator_pair<__tree_const_iterator<_Tp, _NodePtr, _DiffType>, __tree_const_iterator<_Tp, _NodePtr, _DiffType>>> {
863+
static const bool __has_algorithm = true;
864+
865+
using __iterator _LIBCPP_NODEBUG = __tree_const_iterator<_Tp, _NodePtr, _DiffType>;
866+
867+
template <class _Func, class _Proj>
868+
_LIBCPP_HIDE_FROM_ABI static void operator()(__iterator __first, __iterator __last, _Func& __func, _Proj& __proj) {
869+
std::__tree_iterate_subrange(__first, __last, __func, __proj);
870+
}
871+
};
872+
#endif
873+
785874
template <class _Tp, class _Compare>
786875
#ifndef _LIBCPP_CXX03_LANG
787876
_LIBCPP_DIAGNOSE_WARNING(!__is_invocable_v<_Compare const&, _Tp const&, _Tp const&>,
@@ -1440,7 +1529,25 @@ private:
14401529
[](value_type& __lhs, value_type& __rhs) { __assign_value(__lhs, std::move(__rhs)); },
14411530
[this](__node_pointer __nd) { return __move_construct_tree(__nd); });
14421531
}
1532+
1533+
friend struct __specialized_algorithm<_Algorithm::__for_each, __single_range<__tree> >;
1534+
};
1535+
1536+
#if _LIBCPP_STD_VER >= 14
1537+
template <class _Tp, class _Compare, class _Allocator>
1538+
struct __specialized_algorithm<_Algorithm::__for_each, __single_range<__tree<_Tp, _Compare, _Allocator> > > {
1539+
static const bool __has_algorithm = true;
1540+
1541+
using __node_pointer _LIBCPP_NODEBUG = typename __tree<_Tp, _Compare, _Allocator>::__node_pointer;
1542+
1543+
template <class _Tree, class _Func, class _Proj>
1544+
_LIBCPP_HIDE_FROM_ABI static auto operator()(_Tree&& __range, _Func __func, _Proj __proj) {
1545+
std::__tree_iterate_from_root<__copy_cvref_t<_Tree, typename __remove_cvref_t<_Tree>::value_type>>(
1546+
[](__node_pointer) { return false; }, __range.__root(), __func, __proj);
1547+
return std::make_pair(__range.end(), std::move(__func));
1548+
}
14431549
};
1550+
#endif
14441551

14451552
template <class _Tp, class _Compare, class _Allocator>
14461553
__tree<_Tp, _Compare, _Allocator>& __tree<_Tp, _Compare, _Allocator>::operator=(const __tree& __t) {

libcxx/include/map

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -577,6 +577,7 @@ erase_if(multimap<Key, T, Compare, Allocator>& c, Predicate pred); // C++20
577577
# include <__algorithm/equal.h>
578578
# include <__algorithm/lexicographical_compare.h>
579579
# include <__algorithm/lexicographical_compare_three_way.h>
580+
# include <__algorithm/specialized_algorithms.h>
580581
# include <__assert>
581582
# include <__config>
582583
# include <__functional/binary_function.h>
@@ -818,7 +819,26 @@ public:
818819
friend class multimap;
819820
template <class>
820821
friend class __map_const_iterator;
822+
823+
template <class, class...>
824+
friend struct __specialized_algorithm;
825+
};
826+
827+
# ifndef _LIBCPP_CXX03_LANG
828+
template <class _Alg, class _TreeIterator>
829+
struct __specialized_algorithm<_Alg, __iterator_pair<__map_iterator<_TreeIterator>, __map_iterator<_TreeIterator>>> {
830+
using __base _LIBCPP_NODEBUG = __specialized_algorithm<_Alg, __iterator_pair<_TreeIterator, _TreeIterator>>;
831+
832+
static const bool __has_algorithm = __base::__has_algorithm;
833+
834+
using __iterator _LIBCPP_NODEBUG = __map_iterator<_TreeIterator>;
835+
836+
template <class... _Args>
837+
_LIBCPP_HIDE_FROM_ABI static void operator()(__iterator __first, __iterator __last, _Args&&... __args) {
838+
__base()(__first.__i_, __last.__i_, std::forward<_Args>(__args)...);
839+
}
821840
};
841+
# endif
822842

823843
template <class _TreeIterator>
824844
class __map_const_iterator {
@@ -873,7 +893,28 @@ public:
873893
friend class multimap;
874894
template <class, class, class>
875895
friend class __tree_const_iterator;
896+
897+
template <class, class...>
898+
friend struct __specialized_algorithm;
899+
};
900+
901+
# ifndef _LIBCPP_CXX03_LANG
902+
template <class _Alg, class _TreeIterator>
903+
struct __specialized_algorithm<
904+
_Alg,
905+
__iterator_pair<__map_const_iterator<_TreeIterator>, __map_const_iterator<_TreeIterator>>> {
906+
using __base _LIBCPP_NODEBUG = __specialized_algorithm<_Alg, __iterator_pair<_TreeIterator, _TreeIterator>>;
907+
908+
static const bool __has_algorithm = __base::__has_algorithm;
909+
910+
using __iterator _LIBCPP_NODEBUG = __map_const_iterator<_TreeIterator>;
911+
912+
template <class... _Args>
913+
_LIBCPP_HIDE_FROM_ABI static void operator()(__iterator __first, __iterator __last, _Args&&... __args) {
914+
__base()(__first.__i_, __last.__i_, std::forward<_Args>(__args)...);
915+
}
876916
};
917+
# endif
877918

878919
template <class _Key, class _Tp, class _Compare = less<_Key>, class _Allocator = allocator<pair<const _Key, _Tp> > >
879920
class multimap;
@@ -1371,6 +1412,8 @@ private:
13711412
# ifdef _LIBCPP_CXX03_LANG
13721413
_LIBCPP_HIDE_FROM_ABI __node_holder __construct_node_with_key(const key_type& __k);
13731414
# endif
1415+
1416+
friend struct __specialized_algorithm<_Algorithm::__for_each, __single_range<map> >;
13741417
};
13751418

13761419
# if _LIBCPP_STD_VER >= 17
@@ -1423,6 +1466,22 @@ map(initializer_list<pair<_Key, _Tp>>, _Allocator)
14231466
-> map<remove_const_t<_Key>, _Tp, less<remove_const_t<_Key>>, _Allocator>;
14241467
# endif
14251468

1469+
# if _LIBCPP_STD_VER >= 14
1470+
template <class _Key, class _Tp, class _Compare, class _Allocator>
1471+
struct __specialized_algorithm<_Algorithm::__for_each, __single_range<map<_Key, _Tp, _Compare, _Allocator>>> {
1472+
using __map _LIBCPP_NODEBUG = map<_Key, _Tp, _Compare, _Allocator>;
1473+
1474+
static const bool __has_algorithm = true;
1475+
1476+
template <class _Map, class _Func, class _Proj>
1477+
_LIBCPP_HIDE_FROM_ABI static auto operator()(_Map&& __map, _Func __func, _Proj __proj) {
1478+
auto [_, __func2] = __specialized_algorithm<_Algorithm::__for_each, __single_range<typename __map::__base>>()(
1479+
__map.__tree_, std::move(__func), std::move(__proj));
1480+
return std::make_pair(__map.end(), std::move(__func2));
1481+
}
1482+
};
1483+
# endif
1484+
14261485
# ifndef _LIBCPP_CXX03_LANG
14271486
template <class _Key, class _Tp, class _Compare, class _Allocator>
14281487
_Tp& map<_Key, _Tp, _Compare, _Allocator>::operator[](const key_type& __k) {
@@ -1922,6 +1981,8 @@ private:
19221981

19231982
typedef __map_node_destructor<__node_allocator> _Dp;
19241983
typedef unique_ptr<__node, _Dp> __node_holder;
1984+
1985+
friend struct __specialized_algorithm<_Algorithm::__for_each, __single_range<multimap> >;
19251986
};
19261987

19271988
# if _LIBCPP_STD_VER >= 17
@@ -1974,6 +2035,22 @@ multimap(initializer_list<pair<_Key, _Tp>>, _Allocator)
19742035
-> multimap<remove_const_t<_Key>, _Tp, less<remove_const_t<_Key>>, _Allocator>;
19752036
# endif
19762037

2038+
# if _LIBCPP_STD_VER >= 14
2039+
template <class _Key, class _Tp, class _Compare, class _Allocator>
2040+
struct __specialized_algorithm<_Algorithm::__for_each, __single_range<multimap<_Key, _Tp, _Compare, _Allocator>>> {
2041+
using __map _LIBCPP_NODEBUG = multimap<_Key, _Tp, _Compare, _Allocator>;
2042+
2043+
static const bool __has_algorithm = true;
2044+
2045+
template <class _Map, class _Func, class _Proj>
2046+
_LIBCPP_HIDE_FROM_ABI static auto operator()(_Map&& __map, _Func __func, _Proj __proj) {
2047+
auto [_, __func2] = __specialized_algorithm<_Algorithm::__for_each, __single_range<typename __map::__base>>()(
2048+
__map.__tree_, std::move(__func), std::move(__proj));
2049+
return std::make_pair(__map.end(), std::move(__func2));
2050+
}
2051+
};
2052+
# endif
2053+
19772054
template <class _Key, class _Tp, class _Compare, class _Allocator>
19782055
inline _LIBCPP_HIDE_FROM_ABI bool
19792056
operator==(const multimap<_Key, _Tp, _Compare, _Allocator>& __x, const multimap<_Key, _Tp, _Compare, _Allocator>& __y) {

0 commit comments

Comments
 (0)