
Commit a857613

Merge pull request #120 from orxfun/special-termination-condition
Recursive Parallel Iteration
2 parents: 1b0bf95 + f0e861e


43 files changed (+2063, -60 lines)

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         toolchain: ["stable"]
-        features: ["", "--features generic_iterator"]
+        features: ["", "--all-features", "--no-default-features"]

     steps:
       - uses: actions/checkout@v4

Cargo.toml

Lines changed: 11 additions & 10 deletions

@@ -1,6 +1,6 @@
 [package]
 name = "orx-parallel"
-version = "3.3.0"
+version = "3.4.0"
 edition = "2024"
 authors = ["orxfun <orx.ugur.arikan@gmail.com>"]
 readme = "README.md"
@@ -11,16 +11,17 @@ keywords = ["parallel", "concurrency", "performance", "thread", "iterator"]
 categories = ["concurrency", "algorithms"]

 [dependencies]
-orx-pinned-vec = { version = "3.17.0", default-features = false }
-orx-fixed-vec = { version = "3.19.0", default-features = false }
-orx-split-vec = { version = "3.19.0", default-features = false }
-orx-concurrent-iter = { version = "3.1.0", default-features = false }
-orx-concurrent-bag = { version = "3.1.0", default-features = false }
-orx-concurrent-ordered-bag = { version = "3.1.0", default-features = false }
+orx-pinned-vec = { version = "3.21.0", default-features = false }
+orx-fixed-vec = { version = "3.22.0", default-features = false }
+orx-split-vec = { version = "3.22.0", default-features = false }
+orx-concurrent-iter = { version = "3.3.0", default-features = false }
+orx-concurrent-bag = { version = "3.4.0", default-features = false }
+orx-concurrent-ordered-bag = { version = "3.4.0", default-features = false }
+orx-pinned-concurrent-col = { version = "2.18.0", default-features = false }
 orx-iterable = { version = "1.3.0", default-features = false }
-orx-pinned-concurrent-col = { version = "2.15.0", default-features = false }
 orx-priority-queue = { version = "1.7.0", default-features = false }
 orx-pseudo-default = { version = "2.1.0", default-features = false }
+orx-concurrent-recursive-iter = { version = "2.0.0", default-features = false }

 # optional: generic iterator
 rayon = { version = "1.11.0", optional = true, default-features = false }
@@ -35,10 +36,10 @@ yastl = { version = "0.1.2", optional = true, default-features = false }

 [dev-dependencies]
 chrono = "0.4.42"
-clap = { version = "4.5.47", features = ["derive"] }
+clap = { version = "4.5.50", features = ["derive"] }
 criterion = "0.7.0"
 orx-concurrent-option = { version = "1.5.0", default-features = false }
-orx-concurrent-vec = "3.8.0"
+orx-concurrent-vec = "3.10.0"
 rand = "0.9.2"
 rand_chacha = "0.9"
 rayon = "1.11.0"

README.md

Lines changed: 68 additions & 2 deletions
@@ -8,6 +8,7 @@
 
 * [Parallel Computation by Iterators](#parallel-computation-by-iterators)
 * [Parallelizable Collections](#parallelizable-collections)
+* [Parallelization over Nonlinear Data Structures](#parallelization-over-nonlinear-data-structures)
 * [Performance and Benchmarks](#performance-and-benchmarks)
 * [Fallible Parallel Iterators](#fallible-parallel-iterators)
 * [Using Mutable Variables](#using-mutable-variables)
@@ -150,8 +151,47 @@ The following table demonstrates these methods for the `HashSet`; however, they
 
 Note that each approach can be more efficient in different scenarios. For large elements, (ii) might be preferred to avoid allocation of the vector. For insignificant tasks to be performed on each element, (i) might be preferred to take full benefit of vector-specific optimizations.
 
+## Parallelization over Nonlinear Data Structures
+
+The [IntoParIterRec](https://docs.rs/orx-parallel/latest/orx_parallel/trait.IntoParIterRec.html) trait can be used to create a **parallel recursive iterator** over an initial set of elements, which is useful when working with nonlinear data structures such as **trees** and **graphs**.
+
+Consider, for instance, a tree defined by the following node struct:
+
+```rust ignore
+pub struct Node<T> {
+    pub data: T,
+    pub children: Vec<Node<T>>,
+}
+```
+
+Assume that we want to map each node's data with `map: impl Fn(T) -> u64` and compute the sum of the mapped values of all nodes descending from a `root: &Node`.
+
+We can express this computation and execute it in parallel as follows:
+
+```rust ignore
+fn extend<'a>(node: &&'a Node, queue: &Queue<&'a Node>) {
+    queue.extend(&node.children);
+}
+
+[root].into_par_rec(extend).map(map).sum()
+```
+
+Instead of `into_par`, we use `into_par_rec` and provide the `extend` function as its argument. This function defines the recursive extension of the parallel iterator: every time we process a `node`, we first add its children to the `queue`. [`Queue`](https://docs.rs/orx-concurrent-recursive-iter/latest/orx_concurrent_recursive_iter/struct.Queue.html) is the queue of elements to be processed, and it exposes two growth methods for defining the recursive extension: `push` and `extend`.
+
+Although we create the parallel iterator differently, we still get a `ParIter`; therefore, we have access to all features of a regular parallel iterator.
+
+For instance, assume we want to filter the nodes first and, instead of summing up the mapped values, collect them in a vector. We can express this computation just as we would on a linear data structure:
+
+```rust ignore
+[root].into_par_rec(extend).filter(filter).map(map).collect()
+```
+
+For more details, see the [parallelization_on_tree](https://github.com/orxfun/orx-parallel/blob/main/examples/parallelization_on_tree) example.
+
 ## Performance and Benchmarks
 
+*Please also see the [impact of ChunkSize on performance](#impact-of-chunksize-on-performance) section.*
+
 You may find some sample parallel programs in the [examples](https://github.com/orxfun/orx-parallel/blob/main/examples) directory. These examples let you express parallel computations as iterator method compositions and run quick experiments with different approaches. Examples use `GenericIterator`. As the name suggests, it is a generalization of the sequential iterator, rayon's parallel iterator and orx-parallel's parallel iterator, and hence, allows for convenient experiments. You may play with the code, update the tested computations and run these examples by enabling the **generic_iterator** feature, such as:
 
 `cargo run --release --features generic_iterator --example benchmark_collect -- --len 123456 --num-repetitions 10`
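For reference, below is a minimal self-contained sketch of the recursive iteration pattern added in the hunk above, assembled from the README snippets and the new `rec_iter_map_collect` benchmark in this commit. The non-generic `Node` type and the tiny tree built in `main` are illustrative assumptions rather than part of the commit; `into_par_rec`, `Queue::extend` and the `extend` callback signature are used exactly as they appear in the diff.

```rust
// Illustrative sketch (not part of the commit): summing a small tree via the
// recursive parallel iterator. The `Node` type and the tree are made up here.
use orx_concurrent_recursive_iter::Queue;
use orx_parallel::*;

pub struct Node {
    pub data: u64,
    pub children: Vec<Node>,
}

// Recursive extension: when a node is processed, its children are added to the
// queue of remaining elements (same signature as in the benchmark of this commit).
fn extend<'a>(node: &&'a Node, queue: &Queue<&'a Node>) {
    queue.extend(&node.children);
}

fn main() {
    // root -> { a -> { c }, b }
    let c = Node { data: 3, children: vec![] };
    let a = Node { data: 1, children: vec![c] };
    let b = Node { data: 2, children: vec![] };
    let root = Node { data: 0, children: vec![a, b] };

    // parallel recursive iteration over all nodes descending from `root`
    let sum: u64 = [&root].into_par_rec(extend).map(|node| node.data).sum();
    assert_eq!(sum, 6);

    // the result is a regular `ParIter`, so filtering etc. is available as usual
    let even_sum: u64 = [&root]
        .into_par_rec(extend)
        .filter(|node| node.data % 2 == 0)
        .map(|node| node.data)
        .sum();
    assert_eq!(even_sum, 2);
}
```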
@@ -419,6 +459,26 @@ This is guaranteed by the fact that both consuming computation calls and configu
 
 Additionally, the maximum number of threads that can be used by parallel computations can be globally bounded by the environment variable `ORX_PARALLEL_MAX_NUM_THREADS`. Please see the corresponding [example](https://github.com/orxfun/orx-parallel/blob/main/examples/max_num_threads_config.rs) for details.
 
+### Impact of `ChunkSize` on Performance
+
+The impact of the chunk size on performance might be significant.
+
+Our objective is to minimize the sum of two computational costs:
+* parallelization overhead => gets smaller as the chunk size grows
+* cost of heterogeneity => gets larger as the chunk size grows
+
+Parallelization overhead can further be divided into two parts:
+* concurrent state update: This often corresponds to one atomic update per chunk. It may be significant if our computation is very small, such as `input.par().sum()`. Otherwise, the cost of the atomic update is likely negligible.
+* false sharing: This is relevant only if we are writing results; for instance, when we are one-to-one mapping an input and collecting the results, such as `input.par().map(|x| x.to_string()).collect()`, or when we are writing through mut references, such as `input.par().for_each(|x| *x += 1)`. Here, performance might suffer from false sharing when `chunk size × size of output item` is not large enough. You may also see the [false sharing](https://docs.rs/orx-concurrent-bag/latest/orx_concurrent_bag/#false-sharing) section for `ConcurrentBag`.
+
+In either case, when the computation on each item is sufficiently long, parallelization overhead is negligible. Here, we only want to make sure that we do not pay the heterogeneity cost. Therefore, a safe chunk size choice is one: `par.chunk_size(1)`.
+
+Otherwise, our choice depends on the use case. As a rule of thumb, we want a chunk size that is **just large enough** to mitigate the parallelization overhead but not larger, so that we do not suffer from heterogeneity.
+
+The default configuration, `par.chunk_size(ChunkSize::Auto)` or `par.chunk_size(0)`, uses a heuristic to solve this tradeoff. A difficult case for the current version is when the tasks are significantly heterogeneous (see the [discussion](https://github.com/orxfun/orx-parallel/discussions/26) for future improvements).
+
+As described above, the **best way to deal with heterogeneity** is to use `par.chunk_size(1)`. You may of course test larger chunk sizes to optimize the computation for your data.
+
 
 ## Runner: Pools and Executors
 
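As a rough illustration of the chunk size guidance added above, the following sketch runs the same computation under three configurations. The input vector and the trivial doubling closure are placeholder assumptions; `chunk_size(1)`, `chunk_size(1024)` and the `ChunkSize::Auto` / `0` defaults come directly from the section.

```rust
// Illustrative sketch: identical computation, only the chunk size configuration varies.
use orx_parallel::*;

fn main() {
    let input: Vec<u64> = (0..10_000).collect();

    // default configuration: the chunk size is decided by a heuristic
    // (equivalent to `par.chunk_size(ChunkSize::Auto)` or `par.chunk_size(0)`)
    let auto: u64 = input.par().map(|x| x * 2).sum();

    // chunk size of one: the safe choice when per-item work is long and/or
    // significantly heterogeneous
    let one: u64 = input.par().chunk_size(1).map(|x| x * 2).sum();

    // a larger chunk size: reduces parallelization overhead when per-item work
    // is tiny and homogeneous
    let large: u64 = input.par().chunk_size(1024).map(|x| x * 2).sum();

    assert_eq!(auto, one);
    assert_eq!(one, large);
}
```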
@@ -459,9 +519,13 @@ let inputs: Vec<_> = (0..42).collect();
 let sum = inputs.par().sum();
 
 // equivalent to:
-let sum2 = inputs.par().with_pool(StdDefaultPool::default()).sum();
-assert_eq!(sum, sum2);
+#[cfg(feature = "std")]
+{
+    let sum2 = inputs.par().with_pool(StdDefaultPool::default()).sum();
+    assert_eq!(sum, sum2);
+}
 
+#[cfg(not(miri))]
 #[cfg(feature = "scoped_threadpool")]
 {
     let mut pool = scoped_threadpool::Pool::new(8);
@@ -470,6 +534,7 @@ assert_eq!(sum, sum2);
     assert_eq!(sum, sum2);
 }
 
+#[cfg(not(miri))]
 #[cfg(feature = "rayon-core")]
 {
     let pool = rayon_core::ThreadPoolBuilder::new()
@@ -481,6 +546,7 @@ assert_eq!(sum, sum2);
     assert_eq!(sum, sum2);
 }
 
+#[cfg(not(miri))]
 #[cfg(feature = "yastl")]
 {
     let pool = YastlPool::new(8);
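For quick reference, here is the std-pool equivalence from the hunk above as a compact, self-contained snippet. It assumes orx-parallel's default `std` feature and that `StdDefaultPool` is exported through `orx_parallel::*`, as in the README example being modified.

```rust
// Illustrative sketch of the equivalence shown in the diff above.
// Assumes the default `std` feature and that `StdDefaultPool` is in scope via `orx_parallel::*`.
use orx_parallel::*;

fn main() {
    let inputs: Vec<_> = (0..42).collect();

    // default runner
    let sum = inputs.par().sum();

    // explicitly selecting the default std-based pool is equivalent
    let sum2 = inputs.par().with_pool(StdDefaultPool::default()).sum();
    assert_eq!(sum, sum2);
}
```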

benches/rec_iter_map_collect.rs

Lines changed: 166 additions & 0 deletions (new file)

use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use orx_concurrent_recursive_iter::Queue;
use orx_parallel::*;
use orx_split_vec::SplitVec;
use rand::prelude::*;
use rand_chacha::ChaCha8Rng;
use std::hint::black_box;

// Artificial CPU work: performs `work` iterated fibonacci computations per value.
fn fibonacci(n: u64, work: usize) -> u64 {
    (7..(work + 7))
        .map(|j| {
            let n = black_box((n + j as u64) % 100);
            let mut a = 0;
            let mut b = 1;
            for _ in 0..n {
                let c = a + b;
                a = b;
                b = c;
            }
            a
        })
        .sum()
}

// A tree node holding a batch of values and its children.
struct Node {
    value: Vec<u64>,
    children: Vec<Node>,
}

impl Node {
    // Recursively builds a random tree; `n` is split randomly among the children.
    fn new(mut n: u32, rng: &mut impl Rng) -> Self {
        let mut children = Vec::new();
        if n < 5 {
            for _ in 0..n {
                children.push(Node::new(0, rng));
            }
        } else {
            while n > 0 {
                let n2 = rng.random_range(0..=n);
                children.push(Node::new(n2, rng));
                n -= n2;
            }
        }
        Self {
            value: (0..rng.random_range(1..500))
                .map(|_| rng.random_range(0..40))
                .collect(),
            children,
        }
    }

    // Sequentially counts the nodes of the subtree rooted at `self`.
    fn seq_num_nodes(&self) -> usize {
        1 + self
            .children
            .iter()
            .map(|node| node.seq_num_nodes())
            .sum::<usize>()
    }

    // Sequential traversal: maps each value with `fibonacci` and appends the results to `numbers`.
    fn seq(&self, work: usize, numbers: &mut Vec<u64>) {
        numbers.extend(self.value.iter().map(|x| fibonacci(*x, work)));
        for c in &self.children {
            c.seq(work, numbers);
        }
    }
}

// alternatives

// sequential baseline
fn seq(roots: &[Node], work: usize) -> Vec<u64> {
    let mut result = vec![];
    for root in roots {
        root.seq(work, &mut result);
    }
    result
}

// recursive parallel iterator with unknown length and a fixed chunk size of 1024
fn orx_lazy_unknown_chunk1024(roots: &[Node], work: usize) -> SplitVec<u64> {
    fn extend<'a>(node: &&'a Node, queue: &Queue<&'a Node>) {
        queue.extend(&node.children);
    }

    roots
        .into_par_rec(extend)
        .chunk_size(1024)
        .flat_map(|x| x.value.iter().map(|x| fibonacci(*x, work)))
        .collect()
}

// recursive parallel iterator where the exact number of elements is provided upfront
fn orx_lazy_exact(roots: &[Node], work: usize, num_nodes: usize) -> SplitVec<u64> {
    fn extend<'a>(node: &&'a Node, queue: &Queue<&'a Node>) {
        queue.extend(&node.children);
    }

    roots
        .into_par_rec_exact(extend, num_nodes)
        .flat_map(|x| x.value.iter().map(|x| fibonacci(*x, work)))
        .collect()
}

// recursive parallel iterator with the `linearize` configuration
fn orx_linearized(roots: &[Node], work: usize) -> SplitVec<u64> {
    fn extend<'a>(node: &&'a Node, queue: &Queue<&'a Node>) {
        queue.extend(&node.children);
    }

    roots
        .into_par_rec(extend)
        .linearize()
        .flat_map(|x| x.value.iter().map(|x| fibonacci(*x, work)))
        .collect()
}

// Criterion benchmark comparing the alternatives for varying amounts of per-item work.
fn run(c: &mut Criterion) {
    let treatments = [1, 10, 25];
    let mut group = c.benchmark_group("rec_iter_map_collect");
    let mut rng = ChaCha8Rng::seed_from_u64(42);
    let roots = vec![
        Node::new(5000, &mut rng),
        Node::new(2000, &mut rng),
        Node::new(4000, &mut rng),
    ];

    let num_nodes: usize = roots.iter().map(|x| x.seq_num_nodes()).sum();

    for work in &treatments {
        let mut expected = seq(&roots, *work);
        expected.sort();

        group.bench_with_input(BenchmarkId::new("seq", work), work, |b, _| {
            let mut result = seq(&roots, *work);
            result.sort();
            assert_eq!(&expected, &result);
            b.iter(|| seq(&roots, *work))
        });

        group.bench_with_input(BenchmarkId::new("orx_lazy_exact", work), work, |b, _| {
            let mut result = orx_lazy_exact(&roots, *work, num_nodes).to_vec();
            result.sort();
            assert_eq!(&expected, &result);
            b.iter(|| orx_lazy_exact(&roots, *work, num_nodes))
        });

        group.bench_with_input(
            BenchmarkId::new("orx_lazy_unknown_chunk1024", work),
            work,
            |b, _| {
                let mut result = orx_lazy_unknown_chunk1024(&roots, *work).to_vec();
                result.sort();
                assert_eq!(&expected, &result);
                b.iter(|| orx_lazy_unknown_chunk1024(&roots, *work))
            },
        );

        group.bench_with_input(BenchmarkId::new("orx_linearized", work), work, |b, _| {
            let mut result = orx_linearized(&roots, *work).to_vec();
            result.sort();
            assert_eq!(&expected, &result);
            b.iter(|| orx_linearized(&roots, *work))
        });
    }

    group.finish();
}

criterion_group!(benches, run);
criterion_main!(benches);

0 commit comments