From df8100d0c843075ed5ed5c3a54cf6103d06ee89a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 02:25:27 +0000 Subject: [PATCH 01/25] Flatten the data page mask --- .../experimental/hybrid_scan_chunking.cu | 7 +- .../experimental/hybrid_scan_helpers.hpp | 6 +- .../parquet/experimental/hybrid_scan_impl.cpp | 31 +- .../parquet/experimental/hybrid_scan_impl.hpp | 7 +- .../parquet/experimental/page_index_filter.cu | 353 ++++-------------- .../experimental/page_index_filter_utils.cu | 235 ++++++++++++ .../experimental/page_index_filter_utils.hpp | 95 +++++ 7 files changed, 437 insertions(+), 297 deletions(-) create mode 100644 cpp/src/io/parquet/experimental/page_index_filter_utils.cu create mode 100644 cpp/src/io/parquet/experimental/page_index_filter_utils.hpp diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu index b49d13d4000..7e824a89c60 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -41,10 +41,9 @@ namespace cudf::io::parquet::experimental::detail { using parquet::detail::ColumnChunkDesc; using parquet::detail::pass_intermediate_data; -void hybrid_scan_reader_impl::handle_chunking( - read_mode mode, - std::vector column_chunk_buffers, - cudf::host_span const> data_page_mask) +void hybrid_scan_reader_impl::handle_chunking(read_mode mode, + std::vector column_chunk_buffers, + std::vector const& data_page_mask) { // if this is our first time in here, setup the first pass. if (!_pass_itm_data) { diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index cb8e18ceaec..1330353045e 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -287,10 +287,10 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @param row_mask_offset Offset into the row mask column for the current pass * @param stream CUDA stream used for device memory operations and kernel launches * - * @return A vector of boolean vectors indicating which data pages need to be decoded to produce - * the output table based on the input row mask, one per input column + * @return Boolean vector indicating which data pages need to be decoded to produce + * the output table based on the input row mask across all input columns */ - [[nodiscard]] std::vector> compute_data_page_mask( + [[nodiscard]] std::vector compute_data_page_mask( cudf::column_view row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index cf7bd7419d4..422ed80222c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -455,7 +455,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -485,7 +485,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -524,7 +524,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -575,7 +575,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -656,7 +656,7 @@ void hybrid_scan_reader_impl::prepare_data( read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - cudf::host_span const> data_page_mask) + std::vector const& data_page_mask) { // if we have not preprocessed at the whole-file level, do that now if (not _file_preprocessed) { @@ -888,8 +888,7 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( } } -void hybrid_scan_reader_impl::set_pass_page_mask( - cudf::host_span const> data_page_mask) +void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_page_mask) { auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; @@ -903,13 +902,11 @@ void hybrid_scan_reader_impl::set_pass_page_mask( return; } + size_t num_inserted_data_pages = 0; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(_input_columns.size()), [&](auto col_idx) { - auto const& col_page_mask = data_page_mask[col_idx]; - size_t num_inserted_data_pages = 0; - for (size_t chunk_idx = col_idx; chunk_idx < chunks.size(); chunk_idx += num_columns) { // Insert a true value for each dictionary page if (chunks[chunk_idx].num_dict_pages > 0) { _pass_page_mask.push_back(true); } @@ -919,23 +916,23 @@ void hybrid_scan_reader_impl::set_pass_page_mask( // Make sure we have enough page mask for this column chunk CUDF_EXPECTS( - col_page_mask.size() >= num_inserted_data_pages + num_data_pages_this_col_chunk, + data_page_mask.size() >= num_inserted_data_pages + num_data_pages_this_col_chunk, "Encountered invalid data page mask size"); // Insert page mask for this column chunk _pass_page_mask.insert( _pass_page_mask.end(), - col_page_mask.begin() + num_inserted_data_pages, - col_page_mask.begin() + num_inserted_data_pages + num_data_pages_this_col_chunk); - + data_page_mask.begin() + num_inserted_data_pages, + data_page_mask.begin() + num_inserted_data_pages + num_data_pages_this_col_chunk); // Update the number of inserted data pages num_inserted_data_pages += num_data_pages_this_col_chunk; } - // Make sure we inserted exactly the number of data pages for this column - CUDF_EXPECTS(num_inserted_data_pages == col_page_mask.size(), - "Encountered mismatch in number of data pages and page mask size"); }); + // Make sure we inserted exactly the number of data pages for this column + CUDF_EXPECTS(num_inserted_data_pages == data_page_mask.size(), + "Encountered mismatch in number of data pages and page mask size"); + // Make sure we 
inserted exactly the number of pages for this pass CUDF_EXPECTS(_pass_page_mask.size() == pass->pages.size(), "Encountered mismatch in number of pass pages and page mask size"); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index e862959a44f..9fbeea9a58d 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -266,7 +266,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param data_page_mask Input data page mask from page-pruning step */ - void set_pass_page_mask(cudf::host_span const> data_page_mask); + void set_pass_page_mask(std::vector const& data_page_mask); /** * @brief Select the columns to be read based on the read mode @@ -296,11 +296,12 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * @param mode Value indicating if the data sources are read all at once or chunk by chunk * @param row_group_indices Row group indices to read * @param column_chunk_buffers Device buffers containing column chunk data + * @param data_page_mask Input data page mask from page-pruning step */ void prepare_data(read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - cudf::host_span const> data_page_mask); + std::vector const& data_page_mask); /** * @brief Create descriptors for filter column chunks and decode dictionary page headers @@ -342,7 +343,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { */ void handle_chunking(read_mode mode, std::vector column_chunk_buffers, - cudf::host_span const> data_page_mask); + std::vector const& data_page_mask); /** * @brief Setup step for the next input read pass. diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index d6a6e2e9f3e..b5948197537 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -15,8 +15,8 @@ */ #include "hybrid_scan_helpers.hpp" -#include "io/parquet/reader_impl_helpers.hpp" #include "io/parquet/stats_filter_helpers.hpp" +#include "page_index_filter_utils.hpp" #include #include @@ -27,12 +27,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -43,14 +41,11 @@ #include #include -#include #include #include -#include #include #include -#include namespace cudf::io::parquet::experimental::detail { @@ -60,187 +55,6 @@ using string_index_pair = parquet::detail::string_index_pair; namespace { -/** - * @brief Make a device vector where each row contains the index of the page it belongs to - */ -[[nodiscard]] rmm::device_uvector make_page_indices_async( - cudf::host_span page_row_counts, - cudf::host_span page_row_offsets, - cudf::size_type total_rows, - rmm::cuda_stream_view stream) -{ - auto mr = cudf::get_current_device_resource_ref(); - - // Copy page-level row counts and offsets to device - auto row_counts = cudf::detail::make_device_uvector_async(page_row_counts, stream, mr); - auto row_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - - // Make a zeroed device vector to store page indices of each row - auto page_indices = - cudf::detail::make_zeroed_device_uvector_async(total_rows, stream, mr); - - // Scatter page indices across the their first row's index - thrust::scatter_if(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - 
thrust::counting_iterator(row_counts.size()), - row_offsets.begin(), - row_counts.begin(), - page_indices.begin()); - - // Inclusive scan with maximum to replace zeros with the (increasing) page index it belongs to. - // Page indices are scattered at their first row's index. - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - page_indices.begin(), - page_indices.end(), - page_indices.begin(), - cuda::maximum()); - return page_indices; -} - -/** - * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a - * given column schema index - */ -[[nodiscard]] auto make_page_row_counts_and_offsets( - cudf::host_span per_file_metadata, - cudf::host_span const> row_group_indices, - size_type schema_idx, - rmm::cuda_stream_view stream) -{ - // Compute total number of row groups - auto const total_row_groups = - std::accumulate(row_group_indices.begin(), - row_group_indices.end(), - size_t{0}, - [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); - - // Vector to store how many rows are present in each page - set initial capacity to two data pages - // per row group - auto page_row_counts = - cudf::detail::make_empty_host_vector(2 * total_row_groups, stream); - // Vector to store the cumulative number of rows in each page - - set initial capacity to two data - // pages per row group - auto page_row_offsets = - cudf::detail::make_empty_host_vector((2 * total_row_groups) + 1, stream); - // Vector to store the cumulative number of pages in each column chunk - auto col_chunk_page_offsets = - cudf::detail::make_empty_host_vector(total_row_groups + 1, stream); - - page_row_offsets.push_back(0); - col_chunk_page_offsets.push_back(0); - - // For all data sources - std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; - // For all column chunks in this data source - std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { - auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; - // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { - return col.schema_idx == schema_idx; - }); - - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); - - // Compute page row counts and offsets if this column chunk has column and offset indexes - if (colchunk_iter->offset_index.has_value()) { - CUDF_EXPECTS(colchunk_iter->column_index.has_value(), - "Both offset and column indexes must be present"); - // Get the offset and column indexes of the column chunk - auto const& offset_index = colchunk_iter->offset_index.value(); - auto const& column_index = colchunk_iter->column_index.value(); - - // Number of pages in this column chunk - auto const row_group_num_pages = offset_index.page_locations.size(); - - CUDF_EXPECTS(column_index.min_values.size() == column_index.max_values.size(), - "page min and max values should be of same size"); - CUDF_EXPECTS(column_index.min_values.size() == row_group_num_pages, - "mismatch between size of min/max page values and the size of page " - "locations"); - // Update the cumulative number of pages in this column chunk - col_chunk_page_offsets.push_back(col_chunk_page_offsets.back() + row_group_num_pages); - - // For all pages 
in this column chunk, update page row counts and offsets. - std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_num_pages), - [&](auto const page_idx) { - int64_t const first_row_idx = offset_index.page_locations[page_idx].first_row_index; - // For the last page, this is simply the total number of rows in the column chunk - int64_t const last_row_idx = - (page_idx < row_group_num_pages - 1) - ? offset_index.page_locations[page_idx + 1].first_row_index - : row_group.num_rows; - - // Update the page row counts and offsets - page_row_counts.push_back(last_row_idx - first_row_idx); - page_row_offsets.push_back(page_row_offsets.back() + page_row_counts.back()); - }); - } - }); - }); - - return std::tuple{ - std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; -} - -/** - * @brief Compute if the page index is present in all parquet data sources for all columns - */ -[[nodiscard]] bool compute_has_page_index( - cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices) -{ - // For all output columns, check all parquet data sources - return std::all_of( - column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { - // For all parquet data sources - return std::all_of( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto const src_index) { - // For all row groups in this parquet data source - auto const& rg_indices = row_group_indices[src_index]; - return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { - auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; - auto col = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); - // Check if the offset_index and column_index are present - return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and - col->offset_index.has_value() and col->column_index.has_value(); - }); - }); - }); -} - -/** - * @brief Construct a vector of all required data pages from the page row counts - */ -[[nodiscard]] auto all_required_data_pages( - cudf::host_span const> page_row_counts) -{ - std::vector> all_required_data_pages; - all_required_data_pages.reserve(page_row_counts.size()); - std::transform( - page_row_counts.begin(), - page_row_counts.end(), - std::back_inserter(all_required_data_pages), - [&](auto const& col_page_counts) { return std::vector(col_page_counts.size(), true); }); - - return all_required_data_pages; -}; - /** * @brief Converts page-level statistics of a column to 2 device columns - min, max values. Each * column has number of rows equal to the total rows in all row groups. @@ -455,7 +269,8 @@ struct page_stats_caster : public stats_caster_base { } else { // Compute column chunk level page count offsets, and page level row counts and row offsets. 
       auto const [page_row_counts, page_row_offsets, col_chunk_page_offsets] =
-        make_page_row_counts_and_offsets(per_file_metadata, row_group_indices, schema_idx, stream);
+        compute_page_row_counts_and_offsets(
+          per_file_metadata, row_group_indices, schema_idx, stream);
 
       CUDF_EXPECTS(
         page_row_offsets.back() == total_rows,
@@ -479,7 +294,7 @@ struct page_stats_caster : public stats_caster_base {
         std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) {
           auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx];
           // Find colchunk_iter in row_group.columns. Guaranteed to be found as already verified
-          // in make_page_row_counts_and_offsets()
+          // in compute_page_row_counts_and_offsets()
           auto colchunk_iter = std::find_if(
             row_group.columns.begin(),
             row_group.columns.end(),
@@ -672,7 +487,7 @@ std::unique_ptr<cudf::column> aggregate_reader_metadata::build_row_mask_with_pag
   return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr);
 }
 
-std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask(
+std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   cudf::column_view row_mask,
   cudf::host_span<std::vector<size_type> const> row_group_indices,
   cudf::host_span<parquet::detail::input_column_info const> input_columns,
@@ -701,53 +516,50 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
     return {};  // An empty data page mask indicates all pages are required
   }
 
-  // Compute page row counts, offsets, and column chunk page offsets for each column
-  std::vector<cudf::detail::host_vector<size_type>> page_row_counts;
-  std::vector<cudf::detail::host_vector<size_type>> page_row_offsets;
-  std::vector<cudf::detail::host_vector<size_type>> col_chunk_page_offsets;
-  page_row_counts.reserve(num_columns);
-  page_row_offsets.reserve(num_columns);
-  col_chunk_page_offsets.reserve(num_columns);
+  // Compute page row offsets and column chunk page offsets for each column
+  std::vector<size_type> page_row_offsets;
+  std::vector<size_type> col_page_offsets;
+  col_page_offsets.reserve(num_columns + 1);
+  col_page_offsets.push_back(0);
+
+  size_type max_page_size = 0;
 
   if (num_columns == 1) {
-    auto const schema_idx = column_schema_indices.front();
-    auto [counts, offsets, chunk_offsets] =
-      make_page_row_counts_and_offsets(per_file_metadata, row_group_indices, schema_idx, stream);
-    page_row_counts.emplace_back(std::move(counts));
-    page_row_offsets.emplace_back(std::move(offsets));
+    auto const schema_idx   = column_schema_indices.front();
+    size_type col_num_pages = 0;
+    std::tie(page_row_offsets, col_num_pages, max_page_size) =
+      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx);
+    // Add 1 to include the 0th page's offset for each column
+    col_page_offsets.emplace_back(col_num_pages + 1);
  } else {
-    std::vector<std::future<std::tuple<cudf::detail::host_vector<size_type>,
-                                       cudf::detail::host_vector<size_type>,
-                                       cudf::detail::host_vector<size_type>>>>
-      page_row_counts_and_offsets_tasks;
-    page_row_counts_and_offsets_tasks.reserve(num_columns);
-
-    auto streams = cudf::detail::fork_streams(stream, num_columns);
+    std::vector<std::future<std::tuple<std::vector<size_type>, size_type, size_type>>>
+      page_row_offsets_tasks;
+    page_row_offsets_tasks.reserve(num_columns);
 
     std::for_each(thrust::counting_iterator<size_t>(0),
                   thrust::counting_iterator<size_t>(num_columns),
                   [&](auto const col_idx) {
-                    page_row_counts_and_offsets_tasks.emplace_back(
+                    page_row_offsets_tasks.emplace_back(
                       cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-                        return make_page_row_counts_and_offsets(per_file_metadata,
-                                                                row_group_indices,
-                                                                column_schema_indices[col_idx],
-                                                                streams[col_idx]);
+                        return compute_page_row_offsets(
+                          per_file_metadata, row_group_indices, column_schema_indices[col_idx]);
                       }));
                   });
 
     // Collect results from all tasks
-    std::for_each(page_row_counts_and_offsets_tasks.begin(),
-                  page_row_counts_and_offsets_tasks.end(),
-                  [&](auto& task) {
-                    auto [counts, offsets, chunk_offsets] = std::move(task).get();
-                    page_row_counts.emplace_back(std::move(counts));
-                    page_row_offsets.emplace_back(std::move(offsets));
-                    col_chunk_page_offsets.emplace_back(std::move(chunk_offsets));
-                  });
+    std::for_each(page_row_offsets_tasks.begin(), page_row_offsets_tasks.end(), [&](auto& task) {
+      auto [col_page_row_offsets, col_num_pages, col_max_page_size] = std::move(task).get();
+      page_row_offsets.insert(page_row_offsets.end(),
+                              std::make_move_iterator(col_page_row_offsets.begin()),
+                              std::make_move_iterator(col_page_row_offsets.end()));
+      max_page_size = std::max(max_page_size, col_max_page_size);
+      // Add 1 to include the 0th page's offset for each column
+      col_page_offsets.emplace_back(col_page_offsets.back() + col_num_pages + 1);
+    });
   }
 
-  auto const total_rows = page_row_offsets.back().back();
+  auto const total_rows  = page_row_offsets.back();
+  auto const total_pages = page_row_offsets.size() - num_columns;
 
   CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
                "Mismatch in total rows in input row mask and row groups",
               std::invalid_argument);
@@ -759,17 +571,14 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
                      row_mask.begin<bool>() + row_mask_offset,
                      row_mask.begin<bool>() + row_mask_offset + total_rows,
                      cuda::std::identity{})) {
-    return all_required_data_pages(page_row_counts);
+    return std::vector<bool>(total_pages, true);
   }
 
-  // Vector to hold data page mask for each column
-  auto data_page_mask = std::vector<std::vector<bool>>(num_columns);
-
   // Total number of surviving pages across all columns
   std::atomic<size_t> total_surviving_pages{0};
 
   // Tasks to compute data page mask for each column
-  std::vector<std::future<void>> data_page_mask_tasks;
+  std::vector<std::future<std::vector<bool>>> data_page_mask_tasks;
   data_page_mask_tasks.reserve(num_columns);
 
   // Host row mask validity and first bit offset
@@ -807,49 +616,53 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
   // For all columns, look up which pages contain at least one required row. i.e.
   // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the
   // output list of byte ranges for the column.
-  std::for_each(
-    thrust::counting_iterator<size_t>(0),
-    thrust::counting_iterator<size_t>(num_columns),
-    [&](auto const col_idx) {
-      data_page_mask_tasks.emplace_back(
-        cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-          // Construct a row indices mapping based on page row counts and offsets
-          auto const total_pages_this_column = page_row_counts[col_idx].size();
-          auto valid_pages_this_column = std::vector<bool>(total_pages_this_column, false);
-          // Number of final filtered pages for this column
-          size_t num_surviving_pages_this_column = 0;
-          // For all rows
-          for (auto row_idx = 0; row_idx < total_rows; ++row_idx) {
-            // If this row is required or invalid, add its page index to the output list.
- if (not is_row_valid[row_idx] or is_row_required[row_idx]) { - // binary search to find the page index this row_idx belongs to and set the - // page index to true page_indices - auto const& offsets = page_row_offsets[col_idx]; - auto const page_itr = std::upper_bound(offsets.cbegin(), offsets.cend(), row_idx); - CUDF_EXPECTS(page_itr != offsets.cbegin() and page_itr != offsets.cend(), - "Invalid page index"); - auto const page_idx = std::distance(offsets.cbegin(), page_itr) - 1; - valid_pages_this_column[page_idx] = true; - num_surviving_pages_this_column++; - // Move row_idx to the last row of this page - row_idx = offsets[page_idx + 1] - 1; - } - } - - total_surviving_pages.fetch_add(num_surviving_pages_this_column); - data_page_mask[col_idx] = std::move(valid_pages_this_column); - })); - }); - - std::for_each( - data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [](auto& task) { task.get(); }); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_columns), + [&](auto const col_idx) { + data_page_mask_tasks.emplace_back( + cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { + // Construct a row indices mapping based on page row counts and offsets + auto const total_pages_this_column = + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; + auto col_data_page_mask = std::vector(total_pages_this_column, false); + // Number of final filtered pages for this column + size_t num_surviving_pages_this_column = 0; + // For all rows + for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { + // If this row is required or invalid, add its page index to the output + // list. + if (not is_row_valid[row_idx] or is_row_required[row_idx]) { + // binary search to find the page index this row_idx belongs to and set + // the page index to true page_indices + auto const offsets = cudf::host_span( + page_row_offsets.data() + col_page_offsets[col_idx], + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx]); + auto const page_itr = + std::upper_bound(offsets.begin(), offsets.end(), row_idx); + CUDF_EXPECTS(page_itr != offsets.begin() and page_itr != offsets.end(), + "Invalid page index"); + auto const page_idx = std::distance(offsets.begin(), page_itr) - 1; + col_data_page_mask[page_idx] = true; + num_surviving_pages_this_column++; + // Move row_idx to the last row of this page + row_idx = offsets[page_idx + 1] - 1; + } + } + total_surviving_pages.fetch_add(num_surviving_pages_this_column); + return col_data_page_mask; + })); + }); - // Total number of input pages across all columns - auto const total_pages = std::accumulate( - page_row_counts.cbegin(), - page_row_counts.cend(), - size_t{0}, - [](auto sum, auto const& page_row_counts) { return sum + page_row_counts.size(); }); + // Vector to hold data page mask for each column + auto data_page_mask = std::vector(); + data_page_mask.reserve(total_pages); + + std::for_each(data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [&](auto& task) { + auto col_data_page_mask = std::move(task).get(); + data_page_mask.insert(data_page_mask.end(), + std::make_move_iterator(col_data_page_mask.begin()), + std::make_move_iterator(col_data_page_mask.end())); + }); CUDF_EXPECTS( total_surviving_pages <= total_pages, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu new file mode 100644 index 00000000000..3dfb8e69d51 --- /dev/null +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -0,0 +1,235 @@ + +/* + * 
Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_index_filter_utils.hpp" + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::io::parquet::experimental::detail { + +bool compute_has_page_index(cudf::host_span file_metadatas, + cudf::host_span const> row_group_indices, + cudf::host_span column_schema_indices) +{ + // For all output columns, check all parquet data sources + return std::all_of( + column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { + // For all parquet data sources + return std::all_of( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_index) { + // For all row groups in this parquet data source + auto const& rg_indices = row_group_indices[src_index]; + return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { + auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; + auto col = std::find_if( + row_group.columns.begin(), + row_group.columns.end(), + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); + // Check if the offset_index and column_index are present + return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and + col->offset_index.has_value() and col->column_index.has_value(); + }); + }); + }); +} + +std::tuple, + cudf::detail::host_vector, + cudf::detail::host_vector> +compute_page_row_counts_and_offsets(cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx, + rmm::cuda_stream_view stream) +{ + // Compute total number of row groups + auto const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); + + // Vector to store how many rows are present in each page - set initial capacity to two data pages + // per row group + auto page_row_counts = + cudf::detail::make_empty_host_vector(2 * total_row_groups, stream); + // Vector to store the cumulative number of rows in each page - - set initial capacity to two data + // pages per row group + auto page_row_offsets = + cudf::detail::make_empty_host_vector((2 * total_row_groups) + 1, stream); + // Vector to store the cumulative number of pages in each column chunk + auto col_chunk_page_offsets = + cudf::detail::make_empty_host_vector(total_row_groups + 1, stream); + + page_row_offsets.push_back(0); + col_chunk_page_offsets.push_back(0); + + // For all data sources + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto src_idx) { + auto const& rg_indices = row_group_indices[src_idx]; + // For all column chunks in this data source + std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { + auto const& row_group = 
per_file_metadata[src_idx].row_groups[rg_idx]; + // Find the column chunk with the given schema index + auto colchunk_iter = std::find_if( + row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { + return col.schema_idx == schema_idx; + }); + + CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + + " not found in row group", + std::invalid_argument); + + // Compute page row counts and offsets if this column chunk has column and offset indexes + if (colchunk_iter->offset_index.has_value()) { + // Get the offset index of the column chunk + auto const& offset_index = colchunk_iter->offset_index.value(); + auto const row_group_num_pages = offset_index.page_locations.size(); + + col_chunk_page_offsets.push_back(col_chunk_page_offsets.back() + row_group_num_pages); + + // For all pages in this column chunk, update page row counts and offsets. + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_num_pages), + [&](auto const page_idx) { + int64_t const first_row_idx = offset_index.page_locations[page_idx].first_row_index; + // For the last page, this is simply the total number of rows in the column chunk + int64_t const last_row_idx = + (page_idx < row_group_num_pages - 1) + ? offset_index.page_locations[page_idx + 1].first_row_index + : row_group.num_rows; + + // Update the page row counts and offsets + page_row_counts.push_back(last_row_idx - first_row_idx); + page_row_offsets.push_back(page_row_offsets.back() + page_row_counts.back()); + }); + } + }); + }); + + return { + std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; +} + +std::tuple, size_type, size_type> compute_page_row_offsets( + cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx) +{ + // Compute total number of row groups + auto const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); + + std::vector page_row_offsets; + page_row_offsets.push_back(0); + size_type max_page_size = 0; + size_type num_pages = 0; + + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_idx) { + auto const& rg_indices = row_group_indices[src_idx]; + // For all row groups in this source + std::for_each(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_idx) { + auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; + // Find the column chunk with the given schema index + auto colchunk_iter = std::find_if( + row_group.columns.begin(), + row_group.columns.end(), + [schema_idx](auto const& col) { return col.schema_idx == schema_idx; }); + CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + + " not found in row group", + std::invalid_argument); + auto const& offset_index = colchunk_iter->offset_index.value(); + auto const row_group_num_pages = offset_index.page_locations.size(); + num_pages += static_cast(row_group_num_pages); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(row_group_num_pages), + [&](auto const page_idx) { + int64_t const first_row_idx = + offset_index.page_locations[page_idx].first_row_index; + int64_t const last_row_idx = + (page_idx < row_group_num_pages - 1) + ? 
offset_index.page_locations[page_idx + 1].first_row_index
+                            : row_group.num_rows;
+                        auto const page_size = last_row_idx - first_row_idx;
+                        // page_size is an int64_t; narrow it to size_type for the running maximum
+                        max_page_size =
+                          std::max(max_page_size, static_cast<size_type>(page_size));
+                        page_row_offsets.push_back(page_row_offsets.back() + page_size);
+                      });
+    });
+  });
+
+  return {std::move(page_row_offsets), num_pages, max_page_size};
+}
+
+rmm::device_uvector<cudf::size_type> make_page_indices_async(
+  cudf::host_span<cudf::size_type const> page_row_counts,
+  cudf::host_span<cudf::size_type const> page_row_offsets,
+  cudf::size_type total_rows,
+  rmm::cuda_stream_view stream)
+{
+  auto mr = cudf::get_current_device_resource_ref();
+
+  // Copy page-level row counts and offsets to device
+  auto row_counts  = cudf::detail::make_device_uvector_async(page_row_counts, stream, mr);
+  auto row_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr);
+
+  // Make a zeroed device vector to store page indices of each row
+  auto page_indices =
+    cudf::detail::make_zeroed_device_uvector_async<size_type>(total_rows, stream, mr);
+
+  // Scatter page indices at their first row's index
+  thrust::scatter_if(rmm::exec_policy_nosync(stream),
+                     thrust::counting_iterator<size_type>(0),
+                     thrust::counting_iterator<size_type>(row_counts.size()),
+                     row_offsets.begin(),
+                     row_counts.begin(),
+                     page_indices.begin());
+
+  // Inclusive scan with maximum to replace zeros with the (increasing) page index it belongs to.
+  // Page indices are scattered at their first row's index.
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                         page_indices.begin(),
+                         page_indices.end(),
+                         page_indices.begin(),
+                         cuda::maximum<size_type>());
+  return page_indices;
+}
+
+}  // namespace cudf::io::parquet::experimental::detail
\ No newline at end of file
diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp
new file mode 100644
index 00000000000..f53bede91c7
--- /dev/null
+++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "io/parquet/reader_impl_helpers.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::parquet::experimental::detail { + +using metadata_base = parquet::detail::metadata; + +/** + * @brief Compute if the page index is present in all parquet data sources for all columns + * + * @param file_metadatas Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param column_schema_indices Span of input column schema indices + * @return Boolean indicating if the page index is present in all parquet data sources for all + * columns + */ +[[nodiscard]] bool compute_has_page_index( + cudf::host_span file_metadatas, + cudf::host_span const> row_group_indices, + cudf::host_span column_schema_indices); + +/** + * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a + * given column schema index + * + * @param per_file_metadata Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param schema_idx Column's schema index + * @param stream CUDA stream + * @return Tuple of page row counts, page row offsets, and column chunk page (count) offsets + */ +[[nodiscard]] std::tuple, + cudf::detail::host_vector, + cudf::detail::host_vector> +compute_page_row_counts_and_offsets(cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx, + rmm::cuda_stream_view stream); + +/** + * @brief Compute page row offsets for a given column schema index + * + * @param per_file_metadata Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param schema_idx Column's schema index + * @param stream CUDA stream + * @return Tuple of page row offsets, number of pages, and the size of the largest page in this + * column + */ +[[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( + cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx); + +/** + * @brief Make a device vector where each row contains the index of the page it belongs to + * + * @param page_row_counts Span of page row counts + * @param page_row_offsets Span of page row offsets + * @param total_rows Total number of rows + * @param stream CUDA stream + * @return Device vector where each row contains the index of the page it belongs to + */ +[[nodiscard]] rmm::device_uvector make_page_indices_async( + cudf::host_span page_row_counts, + cudf::host_span page_row_offsets, + cudf::size_type total_rows, + rmm::cuda_stream_view stream); + +} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file From bc6f08b528ff5a77a8eb080d19b9f5edcda90f77 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 02:58:11 +0000 Subject: [PATCH 02/25] Templatize ColumnView type for compute data page mask --- .../experimental/hybrid_scan_helpers.hpp | 6 +- .../parquet/experimental/page_index_filter.cu | 79 ++++++++++--------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 1330353045e..6ad25641407 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -281,6 +281,9 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * Compute a vector of boolean vectors indicating which data pages 
need to be decoded to * construct each input column based on the row mask, one vector per column * + * @tparam ColumnView Type of the row mask column view - cudf::mutable_column_view for filter + * columns and cudf::column_view for payload columns + * * @param row_mask Boolean column indicating which rows need to be read after page-pruning * @param row_group_indices Input row groups indices * @param input_columns Input column information @@ -290,8 +293,9 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @return Boolean vector indicating which data pages need to be decoded to produce * the output table based on the input row mask across all input columns */ + template [[nodiscard]] std::vector compute_data_page_mask( - cudf::column_view row_mask, + ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index b5948197537..937b2dcfa6e 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -487,8 +487,9 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); } +template std::vector aggregate_reader_metadata::compute_data_page_mask( - cudf::column_view row_mask, + ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, @@ -565,13 +566,26 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( "Mismatch in total rows in input row mask and row groups", std::invalid_argument); - // Return if all rows are required or all are invalid. - if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows) == total_rows or - thrust::all_of(rmm::exec_policy(stream), - row_mask.begin() + row_mask_offset, - row_mask.begin() + row_mask_offset + total_rows, + if constexpr (cuda::std::is_same_v) { + if (row_mask.nullable()) { + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(row_mask_offset), + thrust::counting_iterator(row_mask_offset + total_rows), + [row_mask = row_mask.template begin(), + null_mask = row_mask.null_mask()] __device__(auto const row_idx) { + if (not bit_is_set(null_mask, row_idx)) { row_mask[row_idx] = true; } + }); + } + } else { + CUDF_EXPECTS(not row_mask.nullable(), "Row mask must not be nullable for payload columns"); + } + + // Return an empty vector if all rows are required or all are invalid. 
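+  // (An empty data page mask tells the caller to decode all pages; see the early return
+  // above, where an empty mask indicates all pages are required.)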
+ if (thrust::all_of(rmm::exec_policy(stream), + row_mask.template begin() + row_mask_offset, + row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return std::vector(total_pages, true); + return {}; } // Total number of surviving pages across all columns @@ -581,37 +595,10 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( std::vector>> data_page_mask_tasks; data_page_mask_tasks.reserve(num_columns); - // Host row mask validity and first bit offset - auto const [host_row_mask_validity, first_bit_offset] = [&] { - if (row_mask.nullable()) { - auto const first_word_idx = word_index(row_mask_offset); - auto const last_word_idx = word_index(row_mask_offset + total_rows); - auto const num_words = last_word_idx - first_word_idx + 1; - auto const max_words = num_bitmask_words(row_mask.size()) - first_word_idx - 1; - CUDF_EXPECTS(num_words <= max_words, - "Encountered unexpected number of bitmask words to copy from the row mask"); - return std::pair{ - cudf::detail::make_host_vector( - device_span(row_mask.null_mask() + first_word_idx, num_words), - stream), - intra_word_index(row_mask_offset)}; - } else { - // Empty vector if row mask is not nullable - return std::pair{cudf::detail::make_host_vector(0, stream), 0}; - } - }(); - - // Iterator for row mask validity - auto is_row_valid = cudf::detail::make_counting_transform_iterator( - first_bit_offset, - [is_nullable = row_mask.nullable(), nullmask = host_row_mask_validity.data()](auto bit_index) { - // Always valid if row mask is not nullable or check if the corresponding bit is set - return not is_nullable or bit_is_set(nullmask, bit_index); - }); - // Host row mask data auto const is_row_required = cudf::detail::make_host_vector( - device_span(row_mask.data() + row_mask_offset, total_rows), stream); + device_span(row_mask.template data() + row_mask_offset, total_rows), + stream); // For all columns, look up which pages contain at least one required row. i.e. // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the @@ -629,9 +616,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( size_t num_surviving_pages_this_column = 0; // For all rows for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { - // If this row is required or invalid, add its page index to the output - // list. - if (not is_row_valid[row_idx] or is_row_required[row_idx]) { + // If this row is required, add its page index to the output list. 
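+                          // (Invalid rows were folded into the mask above: for filter columns,
+                          // null rows are rewritten as required before this loop runs.)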
+ if (is_row_required[row_idx]) { // binary search to find the page index this row_idx belongs to and set // the page index to true page_indices auto const offsets = cudf::host_span( @@ -671,4 +657,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( return data_page_mask; } +// Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view +template std::vector aggregate_reader_metadata::compute_data_page_mask( + cudf::column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + +template std::vector aggregate_reader_metadata::compute_data_page_mask< + cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + } // namespace cudf::io::parquet::experimental::detail From 30dbbee03aa703b2a6391c44e38e2e99b49f1180 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 03:24:54 +0000 Subject: [PATCH 03/25] Minor refactor --- .../parquet/experimental/page_index_filter.cu | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 937b2dcfa6e..abf352d1266 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -566,6 +566,15 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( "Mismatch in total rows in input row mask and row groups", std::invalid_argument); + // Return an empty vector if all rows are invalid or all rows are required + if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or + thrust::all_of(rmm::exec_policy(stream), + row_mask.template begin() + row_mask_offset, + row_mask.template begin() + row_mask_offset + total_rows, + cuda::std::identity{})) { + return {}; + } + if constexpr (cuda::std::is_same_v) { if (row_mask.nullable()) { thrust::for_each(rmm::exec_policy_nosync(stream), @@ -577,15 +586,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( }); } } else { - CUDF_EXPECTS(not row_mask.nullable(), "Row mask must not be nullable for payload columns"); - } - - // Return an empty vector if all rows are required or all are invalid. 
-  if (thrust::all_of(rmm::exec_policy(stream),
-                     row_mask.template begin<bool>() + row_mask_offset,
-                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
-                     cuda::std::identity{})) {
-    return {};
+    CUDF_EXPECTS(not row_mask.nullable() or row_mask.null_count() == 0,
+                 "Row mask must not contain nulls for payload columns");
   }
 
   // Total number of surviving pages across all columns

From 73f64f218468c47145c4815fc71dc09a22dfcd5b Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 15 Oct 2025 22:47:44 +0000
Subject: [PATCH 04/25] Initial impl of GPU data page mask algorithm

---
 .../parquet/experimental/page_index_filter.cu | 292 +++++++++++++-----
 .../experimental/page_index_filter_utils.cu   |  28 +++-
 .../experimental/page_index_filter_utils.hpp  |  15 +++-
 3 files changed, 255 insertions(+), 80 deletions(-)

diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu
index abf352d1266..450dd2a3de7 100644
--- a/cpp/src/io/parquet/experimental/page_index_filter.cu
+++ b/cpp/src/io/parquet/experimental/page_index_filter.cu
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -487,6 +488,147 @@ std::unique_ptr<cudf::column> aggregate_reader_metadata::build_row_mask_with_pag
   return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr);
 }
 
+/**
+ * @brief Custom CUDA kernel that builds one mask level as a paired logical OR reduction of the
+ * level below it.
+ *
+ * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction.
+ * CUB's BlockReduce is unsuitable here as it reduces a block to a single element.
+ */
+__global__ void reduce_or_kernel(cudf::device_span<bool*> level_ptrs,
+                                 size_type current_level,
+                                 size_t current_level_size)  // Size of the source level
+{
+  // Global thread index
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+
+  auto const current_level_ptr = level_ptrs[current_level];
+  auto next_level_ptr          = level_ptrs[current_level + 1];
+
+  // The next level holds one element for each (up to) two elements of the current level
+  size_t next_level_size = (current_level_size + 1) / 2;
+
+  if (tid >= next_level_size) { return; }
+
+  size_t idx1 = tid * 2;
+  size_t idx2 = tid * 2 + 1;
+
+  // Perform the logical OR reduction and write to the next level's location. When the current
+  // level has an odd size, the last output has a single input, which is carried forward as-is
+  // (equivalent to ORing it with false).
+  next_level_ptr[tid] = (idx2 < current_level_size)
+                          ? (current_level_ptr[idx1] || current_level_ptr[idx2])
+                          : current_level_ptr[idx1];
+}
+
+/**
+ * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy.
+ * One thread handles the binary decomposition and query for one range [M, N).
+ *
+ * @param level_ptrs Device span of pointers, where level_ptrs[k] points to the start of the
+ * Level k mask
+ * @param page_offsets Device span of range boundaries, where range i is
+ * [page_offsets[i], page_offsets[i+1])
+ * @param results Pointer to device memory to store one boolean result per range (true if a set
+ * bit is found in the range).
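+ *
+ * For example (illustrative): with levels L0..L2 available, the range [3, 11) decomposes into
+ * the power-of-two aligned blocks [3,4), [4,8), [10,11) and [8,10), which are probed as
+ * L0[3], L2[1], L0[10] and L1[4]; the range survives if any probe finds a set entry.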
+ */
+__global__ void probe_hierarchical_masks_kernel(
+  cudf::device_span<bool*> level_ptrs,              // Pointers to start of each mask level
+  cudf::device_span<size_type const> page_offsets,  // Range boundary array
+  bool* results)
+{
+  auto const num_ranges = page_offsets.size() - 1;
+  size_t range_idx      = cudf::detail::grid_1d::global_thread_id();
+
+  if (range_idx >= num_ranges) { return; }
+
+  // Initialize result for this range to false (assuming no set bit)
+  results[range_idx] = false;
+
+  // Retrieve M and N for the current range [M, N)
+  size_type M = page_offsets[range_idx];
+  size_type N = page_offsets[range_idx + 1];
+
+  // If the range is empty or invalid, terminate
+  if (M >= N) { return; }
+
+  // Binary Decomposition Loop
+  while (M < N) {
+    // 1. Calculate the largest power of 2 that can align M up to the boundary.
+    //    This is determined by the Least Significant Bit (LSB) of M: the expression (M & -M)
+    //    gives the value of the LSB, which is the block size (2^k). When M == 0 there is no
+    //    LSB, so fall back to the largest power-of-two block that fits within [0, N).
+    size_t m_lsb_block_size =
+      (M == 0) ? (size_t{1} << (31 - __clz(N))) : static_cast<size_t>(M & -M);
+    size_t m_next_aligned = M + m_lsb_block_size;
+
+    // 2. Calculate the largest power of 2 block that can align N down to the boundary.
+    //    The expression (N & -N) gives the block size corresponding to N's alignment.
+    size_t n_lsb_block_size = N & -N;
+
+    // --- Decision Logic: Which side to consume? ---
+
+    // Block 1: M-aligned block (from M up to m_next_aligned)
+    size_t block1_size = m_next_aligned - M;
+
+    // Block 2: N-aligned block (from N - n_lsb_block_size up to N)
+    size_t block2_size = n_lsb_block_size;
+
+    if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) {
+      // The M-aligned block is fully contained in the range [M, N)
+
+      // block1_size is derived from an LSB, so it is always a power of 2: k1 = log2(block1_size)
+      size_t k1 = __ffs(block1_size) - 1;
+
+      // Calculate mask index: the starting point M is divided by the block size
+      size_t mask_idx = M / block1_size;
+
+      // Look up the mask value
+      if (level_ptrs[k1][mask_idx]) {
+        results[range_idx] = true;
+        return;  // Found a set bit, terminate for this range
+      }
+
+      // Advance M
+      M = m_next_aligned;
+    } else if (block2_size > 0 && N - block2_size >= M) {
+      // The N-aligned block is fully contained and does not overlap M's new position
+
+      // block2_size is also a power of 2: k2 = log2(block2_size)
+      size_t k2 = __ffs(block2_size) - 1;
+
+      // Calculate mask index
+      size_t mask_idx = (N - block2_size) / block2_size;
+
+      // Look up the mask value
+      if (level_ptrs[k2][mask_idx]) {
+        results[range_idx] = true;
+        return;  // Found a set bit, terminate for this range
+      }
+
+      // Backtrack N
+      N = N - block2_size;
+    } else {
+      // The remaining range is unaligned and small (or just 1 element).
+      // This happens when M and N are close and unaligned (e.g., [11, 13]).
+
+      // Prioritize M (1-row check) or N (1-row check) until they meet.
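+      // (Level 0 is the row mask itself, so this fallback probes individual rows.)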
+
+      // Check single row at M (Level 0)
+      if (level_ptrs[0][M]) {
+        results[range_idx] = true;
+        return;
+      }
+      M++;
+    }
+  }
+}
+
 template <typename ColumnView>
 std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   ColumnView const& row_mask,
   cudf::host_span<std::vector<size_type> const> row_group_indices,
   cudf::host_span<parquet::detail::input_column_info const> input_columns,
   cudf::size_type row_mask_offset,
@@ -500,6 +642,21 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   CUDF_EXPECTS(row_mask.type().id() == cudf::type_id::BOOL8,
                "Input row bitmask should be of type BOOL8");
 
+  auto const total_rows = total_rows_in_row_groups(row_group_indices);
+
+  // Return an empty vector if all rows are invalid or all rows are required
+  if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or
+      thrust::all_of(rmm::exec_policy(stream),
+                     row_mask.template begin<bool>() + row_mask_offset,
+                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
+                     cuda::std::identity{})) {
+    return {};
+  }
+
+  CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
+               "Mismatch in total rows in input row mask and row groups",
+               std::invalid_argument);
+
   auto const num_columns = input_columns.size();
 
   // Collect column schema indices from the input columns.
@@ -529,7 +686,7 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
     auto const schema_idx   = column_schema_indices.front();
     size_type col_num_pages = 0;
     std::tie(page_row_offsets, col_num_pages, max_page_size) =
-      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx);
+      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx, row_mask_offset);
     // Add 1 to include the 0th page's offset for each column
     col_page_offsets.emplace_back(col_num_pages + 1);
   } else {
@@ -542,8 +699,10 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
                   [&](auto const col_idx) {
                     page_row_offsets_tasks.emplace_back(
                       cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-                        return compute_page_row_offsets(
-                          per_file_metadata, row_group_indices, column_schema_indices[col_idx]);
+                        return compute_page_row_offsets(per_file_metadata,
+                                                        row_group_indices,
+                                                        column_schema_indices[col_idx],
+                                                        row_mask_offset);
                       }));
                   });
 
@@ -559,22 +718,9 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
     });
   }
 
-  auto const total_rows  = page_row_offsets.back();
   auto const total_pages = page_row_offsets.size() - num_columns;
 
-  CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
-               "Mismatch in total rows in input row mask and row groups",
-               std::invalid_argument);
-
-  // Return an empty vector if all rows are invalid or all rows are required
-  if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or
-      thrust::all_of(rmm::exec_policy(stream),
-                     row_mask.template begin<bool>() + row_mask_offset,
-                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
-                     cuda::std::identity{})) {
-    return {};
-  }
-
+  // Make sure all row_mask elements contain valid values even if they are nulls
   if constexpr (cuda::std::is_same_v<ColumnView, cudf::mutable_column_view>) {
     if (row_mask.nullable()) {
       thrust::for_each(rmm::exec_policy_nosync(stream),
@@ -590,71 +736,65 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
                 "Row mask must not contain nulls for payload columns");
   }
 
+  auto const mr = cudf::get_current_device_resource_ref();
+  auto const [level_offsets, total_levels_size] =
+    compute_row_mask_levels(row_mask.size(), max_page_size);
+  auto const total_levels = level_offsets.size();
-  // Total number of surviving pages across all columns
-  std::atomic<size_t> total_surviving_pages{0};
+  auto levels_data = rmm::device_uvector<bool>(total_levels_size, stream, mr);
-  // Tasks to compute data page mask for each column
-  std::vector<std::future<std::vector<bool>>> data_page_mask_tasks;
data_page_mask_tasks; - data_page_mask_tasks.reserve(num_columns); + auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); - // Host row mask data - auto const is_row_required = cudf::detail::make_host_vector( - device_span(row_mask.template data() + row_mask_offset, total_rows), - stream); - - // For all columns, look up which pages contain at least one required row. i.e. - // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the - // output list of byte ranges for the column. - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_columns), - [&](auto const col_idx) { - data_page_mask_tasks.emplace_back( - cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - // Construct a row indices mapping based on page row counts and offsets - auto const total_pages_this_column = - col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; - auto col_data_page_mask = std::vector(total_pages_this_column, false); - // Number of final filtered pages for this column - size_t num_surviving_pages_this_column = 0; - // For all rows - for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { - // If this row is required, add its page index to the output list. - if (is_row_required[row_idx]) { - // binary search to find the page index this row_idx belongs to and set - // the page index to true page_indices - auto const offsets = cudf::host_span( - page_row_offsets.data() + col_page_offsets[col_idx], - col_page_offsets[col_idx + 1] - col_page_offsets[col_idx]); - auto const page_itr = - std::upper_bound(offsets.begin(), offsets.end(), row_idx); - CUDF_EXPECTS(page_itr != offsets.begin() and page_itr != offsets.end(), - "Invalid page index"); - auto const page_idx = std::distance(offsets.begin(), page_itr) - 1; - col_data_page_mask[page_idx] = true; - num_surviving_pages_this_column++; - // Move row_idx to the last row of this page - row_idx = offsets[page_idx + 1] - 1; - } - } - total_surviving_pages.fetch_add(num_surviving_pages_this_column); - return col_data_page_mask; - })); + auto host_level_ptrs = cudf::detail::make_host_vector(total_levels, stream); + host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; + std::for_each(thrust::counting_iterator(1), + thrust::counting_iterator(total_levels), + [&](auto const level_idx) { + host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; }); + auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - // Vector to hold data page mask for each column - auto data_page_mask = std::vector(); - data_page_mask.reserve(total_pages); + { + auto const next_level_size = level_offsets[1]; + auto const current_level_size = row_mask.size(); + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs, 0, current_level_size); + stream.synchronize(); + } + + for (size_t current_level_idx = 1; current_level_idx < total_levels - 1; current_level_idx++) { + auto const next_level_size = + level_offsets[current_level_idx + 1] - level_offsets[current_level_idx]; + auto const current_level_size = + level_offsets[current_level_idx] - level_offsets[current_level_idx - 1]; + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs, current_level_idx, current_level_size); + stream.synchronize(); + } - std::for_each(data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [&](auto& task) { - auto col_data_page_mask = 
std::move(task).get(); - data_page_mask.insert(data_page_mask.end(), - std::make_move_iterator(col_data_page_mask.begin()), - std::make_move_iterator(col_data_page_mask.end())); - }); + auto const num_ranges = page_row_offsets.size() - 1; + rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); + auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); + { + cudf::detail::grid_1d config(num_ranges, 256, 1); + probe_hierarchical_masks_kernel<<>>( + device_level_ptrs, page_offsets, device_data_page_mask.data()); + } - CUDF_EXPECTS( - total_surviving_pages <= total_pages, - "Number of surviving pages must be less than or equal to the total number of input pages"); + auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + std::vector data_page_mask{}; + data_page_mask.reserve(total_pages); + thrust::host_vector gather_mask(num_ranges, true); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + [&](auto const range_idx) { + if (page_row_offsets[range_idx] < page_row_offsets[range_idx + 1]) { + data_page_mask.push_back(host_results[range_idx]); + } + }); return data_page_mask; } diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 3dfb8e69d51..0aa3d58a75e 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -146,7 +146,8 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil std::tuple, size_type, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx) + cudf::size_type schema_idx, + cudf::size_type row_mask_offset) { // Compute total number of row groups auto const total_row_groups = @@ -156,7 +157,7 @@ std::tuple, size_type, size_type> compute_page_row_offset [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); std::vector page_row_offsets; - page_row_offsets.push_back(0); + page_row_offsets.push_back(row_mask_offset); size_type max_page_size = 0; size_type num_pages = 0; @@ -232,4 +233,27 @@ rmm::device_uvector make_page_indices_async( return page_indices; } +std::pair, size_type> compute_row_mask_levels(cudf::size_type num_rows, + cudf::size_type max_page_size) +{ + std::vector level_offsets; + level_offsets.push_back(0); + + size_t current_size = (num_rows + 1) / 2; + size_t current_level = 1; + + while (current_size > 0) { + size_t block_size = 1ULL << current_level; + + level_offsets.push_back(current_size); + current_size += num_rows; + + if (std::cmp_greater_equal(block_size, max_page_size)) { break; } + + current_size = (current_size + 1) / 2; + current_level++; + } + return {std::move(level_offsets), current_size}; +} + } // namespace cudf::io::parquet::experimental::detail \ No newline at end of file diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index f53bede91c7..ef29b9be9a1 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -68,14 +68,15 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @param stream CUDA stream + * @param row_mask_offset 
Offset of the row mask * @return Tuple of page row offsets, number of pages, and the size of the largest page in this * column */ [[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx); + size_type schema_idx, + cudf::size_type row_mask_offset); /** * @brief Make a device vector where each row contains the index of the page it belongs to @@ -92,4 +93,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil cudf::size_type total_rows, rmm::cuda_stream_view stream); +/** + * @brief Compute the levels of the row mask + * + * @param num_rows Number of rows in the row mask + * @param max_page_size Maximum page size + * @return Pair of level offsets and total levels size + */ +[[nodiscard]] std::pair, size_type> compute_row_mask_levels( + cudf::size_type num_rows, cudf::size_type max_page_size); + } // namespace cudf::io::parquet::experimental::detail \ No newline at end of file From de651839bf3c7c064b6cf93c119e35c8b1a79f08 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 00:28:26 +0000 Subject: [PATCH 05/25] Minor improvements --- .../parquet/experimental/page_index_filter.cu | 323 +++++++++--------- 1 file changed, 154 insertions(+), 169 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 450dd2a3de7..2534a0f05e4 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -395,6 +395,139 @@ struct page_stats_caster : public stats_caster_base { } }; +/** + * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. + * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. + * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. + * Cooperative Groups is used here for robust global thread ID calculation. + */ +CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, + size_type current_level, + size_t current_level_size) // Size of the source level +{ + // Use Cooperative Groups to get the global thread index (tid) + auto tid = cudf::detail::grid_1d::global_thread_id(); + + auto const current_level_ptr = level_ptrs[current_level]; + auto next_level_ptr = level_ptrs[current_level + 1]; + + // The reduction only needs to run for half the size of the current level + size_t next_level_size = (current_level_size + 1) / 2; + + if (tid < next_level_size) { + size_t idx1 = tid * 2; + size_t idx2 = tid * 2 + 1; + + // Perform the logical OR reduction and write to the next level's location + next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; + } + + // Handle the odd-sized remaining element if current_level_size is odd + if (current_level_size % 2 != 0 && tid == next_level_size) { + // The last element is carried forward (ORed with false) + next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; + } +} + +/** + * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. + * One thread handles the binary decomposition and query for one range [M, N). + * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of + * Level k mask. + * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). + * @param num_ranges The number of ranges to process. 
+ * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in + * the range). + */ +CUDF_KERNEL void probe_hierarchical_masks_kernel( + cudf::device_span level_ptrs, // Pointers to start of each mask level + cudf::device_span page_offsets, // Range boundary array + bool* results) +{ + auto const num_ranges = page_offsets.size() - 1; + size_t range_idx = cudf::detail::grid_1d::global_thread_id(); + + if (range_idx >= num_ranges) { return; } + + // Initialize result for this range to false (assuming no set bit) + results[range_idx] = false; + + // Retrieve M and N for the current range [M, N) + size_type M = page_offsets[range_idx]; + size_type N = page_offsets[range_idx + 1]; + + // If the range is empty or invalid, terminate + if (M >= N) { return; } + + // Binary Decomposition Loop + while (M < N) { + // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. + // Block size is determined by the Least Significant Bit (LSB) of M. + // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. + // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position (k). + size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) + size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k + + // 2. N Alignment: Find the largest power-of-two block that aligns N down. + // N & -N gives the LSB block size *if* N were the start, but we use it as the largest + // possible size that evenly divides N. + size_t n_block_size = N & -N; + + // The largest block size we can consume from the current range [M, N) + size_t max_block_size = 0; + size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) + size_t mask_index = 0; + + // --- Core Decomposition Logic --- + + // Check the M side alignment block: [M, M + m_block_size) + // This is only valid if M + m_block_size <= N (the block fits). + if (M > 0 && M + m_block_size <= N) { + max_block_size = m_block_size; + mask_level = m_lsb_position - 1; + mask_index = M >> mask_level; // M / 2^k + } + + // Check the N side alignment block: [N - n_block_size, N) + // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M + // block. 
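+    // (Illustration with assumed values: for N = 44 = 0b101100, N & -N
+    //  isolates the lowest set bit, 0b100 = 4, and __ffs(44) returns its
+    //  1-based position 3, so 1 << (__ffs(44) - 1) recovers the same 4.
+    //  Note that __ffs operates on 32-bit values; a 64-bit operand would
+    //  need __ffsll, though these block sizes derive from size_type row
+    //  counts and fit comfortably in 32 bits.)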
+ if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { + // If the N block is larger or we are at the end, prioritize the N block + max_block_size = n_block_size; + mask_level = __ffs(n_block_size) - 1; + mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k + } + + // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) + // If max_block_size is 0 or too large, reduce by 1 row (Level 0) + if (max_block_size == 0 || max_block_size > N - M) { + max_block_size = 1; + mask_level = 0; + mask_index = M; // Level 0 index is just M + } + + // --- Query Mask and Advance --- + + // Look up the mask value at the determined level and index + if (level_ptrs[mask_level][mask_index]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Advance M or N based on which block was consumed (whichever has the smaller index) + if (mask_level == 0) { + // Consumed a single row + M += max_block_size; + } else if (M == mask_index * max_block_size) { + // Consumed an M-aligned block (moving M up) + M += max_block_size; + } else { + // Consumed an N-aligned block (moving N down) + N -= max_block_size; + } + } +} + } // namespace std::unique_ptr aggregate_reader_metadata::build_row_mask_with_page_index_stats( @@ -488,147 +621,6 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); } -/** - * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. - * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. - * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. - * Cooperative Groups is used here for robust global thread ID calculation. - */ -__global__ void reduce_or_kernel(cudf::device_span level_ptrs, - size_type current_level, - size_t current_level_size) // Size of the source level -{ - // Use Cooperative Groups to get the global thread index (tid) - auto tid = cudf::detail::grid_1d::global_thread_id(); - - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; - - // The reduction only needs to run for half the size of the current level - size_t next_level_size = (current_level_size + 1) / 2; - - if (tid < next_level_size) { - size_t idx1 = tid * 2; - size_t idx2 = tid * 2 + 1; - - // Perform the logical OR reduction and write to the next level's location - next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; - } - - // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 != 0 && tid == next_level_size) { - // The last element is carried forward (ORed with false) - next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; - } -} - -/** - * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. - * One thread handles the binary decomposition and query for one range [M, N). - * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of - * Level k mask. - * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). - * @param num_ranges The number of ranges to process. - * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in - * the range). 
- */ -__global__ void probe_hierarchical_masks_kernel( - cudf::device_span level_ptrs, // Pointers to start of each mask level - cudf::device_span page_offsets, // Range boundary array - bool* results) -{ - auto const num_ranges = page_offsets.size() - 1; - size_t range_idx = cudf::detail::grid_1d::global_thread_id(); - - if (range_idx > num_ranges) { return; } - - // Initialize result for this range to false (assuming no set bit) - results[range_idx] = false; - - // Retrieve M and N for the current range [M, N) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; - - // If the range is empty or invalid, terminate - if (M >= N) { return; } - - // Binary Decomposition Loop - while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; - - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; - - // --- Decision Logic: Which side to consume? --- - - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; - - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; - - // Block 3: The remaining central range block - - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) - - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; - - // Calculate mask index: The starting point M is divided by the block size. - size_t mask_idx = M / block1_size; - - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position - - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; - - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; - - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). - - // Prioritize M (1-row check) or N (1-row check) until they meet. 
- - // Check single row at M (Level 0) - if (level_ptrs[0][M]) { - results[range_idx] = true; - return; - } - M++; - } - } -} - template std::vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, @@ -750,27 +742,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( [&](auto const level_idx) { host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; }); - auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - - { - auto const next_level_size = level_offsets[1]; - auto const current_level_size = row_mask.size(); - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs, 0, current_level_size); - stream.synchronize(); - } - for (size_t current_level_idx = 1; current_level_idx < total_levels - 1; current_level_idx++) { - auto const next_level_size = - level_offsets[current_level_idx + 1] - level_offsets[current_level_idx]; - auto const current_level_size = - level_offsets[current_level_idx] - level_offsets[current_level_idx - 1]; - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs, current_level_idx, current_level_size); - stream.synchronize(); - } + auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); + auto current_level_size = row_mask.size(); + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(total_levels - 1), + [&](auto const level) { + auto const next_level_size = level_offsets[level + 1] - level_offsets[level]; + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs.data(), level, current_level_size); + current_level_size = next_level_size; + }); auto const num_ranges = page_row_offsets.size() - 1; rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); @@ -784,18 +768,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_level_ptrs, page_offsets, device_data_page_mask.data()); } - auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + auto host_results_iter = host_results.begin(); std::vector data_page_mask{}; data_page_mask.reserve(total_pages); - thrust::host_vector gather_mask(num_ranges, true); std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_ranges), - [&](auto const range_idx) { - if (page_row_offsets[range_idx] < page_row_offsets[range_idx + 1]) { - data_page_mask.push_back(host_results[range_idx]); - } + thrust::counting_iterator(num_columns), + [&](auto col_idx) { + auto const col_num_pages = + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; + data_page_mask.insert( + data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); + std::advance(host_results_iter, col_num_pages + 1); }); - return data_page_mask; } From f6b79c797371d94432fe0d90dcd8423dccedcaa1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 00:31:53 +0000 Subject: [PATCH 06/25] Add cmake changes --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3eb82a113f..c189575c4ed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -546,6 +546,7 @@ add_library( src/io/parquet/experimental/hybrid_scan_impl.cpp src/io/parquet/experimental/hybrid_scan_preprocess.cu src/io/parquet/experimental/page_index_filter.cu + 
src/io/parquet/experimental/page_index_filter_utils.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu From aef9d646558d97ab3739831dbc26c1e2e48bc3b4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 01:35:07 +0000 Subject: [PATCH 07/25] Revert to old probe impl --- .../parquet/experimental/page_index_filter.cu | 129 ++++++++++-------- 1 file changed, 69 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 2534a0f05e4..36983d706a0 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -461,69 +461,77 @@ CUDF_KERNEL void probe_hierarchical_masks_kernel( // Binary Decomposition Loop while (M < N) { - // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. - // Block size is determined by the Least Significant Bit (LSB) of M. - // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. - // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position (k). - size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) - size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k - - // 2. N Alignment: Find the largest power-of-two block that aligns N down. - // N & -N gives the LSB block size *if* N were the start, but we use it as the largest - // possible size that evenly divides N. - size_t n_block_size = N & -N; - - // The largest block size we can consume from the current range [M, N) - size_t max_block_size = 0; - size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) - size_t mask_index = 0; - - // --- Core Decomposition Logic --- - - // Check the M side alignment block: [M, M + m_block_size) - // This is only valid if M + m_block_size <= N (the block fits). - if (M > 0 && M + m_block_size <= N) { - max_block_size = m_block_size; - mask_level = m_lsb_position - 1; - mask_index = M >> mask_level; // M / 2^k - } + // 1. Calculate the largest power of 2 that can align M up to the boundary. + // This is determined by the Least Significant Bit (LSB) of M. + // If M=0, LSB is usually defined as the full size, but here M is typically > 0 + // or we handle M=0 implicitly by the full range check. + // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). + size_t m_lsb_block_size = (M == 0) ? N : (M & -M); + size_t m_next_aligned = M + m_lsb_block_size; - // Check the N side alignment block: [N - n_block_size, N) - // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M - // block. - if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { - // If the N block is larger or we are at the end, prioritize the N block - max_block_size = n_block_size; - mask_level = __ffs(n_block_size) - 1; - mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k - } + // 2. Calculate the largest power of 2 block that can align N down to the boundary. + // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. + // The expression (N & -N) gives the block size corresponding to N's alignment. + // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). 
+ size_t n_lsb_block_size = N & -N; - // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) - // If max_block_size is 0 or too large, reduce by 1 row (Level 0) - if (max_block_size == 0 || max_block_size > N - M) { - max_block_size = 1; - mask_level = 0; - mask_index = M; // Level 0 index is just M - } + // --- Decision Logic: Which side to consume? --- - // --- Query Mask and Advance --- + // Block 1: M-aligned block (from M up to m_next_aligned) + size_t block1_size = m_next_aligned - M; - // Look up the mask value at the determined level and index - if (level_ptrs[mask_level][mask_index]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } + // Block 2: N-aligned block (from N - n_lsb_block_size up to N) + size_t block2_size = n_lsb_block_size; + + // Block 3: The remaining central range block + + if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { + // If the M-aligned block is fully contained in the range [M, N) + + // Check if block1_size is 2^k. k = log2(block1_size). + // Since block1_size is based on LSB, it is always a power of 2. + size_t k1 = __ffs(block1_size) - 1; + + // Calculate mask index: The starting point M is divided by the block size. + size_t mask_idx = M / block1_size; + + // Look up the mask value + if (level_ptrs[k1][mask_idx]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Advance M + M = m_next_aligned; + } else if (block2_size > 0 && N - block2_size >= M) { + // If the N-aligned block is fully contained and does not overlap M's new position - // Advance M or N based on which block was consumed (whichever has the smaller index) - if (mask_level == 0) { - // Consumed a single row - M += max_block_size; - } else if (M == mask_index * max_block_size) { - // Consumed an M-aligned block (moving M up) - M += max_block_size; + // Check if block2_size is 2^k. k = log2(block2_size). + size_t k2 = __ffs(block2_size) - 1; + + // Calculate mask index + size_t mask_idx = (N - block2_size) / block2_size; + + // Look up the mask value + if (level_ptrs[k2][mask_idx]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Backtrack N + N = N - block2_size; } else { - // Consumed an N-aligned block (moving N down) - N -= max_block_size; + // The remaining range is unaligned and small (or just 1 element). + // This happens when M and N are close and unaligned (e.g., [11, 13]). + + // Prioritize M (1-row check) or N (1-row check) until they meet. 
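+      // (Tracing [11, 13): the M-aligned branch above probes level 0 at
+      //  index 11 and advances M to 12; the N-aligned branch then probes
+      //  level 0 at index 12 and lowers N to 12, ending the loop. The
+      //  single-row walk below is a safety net for any remainder the two
+      //  aligned branches cannot consume.)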
+ + // Check single row at M (Level 0) + if (level_ptrs[0][M]) { + results[range_idx] = true; + return; + } + M++; } } } @@ -768,10 +776,11 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_level_ptrs, page_offsets, device_data_page_mask.data()); } - auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); - auto host_results_iter = host_results.begin(); + auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; data_page_mask.reserve(total_pages); + stream.synchronize(); + auto host_results_iter = host_results.begin(); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { @@ -779,7 +788,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; data_page_mask.insert( data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); - std::advance(host_results_iter, col_num_pages + 1); + host_results_iter += col_num_pages + 1; }); return data_page_mask; } From ebf460a620c463f805c4ce2c6d14406a4d777ff5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 01:43:26 +0000 Subject: [PATCH 08/25] style fix --- cpp/src/io/parquet/experimental/page_index_filter_utils.cu | 2 +- cpp/src/io/parquet/experimental/page_index_filter_utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 0aa3d58a75e..ee12761dcd2 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -256,4 +256,4 @@ std::pair, size_type> compute_row_mask_levels(cudf::size_ return {std::move(level_offsets), current_size}; } -} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index ef29b9be9a1..03c857c2533 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -103,4 +103,4 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil [[nodiscard]] std::pair, size_type> compute_row_mask_levels( cudf::size_type num_rows, cudf::size_type max_page_size); -} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file +} // namespace cudf::io::parquet::experimental::detail From c62f6b794b8af95cbe338a1364b5f53967cfaedf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:10:14 +0000 Subject: [PATCH 09/25] Use functors --- .../parquet/experimental/page_index_filter.cu | 224 ++++++++---------- .../experimental/page_index_filter_utils.cu | 7 +- 2 files changed, 108 insertions(+), 123 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 36983d706a0..cba29876774 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -401,33 +402,28 @@ struct page_stats_caster : public stats_caster_base { * CUB's BlockReduce is unsuitable here as it 
reduces a block to a single element. * Cooperative Groups is used here for robust global thread ID calculation. */ -CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, - size_type current_level, - size_t current_level_size) // Size of the source level -{ - // Use Cooperative Groups to get the global thread index (tid) - auto tid = cudf::detail::grid_1d::global_thread_id(); - - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; - - // The reduction only needs to run for half the size of the current level - size_t next_level_size = (current_level_size + 1) / 2; - - if (tid < next_level_size) { - size_t idx1 = tid * 2; - size_t idx2 = tid * 2 + 1; +struct compute_next_level_functor { + cudf::device_span const level_ptrs; + cudf::size_type const current_level; + cudf::size_type const current_level_size; + cudf::size_type const next_level_size; - // Perform the logical OR reduction and write to the next level's location - next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; - } + __device__ void operator()(cudf::size_type next_level_index) const noexcept + { + auto const current_level_ptr = level_ptrs.begin()[current_level]; + auto next_level_ptr = level_ptrs.begin()[current_level + 1]; - // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 != 0 && tid == next_level_size) { - // The last element is carried forward (ORed with false) - next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; + // Handle the odd-sized remaining element if current_level_size is odd + if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { + // The last element is carried forward (ORed with false) + next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; + } else { + // Perform the logical OR reduction and write to the next level's location + next_level_ptr[next_level_index] = + current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; + } } -} +}; /** * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. @@ -439,102 +435,94 @@ CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in * the range). */ -CUDF_KERNEL void probe_hierarchical_masks_kernel( - cudf::device_span level_ptrs, // Pointers to start of each mask level - cudf::device_span page_offsets, // Range boundary array - bool* results) -{ - auto const num_ranges = page_offsets.size() - 1; - size_t range_idx = cudf::detail::grid_1d::global_thread_id(); +struct probe_masks_functor { + cudf::device_span level_ptrs; + cudf::device_span page_offsets; - if (range_idx >= num_ranges) { return; } + __device__ bool operator()(cudf::size_type range_idx) const noexcept + { + auto const num_ranges = page_offsets.size() - 1; - // Initialize result for this range to false (assuming no set bit) - results[range_idx] = false; + // Retrieve M and N for the current range [M, N) + size_type M = page_offsets[range_idx]; + size_type N = page_offsets[range_idx + 1]; - // Retrieve M and N for the current range [M, N) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; + // If the range is empty or invalid, terminate + if (M >= N) { return false; } - // If the range is empty or invalid, terminate - if (M >= N) { return; } + // Binary Decomposition Loop + while (M < N) { + // 1. 
Calculate the largest power of 2 that can align M up to the boundary. + // This is determined by the Least Significant Bit (LSB) of M. + // If M=0, LSB is usually defined as the full size, but here M is typically > 0 + // or we handle M=0 implicitly by the full range check. + // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). + size_t m_lsb_block_size = (M == 0) ? N : (M & -M); + size_t m_next_aligned = M + m_lsb_block_size; - // Binary Decomposition Loop - while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; + // 2. Calculate the largest power of 2 block that can align N down to the boundary. + // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. + // The expression (N & -N) gives the block size corresponding to N's alignment. + // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). + size_t n_lsb_block_size = N & -N; - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; + // --- Decision Logic: Which side to consume? --- - // --- Decision Logic: Which side to consume? --- + // Block 1: M-aligned block (from M up to m_next_aligned) + size_t block1_size = m_next_aligned - M; - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; + // Block 2: N-aligned block (from N - n_lsb_block_size up to N) + size_t block2_size = n_lsb_block_size; - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; + // Block 3: The remaining central range block - // Block 3: The remaining central range block + if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { + // If the M-aligned block is fully contained in the range [M, N) - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) + // Check if block1_size is 2^k. k = log2(block1_size). + // Since block1_size is based on LSB, it is always a power of 2. + size_t k1 = __ffs(block1_size) - 1; - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; + // Calculate mask index: The starting point M is divided by the block size. + size_t mask_idx = M / block1_size; - // Calculate mask index: The starting point M is divided by the block size. 
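+        // (e.g., M = 24 with block1_size = 8 = 2^3 gives k1 = 3 and
+        //  mask_idx = 24 / 8 = 3, the level-3 entry summarizing rows [24, 32))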
- size_t mask_idx = M / block1_size; + // Look up the mask value + if (level_ptrs[k1][mask_idx]) { + return true; // Found a set bit, terminate for this range + } - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } + // Advance M + M = m_next_aligned; + } else if (block2_size > 0 && N - block2_size >= M) { + // If the N-aligned block is fully contained and does not overlap M's new position - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position + // Check if block2_size is 2^k. k = log2(block2_size). + size_t k2 = __ffs(block2_size) - 1; - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; + // Calculate mask index + size_t mask_idx = (N - block2_size) / block2_size; - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; + // Look up the mask value + if (level_ptrs[k2][mask_idx]) { + return true; // Found a set bit, terminate for this range + } - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). + // Backtrack N + N = N - block2_size; + } else { + // The remaining range is unaligned and small (or just 1 element). + // This happens when M and N are close and unaligned (e.g., [11, 13]). - // Prioritize M (1-row check) or N (1-row check) until they meet. + // Prioritize M (1-row check) or N (1-row check) until they meet. 
- // Check single row at M (Level 0) - if (level_ptrs[0][M]) { - results[range_idx] = true; - return; + // Check single row at M (Level 0) + if (level_ptrs[0][M]) { return true; } + M++; } - M++; } + return false; } -} +}; } // namespace @@ -739,42 +727,38 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const mr = cudf::get_current_device_resource_ref(); auto const [level_offsets, total_levels_size] = compute_row_mask_levels(row_mask.size(), max_page_size); - auto const total_levels = level_offsets.size(); + auto const num_levels = static_cast(level_offsets.size()); auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); - auto host_level_ptrs = cudf::detail::make_host_vector(total_levels, stream); + auto host_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; - std::for_each(thrust::counting_iterator(1), - thrust::counting_iterator(total_levels), - [&](auto const level_idx) { - host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; - }); + std::for_each( + thrust::counting_iterator(1), thrust::counting_iterator(num_levels), [&](auto const level_idx) { + host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; + }); auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); auto current_level_size = row_mask.size(); std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(total_levels - 1), - [&](auto const level) { - auto const next_level_size = level_offsets[level + 1] - level_offsets[level]; - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs.data(), level, current_level_size); + thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { + auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + compute_next_level_functor{device_level_ptrs, level, current_level_size, next_level_size}); current_level_size = next_level_size; }); - auto const num_ranges = page_row_offsets.size() - 1; + auto const num_ranges = static_cast(page_row_offsets.size() - 1); rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - { - cudf::detail::grid_1d config(num_ranges, 256, 1); - probe_hierarchical_masks_kernel<<>>( - device_level_ptrs, page_offsets, device_data_page_mask.data()); - } + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + device_data_page_mask.begin(), + probe_masks_functor{device_level_ptrs, page_offsets}); auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index ee12761dcd2..d64d3eaf313 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -18,6 +18,7 @@ #include "page_index_filter_utils.hpp" #include +#include #include #include @@ -239,18 +240,18 @@ std::pair, size_type> compute_row_mask_levels(cudf::size_ std::vector level_offsets; 
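   // (e.g., with num_rows = 10 and max_page_size = 4 the loop below records
   //  the cumulative offsets [0, 5, 8]: level 1 holds 5 pair-wise ORs of the
   //  10 rows, level 2 holds ceil(5 / 2) = 3, and iteration stops once the
   //  block size reaches max_page_size.)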
level_offsets.push_back(0); - size_t current_size = (num_rows + 1) / 2; + size_t current_size = cudf::util::div_rounding_up_unsafe(num_rows, 2); size_t current_level = 1; while (current_size > 0) { - size_t block_size = 1ULL << current_level; + size_t block_size = size_t{1} << current_level; level_offsets.push_back(current_size); current_size += num_rows; if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - current_size = (current_size + 1) / 2; + current_size = cudf::util::div_rounding_up_unsafe(current_size, 2); current_level++; } return {std::move(level_offsets), current_size}; From f9e8f85d4100b7dff87a2021d431bafd8f5541e4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:12:48 +0000 Subject: [PATCH 10/25] Optimize for the chunked reader --- cpp/src/io/parquet/experimental/page_index_filter.cu | 12 +++++------- .../parquet/experimental/page_index_filter_utils.hpp | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index cba29876774..35a47c3f3c4 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -674,7 +674,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const schema_idx = column_schema_indices.front(); size_type col_num_pages = 0; std::tie(page_row_offsets, col_num_pages, max_page_size) = - compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx, row_mask_offset); + compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); // Add 1 to include the the 0th page's offset for each column col_page_offsets.emplace_back(col_num_pages + 1); } else { @@ -687,10 +687,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( [&](auto const col_idx) { page_row_offsets_tasks.emplace_back( cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - return compute_page_row_offsets(per_file_metadata, - row_group_indices, - column_schema_indices[col_idx], - row_mask_offset); + return compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx]); })); }); @@ -726,7 +724,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const mr = cudf::get_current_device_resource_ref(); auto const [level_offsets, total_levels_size] = - compute_row_mask_levels(row_mask.size(), max_page_size); + compute_row_mask_levels(total_rows, max_page_size); auto const num_levels = static_cast(level_offsets.size()); auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); @@ -739,7 +737,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( }); auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - auto current_level_size = row_mask.size(); + auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 03c857c2533..c84746e07ac 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -76,7 +76,7 @@ compute_page_row_counts_and_offsets(cudf::host_span 
per_fil cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, size_type schema_idx, - cudf::size_type row_mask_offset); + cudf::size_type row_mask_offset = 0); /** * @brief Make a device vector where each row contains the index of the page it belongs to From 9ea83f5fc4ff22b98e2c038f95ab95abf792c8e4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:56:10 +0000 Subject: [PATCH 11/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 35a47c3f3c4..6b948e8f3ea 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -403,15 +403,15 @@ struct page_stats_caster : public stats_caster_base { * Cooperative Groups is used here for robust global thread ID calculation. */ struct compute_next_level_functor { - cudf::device_span const level_ptrs; + bool** const level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; cudf::size_type const next_level_size; __device__ void operator()(cudf::size_type next_level_index) const noexcept { - auto const current_level_ptr = level_ptrs.begin()[current_level]; - auto next_level_ptr = level_ptrs.begin()[current_level + 1]; + auto const current_level_ptr = level_ptrs[current_level]; + auto next_level_ptr = level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { @@ -436,13 +436,12 @@ struct compute_next_level_functor { * the range). 
*/ struct probe_masks_functor { - cudf::device_span level_ptrs; - cudf::device_span page_offsets; + bool** const level_ptrs; + cudf::size_type const* const page_offsets; + cudf::size_type const num_ranges; __device__ bool operator()(cudf::size_type range_idx) const noexcept { - auto const num_ranges = page_offsets.size() - 1; - // Retrieve M and N for the current range [M, N) size_type M = page_offsets[range_idx]; size_type N = page_offsets[range_idx + 1]; @@ -741,11 +740,11 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); - thrust::for_each( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(next_level_size), - compute_next_level_functor{device_level_ptrs, level, current_level_size, next_level_size}); + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + compute_next_level_functor{ + device_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); @@ -756,7 +755,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( thrust::counting_iterator(0), thrust::counting_iterator(num_ranges), device_data_page_mask.begin(), - probe_masks_functor{device_level_ptrs, page_offsets}); + probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; From 38a640bc70b1ea44deefc2778e1bf54a7d8e5183 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 05:24:40 +0000 Subject: [PATCH 12/25] Small optimizations --- .../experimental/hybrid_scan_chunking.cu | 10 +++--- .../experimental/hybrid_scan_helpers.hpp | 2 +- .../parquet/experimental/hybrid_scan_impl.cpp | 16 ++++----- .../parquet/experimental/hybrid_scan_impl.hpp | 12 +++---- .../experimental/hybrid_scan_preprocess.cu | 2 +- .../parquet/experimental/page_index_filter.cu | 36 +++++++++---------- .../experimental/page_index_filter_utils.cu | 34 +++++++----------- .../experimental/page_index_filter_utils.hpp | 4 +-- 8 files changed, 51 insertions(+), 65 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu index 7e824a89c60..41129c6b5b8 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -41,9 +41,10 @@ namespace cudf::io::parquet::experimental::detail { using parquet::detail::ColumnChunkDesc; using parquet::detail::pass_intermediate_data; -void hybrid_scan_reader_impl::handle_chunking(read_mode mode, - std::vector column_chunk_buffers, - std::vector const& data_page_mask) +void hybrid_scan_reader_impl::handle_chunking( + read_mode mode, + std::vector&& column_chunk_buffers, + cudf::host_span data_page_mask) { // if this is our first time in here, setup the first pass. 
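   // (On the signature above: the chunk buffers arrive by rvalue reference and
   //  are consumed by this call, while the page mask is only read and is
   //  borrowed as a non-owning host_span, so neither is copied per pass.)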
if (!_pass_itm_data) { @@ -87,7 +88,8 @@ void hybrid_scan_reader_impl::handle_chunking(read_mode mode, setup_next_subpass(mode); } -void hybrid_scan_reader_impl::setup_next_pass(std::vector column_chunk_buffers) +void hybrid_scan_reader_impl::setup_next_pass( + std::vector&& column_chunk_buffers) { auto const num_passes = _file_itm_data.num_passes(); CUDF_EXPECTS(num_passes == 1, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 6ad25641407..eaa8860ec98 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -294,7 +294,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * the output table based on the input row mask across all input columns */ template - [[nodiscard]] std::vector compute_data_page_mask( + [[nodiscard]] cudf::detail::host_vector compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 1915cce1073..266f6288424 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -455,7 +455,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, stream); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -485,7 +485,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, stream); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -524,7 +524,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, _stream); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -575,7 +575,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, _stream); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -656,7 +656,7 @@ void hybrid_scan_reader_impl::prepare_data( read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - std::vector const& data_page_mask) + cudf::host_span data_page_mask) { // if we have not preprocessed at the whole-file level, do that now if (not _file_preprocessed) { @@ -885,7 +885,7 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( } } -void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_page_mask) +void hybrid_scan_reader_impl::set_pass_page_mask(cudf::host_span data_page_mask) { auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; @@ -926,10 +926,6 @@ void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_p } }); - // Make sure we inserted exactly the number of data pages for this column - CUDF_EXPECTS(num_inserted_data_pages == data_page_mask.size(), - "Encountered mismatch in number of data pages and page mask size"); - // Make sure we inserted exactly the number of pages for this pass CUDF_EXPECTS(_pass_page_mask.size() == pass->pages.size(), "Encountered mismatch in number of pass pages and page mask size"); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 9fbeea9a58d..9c6fd5cca45 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -266,7 +266,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param data_page_mask Input data page mask from page-pruning step */ - void set_pass_page_mask(std::vector const& data_page_mask); + void set_pass_page_mask(cudf::host_span data_page_mask); /** * @brief Select the columns to be read based on the read mode @@ -301,7 +301,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { void prepare_data(read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - std::vector const& data_page_mask); + cudf::host_span data_page_mask); /** * @brief Create descriptors for filter column chunks and decode dictionary page headers @@ -342,8 +342,8 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * @param data_page_mask Input data page mask from page-pruning step for the current pass */ void handle_chunking(read_mode mode, - std::vector column_chunk_buffers, - std::vector const& data_page_mask); + std::vector&& column_chunk_buffers, + cudf::host_span data_page_mask); /** * @brief Setup step for the next input read pass. @@ -353,7 +353,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param column_chunk_buffers Device buffers containing column chunk data */ - void setup_next_pass(std::vector column_chunk_buffers); + void setup_next_pass(std::vector&& column_chunk_buffers); /** * @brief Setup pointers to columns chunks to be processed for this pass. 
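The switch from std::vector<bool> to cudf::detail::host_vector<bool> in these interfaces also makes the mask storage pinned, which is what allows the device-to-host copy of the probe results to be issued asynchronously. A sketch of the pattern, reusing the calls that appear elsewhere in this series (the surrounding scaffolding is illustrative):

  // Pinned host buffer filled by an async D2H copy; valid only after sync
  auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream);
  // Host-side setup that does not touch host_results can overlap the copy
  auto data_page_mask = cudf::detail::make_empty_host_vector<bool>(total_pages, stream);
  stream.synchronize();  // host_results may be read from here on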
@@ -369,7 +369,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param column_chunk_buffers Device buffers containing column chunk data */ - void setup_compressed_data(std::vector column_chunk_buffers); + void setup_compressed_data(std::vector&& column_chunk_buffers); /** * @brief Reset the internal state of the reader. diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu index 5892cfb0fa9..2798827a234 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu @@ -183,7 +183,7 @@ bool hybrid_scan_reader_impl::setup_column_chunks() } void hybrid_scan_reader_impl::setup_compressed_data( - std::vector column_chunk_buffers) + std::vector&& column_chunk_buffers) { auto& pass = *_pass_itm_data; diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6b948e8f3ea..c28731dc689 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -539,8 +539,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag if (row_group_indices.empty()) { return cudf::make_empty_column(cudf::type_id::BOOL8); } // Check if we have page index for all columns in all row groups - auto const has_page_index = - compute_has_page_index(per_file_metadata, row_group_indices, output_column_schemas); + auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return if page index is not present CUDF_EXPECTS(has_page_index, @@ -617,7 +616,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag } template -std::vector aggregate_reader_metadata::compute_data_page_mask( +cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, @@ -637,7 +636,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( row_mask.template begin() + row_mask_offset, row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return {}; + return cudf::detail::make_empty_host_vector(0, stream); } CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(), @@ -652,13 +651,13 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { return col.schema_idx; }); - auto const has_page_index = - compute_has_page_index(per_file_metadata, row_group_indices, column_schema_indices); + auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return early if page index is not present if (not has_page_index) { CUDF_LOG_WARN("Encountered missing Parquet page index for one or more output columns"); - return {}; // An empty data page mask indicates all pages are required + return cudf::detail::make_empty_host_vector( + 0, stream); // An empty data page mask indicates all pages are required } // Compute page row offsets and column chunk page offsets for each column @@ -757,11 +756,10 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_data_page_mask.begin(), probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); - auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); - std::vector data_page_mask{}; - 
data_page_mask.reserve(total_pages); - stream.synchronize(); + auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); + auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); + stream.synchronize(); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { @@ -775,14 +773,14 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( } // Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view -template std::vector aggregate_reader_metadata::compute_data_page_mask( - cudf::column_view const& row_mask, - cudf::host_span const> row_group_indices, - cudf::host_span input_columns, - cudf::size_type row_mask_offset, - rmm::cuda_stream_view stream) const; - -template std::vector aggregate_reader_metadata::compute_data_page_mask< +template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< + cudf::column_view>(cudf::column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + +template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index d64d3eaf313..59c2e76c523 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -35,30 +35,22 @@ namespace cudf::io::parquet::experimental::detail { bool compute_has_page_index(cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices) + cudf::host_span const> row_group_indices) { - // For all output columns, check all parquet data sources + // For all parquet data sources return std::all_of( - column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { - // For all parquet data sources - return std::all_of( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto const src_index) { - // For all row groups in this parquet data source - auto const& rg_indices = row_group_indices[src_index]; - return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { - auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; - auto col = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); - // Check if the offset_index and column_index are present - return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and - col->offset_index.has_value() and col->column_index.has_value(); + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_index) { + // For all row groups in this parquet data source + auto const& rg_indices = row_group_indices[src_index]; + return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { + auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; + return std::any_of( + row_group.columns.begin(), row_group.columns.end(), [&](auto const& col) { + return 
col.offset_index.has_value() and col.column_index.has_value(); }); - }); + }); }); } diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index c84746e07ac..299639e4172 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -35,14 +35,12 @@ using metadata_base = parquet::detail::metadata; * * @param file_metadatas Span of parquet footer metadata * @param row_group_indices Span of input row group indices - * @param column_schema_indices Span of input column schema indices * @return Boolean indicating if the page index is present in all parquet data sources for all * columns */ [[nodiscard]] bool compute_has_page_index( cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices); + cudf::host_span const> row_group_indices); /** * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a From 0071dc738565f13c03d018adcd88240fae364388 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 21:09:34 +0000 Subject: [PATCH 13/25] CPU optimizations --- .../parquet/experimental/page_index_filter.cu | 74 +++++++++++-------- .../experimental/page_index_filter_utils.cu | 70 +++++++++++------- .../experimental/page_index_filter_utils.hpp | 8 +- 3 files changed, 88 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index c28731dc689..6a8534f5c41 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include +#include namespace cudf::io::parquet::experimental::detail { @@ -668,38 +668,48 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas size_type max_page_size = 0; - if (num_columns == 1) { - auto const schema_idx = column_schema_indices.front(); - size_type col_num_pages = 0; - std::tie(page_row_offsets, col_num_pages, max_page_size) = - compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); - // Add 1 to include the the 0th page's offset for each column - col_page_offsets.emplace_back(col_num_pages + 1); + if (num_columns <= 2) { + std::for_each( + column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { + auto [col_page_row_offsets, col_max_page_size] = + compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + col_page_offsets.emplace_back(page_row_offsets.size()); + }); } else { - std::vector, size_type, size_type>>> - page_row_offsets_tasks; - page_row_offsets_tasks.reserve(num_columns); - - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_columns), - [&](auto const col_idx) { - page_row_offsets_tasks.emplace_back( - cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - return compute_page_row_offsets( - per_file_metadata, row_group_indices, column_schema_indices[col_idx]); - })); - }); - - // Collect results from all tasks - std::for_each(page_row_offsets_tasks.begin(), 
page_row_offsets_tasks.end(), [&](auto& task) { - auto [col_page_row_offsets, col_num_pages, col_max_page_size] = std::move(task).get(); - page_row_offsets.insert(page_row_offsets.end(), - std::make_move_iterator(col_page_row_offsets.begin()), - std::make_move_iterator(col_page_row_offsets.end())); - max_page_size = std::max(max_page_size, col_max_page_size); - // Add 1 to include the the 0th page's offset for each column - col_page_offsets.emplace_back(col_page_offsets.back() + col_num_pages + 1); - }); + auto constexpr max_data_page_mask_threads = 2; + auto const num_threads = std::min(max_data_page_mask_threads, num_columns); + std::vector threads{}; + threads.reserve(num_threads); + std::vector, size_type>>> thread_results( + num_threads); + auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); + std::for_each( + thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { + threads.emplace_back([&, tid = tid]() { + auto const start_col = std::min(tid * cols_per_thread, num_columns); + auto const end_col = std::min(start_col + cols_per_thread, num_columns); + std::for_each(thrust::counting_iterator(start_col), + thrust::counting_iterator(end_col), + [&](auto const col_idx) { + thread_results[tid].emplace_back(compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx])); + }); + }); + }); + + std::for_each( + thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { + threads[tid].join(); + for (auto& [col_page_row_offsets, col_max_page_size] : thread_results[tid]) { + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + col_page_offsets.emplace_back(page_row_offsets.size()); + } + }); } auto const total_pages = page_row_offsets.size() - num_columns; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 59c2e76c523..aacbc860dee 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -34,6 +34,30 @@ namespace cudf::io::parquet::experimental::detail { +namespace { + +/** + * @brief Find the offset of the column chunk with the given schema index in the row group + * + * @param row_group Row group + * @param schema_idx Schema index + * @return Offset of the column chunk iterator + */ +[[nodiscard]] auto find_colchunk_iter_offset(RowGroup const& row_group, size_type schema_idx) +{ + auto const& colchunk_iter = + std::find_if(row_group.columns.begin(), row_group.columns.end(), [schema_idx](auto const& col) { + return col.schema_idx == schema_idx; + }); + CUDF_EXPECTS( + colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + " not found in row group", + std::invalid_argument); + return std::distance(row_group.columns.begin(), colchunk_iter); +} + +} // namespace + bool compute_has_page_index(cudf::host_span file_metadatas, cudf::host_span const> row_group_indices) { @@ -89,20 +113,16 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil thrust::counting_iterator(0), thrust::counting_iterator(row_group_indices.size()), [&](auto src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; // For all column chunks in this data source + auto const& rg_indices = row_group_indices[src_idx]; + std::optional 
colchunk_iter_offset{}; std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; - // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { - return col.schema_idx == schema_idx; - }); - - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); + if (not colchunk_iter_offset.has_value() or + row_group.columns[colchunk_iter_offset.value()].schema_idx != schema_idx) { + colchunk_iter_offset = find_colchunk_iter_offset(row_group, schema_idx); + } + auto const& colchunk_iter = row_group.columns.begin() + colchunk_iter_offset.value(); // Compute page row counts and offsets if this column chunk has column and offset indexes if (colchunk_iter->offset_index.has_value()) { @@ -136,11 +156,10 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; } -std::tuple, size_type, size_type> compute_page_row_offsets( +std::pair, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - cudf::size_type schema_idx, - cudf::size_type row_mask_offset) + cudf::size_type schema_idx) { // Compute total number of row groups auto const total_row_groups = @@ -150,29 +169,26 @@ std::tuple, size_type, size_type> compute_page_row_offset [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); std::vector page_row_offsets; - page_row_offsets.push_back(row_mask_offset); + page_row_offsets.push_back(0); size_type max_page_size = 0; - size_type num_pages = 0; std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(row_group_indices.size()), [&](auto const src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; // For all row groups in this source + auto const& rg_indices = row_group_indices[src_idx]; + std::optional colchunk_iter_offset{}; std::for_each(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_idx) { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](auto const& col) { return col.schema_idx == schema_idx; }); - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); + if (not colchunk_iter_offset.has_value() or + row_group.columns[colchunk_iter_offset.value()].schema_idx != schema_idx) { + colchunk_iter_offset = find_colchunk_iter_offset(row_group, schema_idx); + } + auto const& colchunk_iter = + row_group.columns.begin() + colchunk_iter_offset.value(); auto const& offset_index = colchunk_iter->offset_index.value(); auto const row_group_num_pages = offset_index.page_locations.size(); - num_pages += static_cast(row_group_num_pages); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(row_group_num_pages), [&](auto const page_idx) { @@ -189,7 +205,7 @@ std::tuple, size_type, size_type> compute_page_row_offset }); }); - return {std::move(page_row_offsets), num_pages, max_page_size}; + return {std::move(page_row_offsets), max_page_size}; } rmm::device_uvector 
make_page_indices_async( diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 299639e4172..42fdde40aaa 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -66,15 +66,13 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @param row_mask_offset Offset of the row mask - * @return Tuple of page row offsets, number of pages, and the size of the largest page in this + * @return Pair of page row offsets and the size of the largest page in this * column */ -[[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( +[[nodiscard]] std::pair, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx, - cudf::size_type row_mask_offset = 0); + size_type schema_idx); /** * @brief Make a device vector where each row contains the index of the page it belongs to From 06dfca49e176c31fb9911e65fc1dc536270b05cb Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 02:47:36 +0000 Subject: [PATCH 14/25] Bug fixing in fenwick tree search --- .../parquet/experimental/page_index_filter.cu | 192 +++++++++--------- .../experimental/page_index_filter_utils.cu | 22 +- .../experimental/page_index_filter_utils.hpp | 13 +- 3 files changed, 111 insertions(+), 116 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6a8534f5c41..433b96d8b94 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -402,7 +402,7 @@ struct page_stats_caster : public stats_caster_base { * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. * Cooperative Groups is used here for robust global thread ID calculation. */ -struct compute_next_level_functor { +struct build_fenwick_tree_level_functor { bool** const level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; @@ -414,11 +414,9 @@ struct compute_next_level_functor { auto next_level_ptr = level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { - // The last element is carried forward (ORed with false) + if (current_level_size % 2 and next_level_index == next_level_size - 1) { next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; } else { - // Perform the logical OR reduction and write to the next level's location next_level_ptr[next_level_index] = current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; } @@ -435,89 +433,75 @@ struct compute_next_level_functor { * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in * the range). 
*/ -struct probe_masks_functor { +struct search_fenwick_tree_functor { bool** const level_ptrs; cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; __device__ bool operator()(cudf::size_type range_idx) const noexcept { - // Retrieve M and N for the current range [M, N) + // Retrieve start and end for the current range [start, end) size_type M = page_offsets[range_idx]; size_type N = page_offsets[range_idx + 1]; - // If the range is empty or invalid, terminate - if (M >= N) { return false; } + // Return early if the range is empty or invalid + if (M >= N or range_idx >= num_ranges) { return false; } // Binary Decomposition Loop while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; - - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; - - // --- Decision Logic: Which side to consume? --- - - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; - - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; - - // Block 3: The remaining central range block - - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) - - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; - - // Calculate mask index: The starting point M is divided by the block size. - size_t mask_idx = M / block1_size; - - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - return true; // Found a set bit, terminate for this range - } - - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position - - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; - - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; - - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - return true; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). + // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. + // Block size is determined by the Least Significant Bit (LSB) of M. + // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. + // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position + // (k). 
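+      // For example, [M, N) = [88, 96): __ffs(88) = 4, so the block size is
+      // 2^3 = 8 and the whole range is answered by one level-3 mask entry at
+      // index 88 / 8 = 11 (level k stores the OR of blocks of 2^k rows). An
+      // unaligned range such as [11, 13) instead decomposes into two level-0
+      // probes at indices 11 and 12.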
+ size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) + size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k + + // 2. N Alignment: Find the largest power-of-two block that aligns N down. + // N & -N gives the LSB block size *if* N were the start, but we use it as the largest + // possible size that evenly divides N. + size_t n_block_size = N & -N; + + // The largest block size we can consume from the current range [M, N) + size_t max_block_size = 0; + size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) + size_t mask_index = 0; + + // --- Core Decomposition Logic --- + + // Check the M side alignment block: [M, M + m_block_size) + // This is only valid if M + m_block_size <= N (the block fits). + if (M > 0 && M + m_block_size <= N) { + max_block_size = m_block_size; + mask_level = m_lsb_position - 1; + mask_index = M >> mask_level; // M / 2^k + M += max_block_size; + } - // Prioritize M (1-row check) or N (1-row check) until they meet. + // Check the N side alignment block: [N - n_block_size, N) + // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M + // block. + else if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { + // If the N block is larger or we are at the end, prioritize the N block + max_block_size = n_block_size; + mask_level = __ffs(n_block_size) - 1; + mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k + N -= max_block_size; + } - // Check single row at M (Level 0) - if (level_ptrs[0][M]) { return true; } + // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) + // If max_block_size is 0 or too large, reduce by 1 row (Level 0) + else if (max_block_size == 0 || max_block_size > N - M) { + max_block_size = 1; + mask_level = 0; + mask_index = M; // Level 0 index is just M M++; } + + // --- Query Mask and Advance --- + + // Look up the mask value at the determined level and index + if (level_ptrs[mask_level][mask_index]) { return true; } } return false; } @@ -643,14 +627,6 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas "Mismatch in total rows in input row mask and row groups", std::invalid_argument); - auto const num_columns = input_columns.size(); - - // Collect column schema indices from the input columns. - auto column_schema_indices = std::vector(input_columns.size()); - std::transform( - input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { - return col.schema_idx; - }); auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return early if page index is not present @@ -660,7 +636,15 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas 0, stream); // An empty data page mask indicates all pages are required } + // Collect column schema indices from the input columns. 
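+  // (Page row offsets are computed per column below and concatenated. For
+  // example, two columns over rows [0, 250) with pages [0, 100), [100, 250)
+  // and [0, 120), [120, 250) give page_row_offsets = {0, 100, 250, 0, 120, 250}
+  // and total_pages = 6 - 2 = 4; the empty (250, 0) range at the column seam
+  // is rejected by the search functor (start >= end) and discarded when the
+  // results are copied out.)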
+ auto column_schema_indices = std::vector(input_columns.size()); + std::transform( + input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { + return col.schema_idx; + }); + // Compute page row offsets and column chunk page offsets for each column + auto const num_columns = input_columns.size(); std::vector page_row_offsets; std::vector col_page_offsets; col_page_offsets.reserve(num_columns + 1); @@ -731,45 +715,57 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas } auto const mr = cudf::get_current_device_resource_ref(); - auto const [level_offsets, total_levels_size] = - compute_row_mask_levels(total_rows, max_page_size); - auto const num_levels = static_cast(level_offsets.size()); - auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); + // Compute fenwick tree level offsets and total size (level 1 and higher) + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); + auto const num_levels = static_cast(tree_level_offsets.size()); + // Buffer to store Fenwick tree levels (level 1 and higher) data + auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); - auto host_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); - host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; + // Pointers to each Fenwick tree level data + auto host_tree_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); + // Zeroth level is just the row mask itself + host_tree_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; std::for_each( thrust::counting_iterator(1), thrust::counting_iterator(num_levels), [&](auto const level_idx) { - host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; + host_tree_level_ptrs[level_idx] = tree_levels_data.data() + tree_level_offsets[level_idx - 1]; }); - auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); + auto fenwick_tree_level_ptrs = + cudf::detail::make_device_uvector_async(host_tree_level_ptrs, stream, mr); + + // Build Fenwick tree levels auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(next_level_size), - compute_next_level_functor{ - device_level_ptrs.data(), level, current_level_size, next_level_size}); + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + build_fenwick_tree_level_functor{ + fenwick_tree_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); + // Search the Fenwick tree to see if there's a surviving row in each page's row range auto const num_ranges = static_cast(page_row_offsets.size() - 1); rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_ranges), - device_data_page_mask.begin(), - probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); - + thrust::transform( + rmm::exec_policy_nosync(stream), 
+ thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + device_data_page_mask.begin(), + search_fenwick_tree_functor{fenwick_tree_level_ptrs.data(), page_offsets.data(), num_ranges}); + + // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); stream.synchronize(); + // Discard results for invalid ranges. i.e. ranges starting at the last page of a column and + // ending at the first page of the next column std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index aacbc860dee..be16cd46d09 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -242,27 +242,25 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::pair, size_type> compute_row_mask_levels(cudf::size_type num_rows, - cudf::size_type max_page_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, + cudf::size_type max_page_size) { - std::vector level_offsets; - level_offsets.push_back(0); + std::vector tree_level_offsets; + tree_level_offsets.push_back(0); - size_t current_size = cudf::util::div_rounding_up_unsafe(num_rows, 2); - size_t current_level = 1; + cudf::size_type current_level_size = cudf::util::div_rounding_up_unsafe(level0_size, 2); + cudf::size_type current_level = 1; - while (current_size > 0) { + while (current_level_size > 0) { size_t block_size = size_t{1} << current_level; - - level_offsets.push_back(current_size); - current_size += num_rows; + tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - current_size = cudf::util::div_rounding_up_unsafe(current_size, 2); + current_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); current_level++; } - return {std::move(level_offsets), current_size}; + return tree_level_offsets; } } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 42fdde40aaa..cd0ef0f06b7 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,13 +90,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Compute the levels of the row mask + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree + * level's block size is greater than the maximum page size (tree search range) * - * @param num_rows Number of rows in the row mask - * @param max_page_size Maximum page size - * @return Pair of level offsets and total levels size + * @param level0_size Size of the zeroth tree level (the row mask) + * @param max_page_size Max page (tree search range) size + * @return Fenwick tree level offsets */ -[[nodiscard]] std::pair, size_type> compute_row_mask_levels( - cudf::size_type num_rows, cudf::size_type max_page_size); +[[nodiscard]] std::vector compute_fenwick_tree_level_offsets( + cudf::size_type level0_size, cudf::size_type max_page_size); } // 
namespace cudf::io::parquet::experimental::detail From 8ac9c3a20f6ca98ab3ef1503b22b042231476442 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 05:50:27 +0000 Subject: [PATCH 15/25] Bug fixing in fenwick tree --- .../parquet/experimental/page_index_filter.cu | 139 +++++++++++------- .../experimental/page_index_filter_utils.cu | 10 +- .../experimental/page_index_filter_utils.hpp | 6 +- 3 files changed, 88 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 433b96d8b94..00676b1f56f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -434,74 +434,101 @@ struct build_fenwick_tree_level_functor { * the range). */ struct search_fenwick_tree_functor { - bool** const level_ptrs; + bool** const tree_level_ptrs; cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; + /** + * @brief Checks if a value is a power of two + * + * @param value Value to check + * @return Boolean indicating if the value is a power of two + */ + __device__ bool constexpr is_power_of_two(size_type value) const noexcept + { + return (value & (value - 1)) == 0; + } + + /** + * @brief Finds the largest power of two in the range (start, end]; 0 otherwise + * + * @param start Range start + * @param end Range end + * @return Largest power of two in the range (start, end]; 0 otherwise + */ + __device__ size_type constexpr largest_power_of_two_in_range(size_type start, + size_type end) const noexcept + { + auto constexpr nbits = cudf::detail::size_in_bits() - 1; + auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); + return result > start ? result : 0; + } + + /** + * @brief Searches the Fenwick tree to find a `true` value in range [start, end) + * + * @param range_idx Index of the range to search + * @return Boolean indicating if a `true` value is found in the range + */ __device__ bool operator()(cudf::size_type range_idx) const noexcept { // Retrieve start and end for the current range [start, end) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; + size_type start = page_offsets[range_idx]; + size_type end = page_offsets[range_idx + 1]; // Return early if the range is empty or invalid - if (M >= N or range_idx >= num_ranges) { return false; } - - // Binary Decomposition Loop - while (M < N) { - // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. - // Block size is determined by the Least Significant Bit (LSB) of M. - // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. - // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position - // (k). - size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) - size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k - - // 2. N Alignment: Find the largest power-of-two block that aligns N down. - // N & -N gives the LSB block size *if* N were the start, but we use it as the largest - // possible size that evenly divides N. - size_t n_block_size = N & -N; - - // The largest block size we can consume from the current range [M, N) - size_t max_block_size = 0; - size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) 
- size_t mask_index = 0; - - // --- Core Decomposition Logic --- - - // Check the M side alignment block: [M, M + m_block_size) - // This is only valid if M + m_block_size <= N (the block fits). - if (M > 0 && M + m_block_size <= N) { - max_block_size = m_block_size; - mask_level = m_lsb_position - 1; - mask_index = M >> mask_level; // M / 2^k - M += max_block_size; + if (start >= end or range_idx >= num_ranges) { return false; } + + // Binary search decomposition loop + while (start < end) { + // Base case: start is zero or start and/or end are power(s) of two + if (start == 0 or is_power_of_two(start)) { + auto const block_size = largest_power_of_two_in_range(start, end); + if (block_size) { + auto const mask_level = cuda::std::countr_zero(block_size); + auto const mask_index = start >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + start += block_size; + } + } else if (is_power_of_two(end)) { + auto const block_size = largest_power_of_two_in_range(start, end); + if (block_size) { + auto const mask_level = cuda::std::countr_zero(block_size); + auto const mask_index = (end - block_size) >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + end -= block_size; + } } - // Check the N side alignment block: [N - n_block_size, N) - // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M - // block. - else if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { - // If the N block is larger or we are at the end, prioritize the N block - max_block_size = n_block_size; - mask_level = __ffs(n_block_size) - 1; - mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k - N -= max_block_size; - } + // Return early if start >= end already + if (start >= end) { return false; } - // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) - // If max_block_size is 0 or too large, reduce by 1 row (Level 0) - else if (max_block_size == 0 || max_block_size > N - M) { - max_block_size = 1; - mask_level = 0; - mask_index = M; // Level 0 index is just M - M++; - } + // Find the largest power-of-two block that begins and `start` and aligns it up + size_type const start_mask_level = cuda::std::countr_zero(start); + size_type const start_block_size = size_t{1} << (start_mask_level); - // --- Query Mask and Advance --- + // Find the largest power-of-two block that aligns `end` down. + size_type const end_block_size = end & -end; - // Look up the mask value at the determined level and index - if (level_ptrs[mask_level][mask_index]) { return true; } + // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger block + if (start + start_block_size <= end and start_block_size >= end_block_size) { + auto const mask_level = start_mask_level; + auto const mask_index = start >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + start += start_block_size; + } + // Otherwise, check the `end` side alignment block: [end - end_block_size, end) + else if (end - end_block_size >= start) { + auto const mask_level = cuda::std::countr_zero(end_block_size); + auto const mask_index = (end - end_block_size) >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + end -= end_block_size; + } + // Fallback for small, unaligned ranges. 
e.g., [11, 13) + else { + if (tree_level_ptrs[0][start]) { return true; } + start++; + } } return false; } @@ -717,7 +744,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto const mr = cudf::get_current_device_resource_ref(); // Compute fenwick tree level offsets and total size (level 1 and higher) - auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows); auto const num_levels = static_cast(tree_level_offsets.size()); // Buffer to store Fenwick tree levels (level 1 and higher) data auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index be16cd46d09..75be66d7865 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -242,8 +242,7 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, - cudf::size_type max_page_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size) { std::vector tree_level_offsets; tree_level_offsets.push_back(0); @@ -252,12 +251,9 @@ std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0 cudf::size_type current_level = 1; while (current_level_size > 0) { - size_t block_size = size_t{1} << current_level; tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); - - if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - - current_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); + current_level_size = + current_level_size == 1 ? 
0 : cudf::util::div_rounding_up_unsafe(current_level_size, 2); current_level++; } return tree_level_offsets; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index cd0ef0f06b7..f3f51e3768f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,14 +90,12 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree - * level's block size is greater than the maximum page size (tree search range) + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) * * @param level0_size Size of the zeroth tree level (the row mask) - * @param max_page_size Max page (tree search range) size * @return Fenwick tree level offsets */ [[nodiscard]] std::vector compute_fenwick_tree_level_offsets( - cudf::size_type level0_size, cudf::size_type max_page_size); + cudf::size_type level0_size); } // namespace cudf::io::parquet::experimental::detail From e5e12ecb4cb87c31a84967ad367a2f7f30bbc6a0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 06:12:18 +0000 Subject: [PATCH 16/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 52 +++++-------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 00676b1f56f..98e1781da47 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -438,17 +438,6 @@ struct search_fenwick_tree_functor { cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; - /** - * @brief Checks if a value is a power of two - * - * @param value Value to check - * @return Boolean indicating if the value is a power of two - */ - __device__ bool constexpr is_power_of_two(size_type value) const noexcept - { - return (value & (value - 1)) == 0; - } - /** * @brief Finds the largest power of two in the range (start, end]; 0 otherwise * @@ -456,8 +445,8 @@ struct search_fenwick_tree_functor { * @param end Range end * @return Largest power of two in the range (start, end]; 0 otherwise */ - __device__ size_type constexpr largest_power_of_two_in_range(size_type start, - size_type end) const noexcept + __device__ size_type inline constexpr largest_power_of_two_in_range(size_type start, + size_type end) const noexcept { auto constexpr nbits = cudf::detail::size_in_bits() - 1; auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); @@ -481,36 +470,23 @@ struct search_fenwick_tree_functor { // Binary search decomposition loop while (start < end) { - // Base case: start is zero or start and/or end are power(s) of two - if (start == 0 or is_power_of_two(start)) { - auto const block_size = largest_power_of_two_in_range(start, end); - if (block_size) { - auto const mask_level = cuda::std::countr_zero(block_size); - auto const mask_index = start >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } - start += block_size; - } - } else if (is_power_of_two(end)) { - auto const block_size = largest_power_of_two_in_range(start, end); - if (block_size) { + // Find the largest power-of-two block that begins and `start` and aligns it up + auto const 
[start_mask_level, start_block_size] = [&]() { + if (start == 0) { + auto const block_size = largest_power_of_two_in_range(start, end); auto const mask_level = cuda::std::countr_zero(block_size); - auto const mask_index = (end - block_size) >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } - end -= block_size; + return cuda::std::pair{mask_level, block_size}; + } else { + auto const mask_level = cuda::std::countr_zero(start); + return cuda::std::pair{mask_level, size_type{1} << mask_level}; } - } - - // Return early if start >= end already - if (start >= end) { return false; } - - // Find the largest power-of-two block that begins and `start` and aligns it up - size_type const start_mask_level = cuda::std::countr_zero(start); - size_type const start_block_size = size_t{1} << (start_mask_level); + }(); // Find the largest power-of-two block that aligns `end` down. - size_type const end_block_size = end & -end; + auto const end_block_size = end & -end; - // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger block + // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger + // block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const mask_level = start_mask_level; auto const mask_index = start >> mask_level; From 3d056ed4bed561cba34cff1723438efa93f7ddfa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 06:20:34 +0000 Subject: [PATCH 17/25] Documentation and style --- .../parquet/experimental/page_index_filter.cu | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 98e1781da47..ed77605f9ab 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -397,41 +397,47 @@ struct page_stats_caster : public stats_caster_base { }; /** - * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. - * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. - * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. - * Cooperative Groups is used here for robust global thread ID calculation. + * @brief Functor to build the NEXT Fenwick tree level from the current level data + * + * @param tree_level_ptrs Pointers to the start of Fenwick tree level data + * @param current_level Current tree level + * @param current_level_size Size of the current tree level + * @param next_level_size Size of the next tree level */ -struct build_fenwick_tree_level_functor { - bool** const level_ptrs; +struct build_next_fenwick_tree_level_functor { + bool** const tree_level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; cudf::size_type const next_level_size; - __device__ void operator()(cudf::size_type next_level_index) const noexcept + /** + * @brief Builds the next Fenwick tree level from the current level data + * by ORing the two children of the current level. 
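+   * For example, a current level of {1, 0, 0, 1, 0} (size 5) yields the next
+   * level {1, 1, 0} (size 3): (1|0) = 1, (0|1) = 1, and the odd trailing 0 is
+   * carried forward unchanged.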
+ * + * @param next_level_idx Next tree level element index + */ + __device__ void operator()(cudf::size_type next_level_idx) const noexcept { - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; + auto const current_level_ptr = tree_level_ptrs[current_level]; + auto next_level_ptr = tree_level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 and next_level_index == next_level_size - 1) { - next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; + if (current_level_size % 2 and next_level_idx == next_level_size - 1) { + next_level_ptr[next_level_idx] = current_level_ptr[current_level_size - 1]; } else { - next_level_ptr[next_level_index] = - current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; + next_level_ptr[next_level_idx] = + current_level_ptr[(next_level_idx * 2)] or current_level_ptr[(next_level_idx * 2) + 1]; } } }; /** - * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. - * One thread handles the binary decomposition and query for one range [M, N). - * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of - * Level k mask. - * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). - * @param num_ranges The number of ranges to process. - * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in - * the range). + * @brief Functor to binary search a `true` value in the Fenwick tree in range [start, end) + * + * @param tree_level_ptrs Pointers to the start of Fenwick tree level data + * @param page_offsets Pointer to page offsets describing each search range i as [page_offsets[i], + * page_offsets[i+1)) + * @param num_ranges Number of search ranges */ struct search_fenwick_tree_functor { bool** const tree_level_ptrs; @@ -737,7 +743,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto fenwick_tree_level_ptrs = cudf::detail::make_device_uvector_async(host_tree_level_ptrs, stream, mr); - // Build Fenwick tree levels + // Build Fenwick tree levels (zeroth level is just the row mask itself) auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { @@ -746,7 +752,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), thrust::counting_iterator(next_level_size), - build_fenwick_tree_level_functor{ + build_next_fenwick_tree_level_functor{ fenwick_tree_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); From 55948f09253a60651e6f441f0fd3abb88d0d2b55 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 07:14:22 +0000 Subject: [PATCH 18/25] Small optimization --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- cpp/src/io/parquet/experimental/page_index_filter_utils.cu | 6 ++++-- cpp/src/io/parquet/experimental/page_index_filter_utils.hpp | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index ed77605f9ab..6c5a9425754 100644 --- 
a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -709,7 +709,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Make sure all row_mask elements contain valid values even if they are nulls if constexpr (cuda::std::is_same_v) { - if (row_mask.nullable()) { + if (row_mask.nullable() and row_mask.null_count() > 0) { thrust::for_each(rmm::exec_policy_nosync(stream), thrust::counting_iterator(row_mask_offset), thrust::counting_iterator(row_mask_offset + total_rows), @@ -726,7 +726,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto const mr = cudf::get_current_device_resource_ref(); // Compute fenwick tree level offsets and total size (level 1 and higher) - auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows); + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); auto const num_levels = static_cast(tree_level_offsets.size()); // Buffer to store Fenwick tree levels (level 1 and higher) data auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 75be66d7865..8750449fc24 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -1,4 +1,3 @@ - /* * Copyright (c) 2025, NVIDIA CORPORATION. * @@ -242,7 +241,8 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, + cudf::size_type max_page_size) { std::vector tree_level_offsets; tree_level_offsets.push_back(0); @@ -251,6 +251,8 @@ std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0 cudf::size_type current_level = 1; while (current_level_size > 0) { + auto const block_size = 1 << current_level; + if (std::cmp_greater(block_size, max_page_size)) { break; } tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); current_level_size = current_level_size == 1 ? 
0 : cudf::util::div_rounding_up_unsafe(current_level_size, 2); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index f3f51e3768f..adb2b3b1a0f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,12 +90,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree level + * block size becomes larger than the maximum page (search range) size * * @param level0_size Size of the zeroth tree level (the row mask) + * @param max_page_size Maximum page (search range) size * @return Fenwick tree level offsets */ [[nodiscard]] std::vector compute_fenwick_tree_level_offsets( - cudf::size_type level0_size); + cudf::size_type level0_size, cudf::size_type max_page_size); } // namespace cudf::io::parquet::experimental::detail From a2221a55ffd75b983e21404f804aada0fc8af845 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 09:46:35 +0000 Subject: [PATCH 19/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 97 +++++++++++++++---- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6c5a9425754..d762c9bed3a 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -444,6 +444,45 @@ struct search_fenwick_tree_functor { cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; + /** + * @brief Enum class to represent which range boundary to align + */ + enum class alignment : uint8_t { + START = 0, + END = 1, + }; + + /** + * @brief Checks if a value is a power of two + * + * @param value Value to check + * @return Boolean indicating if the value is a power of two + */ + __device__ bool inline constexpr is_power_of_two(cudf::size_type value) const noexcept + { + return (value & (value - 1)) == 0; + } + + /** + * @brief Finds the smallest power of two in the range [start, end); 0 otherwise + * + * @param start Range start + * @param end Range end + * @return Smallest power of two in the range [start, end); 0 otherwise + */ + __device__ cudf::size_type inline constexpr smallest_power_of_two_in_range( + cudf::size_type start, cudf::size_type end) const noexcept + { + start--; + start |= start >> 1; + start |= start >> 2; + start |= start >> 4; + start |= start >> 8; + start |= start >> 16; + auto const result = start + 1; + return result < end ? result : 0; + } + /** * @brief Finds the largest power of two in the range (start, end]; 0 otherwise * * @param start Range start * @param end Range end * @return Largest power of two in the range (start, end]; 0 otherwise */ - __device__ size_type inline constexpr largest_power_of_two_in_range(size_type start, - size_type end) const noexcept + __device__ cudf::size_type inline constexpr largest_power_of_two_in_range( + cudf::size_type start, cudf::size_type end) const noexcept { auto constexpr nbits = cudf::detail::size_in_bits() - 1; auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); return result > start ? 
result : 0; } + /** + * @brief Aligns a range boundary to the next power-of-two block + * + * @tparam Alignment The bounday to align + * @param start Range start + * @param end Range end + * @return A pair of the tree level and block size + */ + template + __device__ + cuda::std::pair inline constexpr align_range_boundary( + cudf::size_type start, cudf::size_type end) const noexcept + { + if constexpr (Alignment == alignment::START) { + if (start == 0 or is_power_of_two(start)) { + auto const block_size = largest_power_of_two_in_range(start, end); + auto const tree_level = cuda::std::countr_zero(block_size); + return cuda::std::pair{tree_level, block_size}; + } else { + auto const tree_level = cuda::std::countr_zero(start); + return cuda::std::pair{tree_level, size_type{1} << tree_level}; + } + } else { + auto block_size = end & -end; + if (start > 0 and is_power_of_two(end)) { + block_size = end - smallest_power_of_two_in_range(start, end); + } + return cuda::std::pair{cuda::std::countr_zero(block_size), block_size}; + } + } + /** * @brief Searches the Fenwick tree to find a `true` value in range [start, end) * @@ -477,33 +547,26 @@ struct search_fenwick_tree_functor { // Binary search decomposition loop while (start < end) { // Find the largest power-of-two block that begins at `start` and aligns it up - auto const [start_mask_level, start_block_size] = [&]() { - if (start == 0) { - auto const block_size = largest_power_of_two_in_range(start, end); - auto const mask_level = cuda::std::countr_zero(block_size); - return cuda::std::pair{mask_level, block_size}; - } else { - auto const mask_level = cuda::std::countr_zero(start); - return cuda::std::pair{mask_level, size_type{1} << mask_level}; - } - }(); + auto const [start_tree_level, start_block_size] = + align_range_boundary(start, end); // Find the largest power-of-two block that aligns `end` down. - auto const end_block_size = end & -end; + auto const [end_tree_level, end_block_size] = + align_range_boundary(start, end); // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger // block if (start + start_block_size <= end and start_block_size >= end_block_size) { - auto const mask_level = start_mask_level; - auto const mask_index = start >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } + auto const tree_level = start_tree_level; + auto const mask_index = start >> tree_level; + if (tree_level_ptrs[tree_level][mask_index]) { return true; } start += start_block_size; } // Otherwise, check the `end` side alignment block: [end - end_block_size, end) else if (end - end_block_size >= start) { - auto const mask_level = cuda::std::countr_zero(end_block_size); - auto const mask_index = (end - end_block_size) >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } + auto const tree_level = end_tree_level; + auto const mask_index = (end - end_block_size) >> tree_level; + if (tree_level_ptrs[tree_level][mask_index]) { return true; } end -= end_block_size; } // Fallback for small, unaligned ranges.
e.g., [11, 13) From fc6c5347912da9530f661a37054e90648489ecf8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:13:45 +0000 Subject: [PATCH 20/25] Style check --- cpp/examples/hybrid_scan_io/common_utils.cpp | 21 ++++++++++++++----- .../parquet/experimental/page_index_filter.cu | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cpp/examples/hybrid_scan_io/common_utils.cpp b/cpp/examples/hybrid_scan_io/common_utils.cpp index 0eac78d18da..96611dd961e 100644 --- a/cpp/examples/hybrid_scan_io/common_utils.cpp +++ b/cpp/examples/hybrid_scan_io/common_utils.cpp @@ -81,13 +81,24 @@ void check_tables_equal(cudf::table_view const& lhs_table, cudf::filtered_join join_obj( lhs_table, cudf::null_equality::EQUAL, cudf::set_as_build_table::RIGHT, stream); auto const indices = join_obj.anti_join(rhs_table, stream); - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Tables identical: " << std::boolalpha << valid << "\n\n"; + auto const tables_equal = indices->size() == 0; + if (tables_equal) { + std::cout << "Tables identical: " << std::boolalpha << tables_equal << "\n\n"; + } else { + // Helper to write parquet data for inspection + auto const write_parquet = + [](cudf::table_view table, std::string filepath, rmm::cuda_stream_view stream) { + auto sink_info = cudf::io::sink_info(filepath); + auto opts = cudf::io::parquet_writer_options::builder(sink_info, table).build(); + cudf::io::write_parquet(opts, stream); + }; + write_parquet(lhs_table, "lhs_table.parquet", stream); + write_parquet(rhs_table, "rhs_table.parquet", stream); + throw std::logic_error("Tables identical: false\n\n"); + } } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error("Tables identical: false\n\n"); + std::cout << e.what() << std::endl; } } diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index d762c9bed3a..81a2d5a3c69 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -501,7 +501,7 @@ struct search_fenwick_tree_functor { /** * @brief Aligns a range boundary to the next power-of-two block * - * @tparam Alignment The bounday to align + * @tparam Alignment The boundary (start or end) to align * @param start Range start * @param end Range end * @return A pair of the tree level and block size From 488b1de6c61335237d5cff69a6be8a99ce6605d2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:35:10 +0000 Subject: [PATCH 21/25] Use host worker pool instead of c++ threads --- .../parquet/experimental/page_index_filter.cu | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 81a2d5a3c69..07deb58d834 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,6 @@ #include #include -#include namespace cudf::io::parquet::experimental::detail { @@ -405,10 +405,10 @@ struct page_stats_caster : public stats_caster_base { * @param next_level_size Size of the next tree level */ struct build_next_fenwick_tree_level_functor { - bool** const 
tree_level_ptrs; - cudf::size_type const current_level; - cudf::size_type const current_level_size; - cudf::size_type const next_level_size; + bool** tree_level_ptrs; + cudf::size_type current_level; + cudf::size_type current_level_size; + cudf::size_type next_level_size; /** * @brief Builds the next Fenwick tree level from the current level data @@ -440,9 +440,9 @@ struct build_next_fenwick_tree_level_functor { * @param num_ranges Number of search ranges */ struct search_fenwick_tree_functor { - bool** const tree_level_ptrs; - cudf::size_type const* const page_offsets; - cudf::size_type const num_ranges; + bool** tree_level_ptrs; + cudf::size_type const* page_offsets; + cudf::size_type num_ranges; /** * @brief Enum class to represent which range boundary to align @@ -735,41 +735,45 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas col_page_offsets.emplace_back(page_row_offsets.size()); }); } else { - auto constexpr max_data_page_mask_threads = 2; - auto const num_threads = std::min(max_data_page_mask_threads, num_columns); - std::vector threads{}; - threads.reserve(num_threads); - std::vector, size_type>>> thread_results( - num_threads); + auto constexpr num_threads = 2; + std::vector, size_type>>>> + page_row_offset_tasks{}; + page_row_offset_tasks.reserve(num_threads); auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); - std::for_each( - thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { - threads.emplace_back([&, tid = tid]() { + + // Submit page row offset compute tasks + std::transform( + thrust::counting_iterator(0), + thrust::counting_iterator(num_threads), + std::back_inserter(page_row_offset_tasks), + [&](auto const tid) { + return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { auto const start_col = std::min(tid * cols_per_thread, num_columns); auto const end_col = std::min(start_col + cols_per_thread, num_columns); - std::for_each(thrust::counting_iterator(start_col), - thrust::counting_iterator(end_col), - [&](auto const col_idx) { - thread_results[tid].emplace_back(compute_page_row_offsets( - per_file_metadata, row_group_indices, column_schema_indices[col_idx])); - }); + std::vector, size_type>> thread_page_row_offsets{}; + thread_page_row_offsets.reserve(end_col - start_col); + std::transform(thrust::counting_iterator(start_col), + thrust::counting_iterator(end_col), + std::back_inserter(thread_page_row_offsets), + [&](auto const col_idx) { + return compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx]); + }); + return thread_page_row_offsets; }); }); - std::for_each( - thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { - threads[tid].join(); - for (auto& [col_page_row_offsets, col_max_page_size] : thread_results[tid]) { - page_row_offsets.insert( - page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); - max_page_size = std::max(max_page_size, col_max_page_size); - col_page_offsets.emplace_back(page_row_offsets.size()); - } - }); + std::for_each(page_row_offset_tasks.begin(), page_row_offset_tasks.end(), [&](auto& task) { + auto const& thread_page_row_offsets = task.get(); + for (auto& [col_page_row_offsets, col_max_page_size] : thread_page_row_offsets) { + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + 
col_page_offsets.emplace_back(page_row_offsets.size()); + } + }); } - auto const total_pages = page_row_offsets.size() - num_columns; - // Make sure all row_mask elements contain valid values even if they are nulls if constexpr (cuda::std::is_same_v) { if (row_mask.nullable() and row_mask.null_count() > 0) { @@ -833,6 +837,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); + auto const total_pages = page_row_offsets.size() - num_columns; auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); stream.synchronize(); From 33489df7e0932a12b41b31cfbbc7e351f1140f0f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:41:10 -0700 Subject: [PATCH 22/25] Apply suggestion from @mhaseeb123 --- cpp/src/io/parquet/experimental/page_index_filter.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 07deb58d834..3676dedc0d9 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -554,7 +554,7 @@ struct search_fenwick_tree_functor { auto const [end_tree_level, end_block_size] = align_range_boundary(start, end); - // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger + // Check the `start` side alignment block: [start, start + start_block_size) and if it's the larger // block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const tree_level = start_tree_level; From ffacb9355ddb41f6c8cecb51b11116b544c27d5e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:41:32 +0000 Subject: [PATCH 23/25] style pls --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 3676dedc0d9..9eccbcbc1fe 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -554,8 +554,8 @@ struct search_fenwick_tree_functor { auto const [end_tree_level, end_block_size] = align_range_boundary(start, end); - // Check the `start` side alignment block: [start, start + start_block_size) and if it's the larger - // block + // Check the `start` side alignment block: [start, start + start_block_size) and if it's the + // larger block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const tree_level = start_tree_level; auto const mask_index = start >> tree_level; From f9e6848f953616f64cd84dae82ca902ab2baa4f1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 29 Oct 2025 01:26:58 +0000 Subject: [PATCH 24/25] Update headers to SPDX --- .../experimental/page_index_filter_utils.cu | 15 ++------------- .../experimental/page_index_filter_utils.hpp | 15 ++------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 8750449fc24..ff4c4d2ae58 100644 --- 
a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include "page_index_filter_utils.hpp" diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index adb2b3b1a0f..873cdfd8ec7 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #pragma once From bc14621c1d5314d15e270e5f51bbfd33edb48285 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Nov 2025 03:06:00 +0000 Subject: [PATCH 25/25] Address reviews --- .../experimental/hybrid_scan_helpers.hpp | 2 +- .../parquet/experimental/hybrid_scan_impl.cpp | 8 +-- .../parquet/experimental/page_index_filter.cu | 68 ++++++++++--------- .../experimental/page_index_filter_utils.cu | 2 +- .../experimental/page_index_filter_utils.hpp | 9 +-- cpp/src/io/parquet/reader_impl.cpp | 2 - cpp/src/io/parquet/reader_impl.hpp | 6 +- 7 files changed, 50 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 4eefa293449..d25bd70964b 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -283,7 +283,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * the output table based on the input row mask across all input columns */ template - [[nodiscard]] cudf::detail::host_vector compute_data_page_mask( + [[nodiscard]] thrust::host_vector compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 8e06535b9f0..5a919ae2a0f 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -444,7 +444,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : cudf::detail::make_empty_host_vector(0, stream); + : thrust::host_vector(0); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -474,7 +474,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : cudf::detail::make_empty_host_vector(0, stream); + : thrust::host_vector(0); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -513,7 +513,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : cudf::detail::make_empty_host_vector(0, _stream); + : thrust::host_vector(0); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -564,7 +564,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : cudf::detail::make_empty_host_vector(0, _stream); + : thrust::host_vector(0); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 51b15cf69a4..3d142475eeb 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -381,7 +381,7 @@ struct page_stats_caster : public stats_caster_base { // Construct a row indices mapping based on page row counts and offsets auto const page_indices = - make_page_indices_async(page_row_counts, page_row_offsets, total_rows, stream); + compute_page_indices_async(page_row_counts, page_row_offsets, total_rows, stream); // For non-strings columns, directly gather the page-level column data and bitmask to the // row-level. @@ -786,7 +786,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag } template -cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask( +thrust::host_vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, @@ -806,7 +806,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas row_mask.template begin() + row_mask_offset, row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return cudf::detail::make_empty_host_vector(0, stream); + return thrust::host_vector(0); } CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(), @@ -818,8 +818,8 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Return early if page index is not present if (not has_page_index) { CUDF_LOG_WARN("Encountered missing Parquet page index for one or more output columns"); - return cudf::detail::make_empty_host_vector( - 0, stream); // An empty data page mask indicates all pages are required + return thrust::host_vector( + 0); // An empty data page mask indicates all pages are required } // Collect column schema indices from the input columns.
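For context on the hunk below: it fans the per-column page-row-offset computation out over cudf::detail::host_worker_pool() and joins the resulting futures in submission order, which keeps the concatenated offsets column-ordered. A minimal host-only sketch of the same submit/get pattern, assuming only that the pool behind host_worker_pool() exposes a submit_task returning a std::future (the names fan_out and compute_one are illustrative, not part of the patch):

#include <algorithm>
#include <future>
#include <vector>

// Fan `num_columns` work items out across a fixed number of pool tasks,
// then join the futures in submission order so results stay column-ordered.
template <typename Pool, typename F>
std::vector<int> fan_out(Pool& pool, int num_columns, F compute_one)
{
  int constexpr max_tasks = 2;
  int const cols_per_task = (num_columns + max_tasks - 1) / max_tasks;
  std::vector<std::future<std::vector<int>>> tasks;
  tasks.reserve(max_tasks);
  for (int tid = 0; tid < max_tasks; ++tid) {
    tasks.push_back(pool.submit_task([&, tid] {
      auto const start_col = std::min(tid * cols_per_task, num_columns);
      auto const end_col   = std::min(start_col + cols_per_task, num_columns);
      std::vector<int> part;
      part.reserve(end_col - start_col);
      for (int col = start_col; col < end_col; ++col) { part.push_back(compute_one(col)); }
      return part;
    }));
  }
  std::vector<int> results;
  results.reserve(num_columns);
  for (auto& task : tasks) {
    auto part = task.get();  // blocks until this task finishes
    results.insert(results.end(), part.begin(), part.end());
  }
  return results;
}

Unlike the previous std::thread version that joined into a shared results array, the future-based form also propagates any exception from a worker to the caller at get().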
@@ -849,37 +849,37 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas col_page_offsets.emplace_back(page_row_offsets.size()); }); } else { - auto constexpr num_threads = 2; - std::vector, size_type>>>> - page_row_offset_tasks{}; - page_row_offset_tasks.reserve(num_threads); - auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); + auto constexpr max_tasks = 2; + using task_page_row_offsets_type = std::vector, size_type>>; + std::vector> page_row_offset_tasks{}; + page_row_offset_tasks.reserve(max_tasks); + auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, max_tasks); // Submit page row offset compute tasks - std::transform( - thrust::counting_iterator(0), - thrust::counting_iterator(num_threads), - std::back_inserter(page_row_offset_tasks), - [&](auto const tid) { - return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { - auto const start_col = std::min(tid * cols_per_thread, num_columns); - auto const end_col = std::min(start_col + cols_per_thread, num_columns); - std::vector, size_type>> thread_page_row_offsets{}; - thread_page_row_offsets.reserve(end_col - start_col); - std::transform(thrust::counting_iterator(start_col), + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(max_tasks), + std::back_inserter(page_row_offset_tasks), + [&](auto const tid) { + return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { + auto const start_col = std::min(tid * cols_per_thread, num_columns); + auto const end_col = std::min(start_col + cols_per_thread, num_columns); + task_page_row_offsets_type task_page_row_offsets{}; + task_page_row_offsets.reserve(end_col - start_col); + std::transform( + thrust::counting_iterator(start_col), thrust::counting_iterator(end_col), - std::back_inserter(thread_page_row_offsets), + std::back_inserter(task_page_row_offsets), [&](auto const col_idx) { return compute_page_row_offsets( per_file_metadata, row_group_indices, column_schema_indices[col_idx]); }); - return thread_page_row_offsets; - }); - }); + return task_page_row_offsets; + }); + }); std::for_each(page_row_offset_tasks.begin(), page_row_offset_tasks.end(), [&](auto& task) { - auto const& thread_page_row_offsets = task.get(); - for (auto& [col_page_row_offsets, col_max_page_size] : thread_page_row_offsets) { + auto const& task_page_row_offsets = task.get(); + for (auto& [col_page_row_offsets, col_max_page_size] : task_page_row_offsets) { page_row_offsets.insert( page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); max_page_size = std::max(max_page_size, col_max_page_size); @@ -952,32 +952,36 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); auto const total_pages = page_row_offsets.size() - num_columns; - auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); + auto data_page_mask = thrust::host_vector(0); + data_page_mask.reserve(total_pages); auto host_results_iter = host_results.begin(); stream.synchronize(); + // Discard results for invalid ranges, i.e.
ranges starting at the last page of a column and // ending at the first page of the next column + auto num_pages_inserted = 0; std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { auto const col_num_pages = col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; - data_page_mask.insert( - data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); + data_page_mask.insert(data_page_mask.begin() + num_pages_inserted, + host_results_iter, + host_results_iter + col_num_pages); host_results_iter += col_num_pages + 1; + num_pages_inserted += col_num_pages; }); return data_page_mask; } // Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view -template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< +template thrust::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::column_view>(cudf::column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, rmm::cuda_stream_view stream) const; -template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< +template thrust::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index ff4c4d2ae58..5a4ccc8dea8 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -196,7 +196,7 @@ std::pair, size_type> compute_page_row_offsets( return {std::move(page_row_offsets), max_page_size}; } -rmm::device_uvector make_page_indices_async( +rmm::device_uvector compute_page_indices_async( cudf::host_span page_row_counts, cudf::host_span page_row_offsets, cudf::size_type total_rows, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 873cdfd8ec7..f10ad3f2654 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -50,12 +50,13 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Compute page row offsets for a given column schema index + * @brief Computes page row offsets and the size (number of rows) of the largest page for a given + * column schema index * * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @return Pair of page row offsets and the size of the largest page in this + * @return A pair of page row offsets and the size of the largest page in this * column */ [[nodiscard]] std::pair, size_type> compute_page_row_offsets( @@ -64,7 +65,7 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil size_type schema_idx); /** - * @brief Make a device vector where each row contains the index of the page it belongs to + * @brief Computes a device vector where each row contains the index of the page it belongs to * * @param page_row_counts Span of page row counts * @param page_row_offsets Span of page row offsets @@ -72,7 +73,7 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param stream CUDA stream * 
@return Device vector where each row contains the index of the page it belongs to */ -[[nodiscard]] rmm::device_uvector make_page_indices_async( +[[nodiscard]] rmm::device_uvector compute_page_indices_async( cudf::host_span page_row_counts, cudf::host_span page_row_offsets, cudf::size_type total_rows, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index e193b84fb78..470420a6775 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -473,7 +473,6 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_ reader_impl::reader_impl() : _options{}, - _pass_page_mask{cudf::detail::make_host_vector(0, cudf::get_default_stream())}, _subpass_page_mask{cudf::detail::hostdevice_vector(0, cudf::get_default_stream())} { } @@ -507,7 +506,6 @@ reader_impl::reader_impl(std::size_t chunk_read_limit, options.get_row_groups(), options.is_enabled_use_jit_filter()}, _sources{std::move(sources)}, - _pass_page_mask{cudf::detail::make_host_vector(0, _stream)}, _subpass_page_mask{cudf::detail::hostdevice_vector(0, _stream)}, _output_chunk_read_limit{chunk_read_limit}, _input_pass_read_limit{pass_read_limit} diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 324df2f46b7..38204bdb37e 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -435,10 +435,10 @@ class reader_impl { // _output_buffers associated schema indices std::vector _output_column_schemas; - // Page mask for filtering out pass data pages - cudf::detail::host_vector _pass_page_mask; + // Page mask for filtering out pass data pages (Not copied to the device) + thrust::host_vector _pass_page_mask; - // Page mask for filtering out subpass data pages + // Page mask for filtering out subpass data pages (Copied to the device) cudf::detail::hostdevice_vector _subpass_page_mask; // _output_buffers associated metadata
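For readers following the series end to end, here is a compact host-side sketch of the structure all of these patches manipulate, with illustrative names (build_levels, any_in_range); it uses a simpler start-side-only decomposition than the two-sided align_range_boundary in the device functor, and assumes 0 <= start < end <= row_mask.size():

#include <algorithm>
#include <bit>
#include <cstddef>
#include <vector>

// Level 0 is the row mask; level k ORs adjacent pairs of level k - 1.
// Construction stops once a level's block size would exceed max_page_size,
// mirroring the cutoff added to compute_fenwick_tree_level_offsets: queries
// span at most one page, so deeper levels would never be consulted.
std::vector<std::vector<bool>> build_levels(std::vector<bool> const& row_mask,
                                            int max_page_size)
{
  std::vector<std::vector<bool>> levels{row_mask};
  while (levels.back().size() > 1 && (1 << levels.size()) <= max_page_size) {
    auto const& prev = levels.back();
    std::vector<bool> next((prev.size() + 1) / 2, false);
    for (std::size_t i = 0; i < next.size(); ++i) {
      next[i] = prev[2 * i] || (2 * i + 1 < prev.size() && prev[2 * i + 1]);
    }
    levels.push_back(std::move(next));
  }
  return levels;
}

// True iff any element of row_mask[start, end) is set: each step consumes
// the largest power-of-two block that is aligned at `start`, fits in the
// remaining range, and has a precomputed level, so a page-sized query
// touches O(log(end - start)) blocks instead of end - start elements.
bool any_in_range(std::vector<std::vector<bool>> const& levels, int start, int end)
{
  int const max_level = static_cast<int>(levels.size()) - 1;
  while (start < end) {
    int level = (start == 0) ? max_level : std::countr_zero(static_cast<unsigned>(start));
    level     = std::min(level, max_level);
    while ((1 << level) > end - start) { --level; }
    if (levels[level][static_cast<std::size_t>(start) >> level]) { return true; }
    start += 1 << level;
  }
  return false;
}

A page covering rows [page_row_offsets[i], page_row_offsets[i + 1]) is kept exactly when any_in_range is true for that interval; the device-side search_fenwick_tree_functor answers the same per-range question in parallel over all pages.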