From df8100d0c843075ed5ed5c3a54cf6103d06ee89a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 02:25:27 +0000 Subject: [PATCH 01/25] Flatten the data page mask --- .../experimental/hybrid_scan_chunking.cu | 7 +- .../experimental/hybrid_scan_helpers.hpp | 6 +- .../parquet/experimental/hybrid_scan_impl.cpp | 31 +- .../parquet/experimental/hybrid_scan_impl.hpp | 7 +- .../parquet/experimental/page_index_filter.cu | 353 ++++-------------- .../experimental/page_index_filter_utils.cu | 235 ++++++++++++ .../experimental/page_index_filter_utils.hpp | 95 +++++ 7 files changed, 437 insertions(+), 297 deletions(-) create mode 100644 cpp/src/io/parquet/experimental/page_index_filter_utils.cu create mode 100644 cpp/src/io/parquet/experimental/page_index_filter_utils.hpp diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu index b49d13d4000..7e824a89c60 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -41,10 +41,9 @@ namespace cudf::io::parquet::experimental::detail { using parquet::detail::ColumnChunkDesc; using parquet::detail::pass_intermediate_data; -void hybrid_scan_reader_impl::handle_chunking( - read_mode mode, - std::vector column_chunk_buffers, - cudf::host_span const> data_page_mask) +void hybrid_scan_reader_impl::handle_chunking(read_mode mode, + std::vector column_chunk_buffers, + std::vector const& data_page_mask) { // if this is our first time in here, setup the first pass. if (!_pass_itm_data) { diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index cb8e18ceaec..1330353045e 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -287,10 +287,10 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @param row_mask_offset Offset into the row mask column for the current pass * @param stream CUDA stream used for device memory operations and kernel launches * - * @return A vector of boolean vectors indicating which data pages need to be decoded to produce - * the output table based on the input row mask, one per input column + * @return Boolean vector indicating which data pages need to be decoded to produce + * the output table based on the input row mask across all input columns */ - [[nodiscard]] std::vector> compute_data_page_mask( + [[nodiscard]] std::vector compute_data_page_mask( cudf::column_view row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index cf7bd7419d4..422ed80222c 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -455,7 +455,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -485,7 +485,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -524,7 +524,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -575,7 +575,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector>{}; + : std::vector{}; prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -656,7 +656,7 @@ void hybrid_scan_reader_impl::prepare_data( read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - cudf::host_span const> data_page_mask) + std::vector const& data_page_mask) { // if we have not preprocessed at the whole-file level, do that now if (not _file_preprocessed) { @@ -888,8 +888,7 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( } } -void hybrid_scan_reader_impl::set_pass_page_mask( - cudf::host_span const> data_page_mask) +void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_page_mask) { auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; @@ -903,13 +902,11 @@ void hybrid_scan_reader_impl::set_pass_page_mask( return; } + size_t num_inserted_data_pages = 0; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(_input_columns.size()), [&](auto col_idx) { - auto const& col_page_mask = data_page_mask[col_idx]; - size_t num_inserted_data_pages = 0; - for (size_t chunk_idx = col_idx; chunk_idx < chunks.size(); chunk_idx += num_columns) { // Insert a true value for each dictionary page if (chunks[chunk_idx].num_dict_pages > 0) { _pass_page_mask.push_back(true); } @@ -919,23 +916,23 @@ void hybrid_scan_reader_impl::set_pass_page_mask( // Make sure we have enough page mask for this column chunk CUDF_EXPECTS( - col_page_mask.size() >= num_inserted_data_pages + num_data_pages_this_col_chunk, + data_page_mask.size() >= num_inserted_data_pages + num_data_pages_this_col_chunk, "Encountered invalid data page mask size"); // Insert page mask for this column chunk _pass_page_mask.insert( _pass_page_mask.end(), - col_page_mask.begin() + num_inserted_data_pages, - col_page_mask.begin() + num_inserted_data_pages + num_data_pages_this_col_chunk); - + data_page_mask.begin() + num_inserted_data_pages, + data_page_mask.begin() + num_inserted_data_pages + num_data_pages_this_col_chunk); // Update the number of inserted data pages num_inserted_data_pages += num_data_pages_this_col_chunk; } - // Make sure we inserted exactly the number of data pages for this column - CUDF_EXPECTS(num_inserted_data_pages == col_page_mask.size(), - "Encountered mismatch in number of data pages and page mask size"); }); + // Make sure we inserted exactly the number of data pages for this column + CUDF_EXPECTS(num_inserted_data_pages == data_page_mask.size(), + "Encountered mismatch in number of data pages and page mask size"); + // Make sure we 
inserted exactly the number of pages for this pass CUDF_EXPECTS(_pass_page_mask.size() == pass->pages.size(), "Encountered mismatch in number of pass pages and page mask size"); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index e862959a44f..9fbeea9a58d 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -266,7 +266,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param data_page_mask Input data page mask from page-pruning step */ - void set_pass_page_mask(cudf::host_span const> data_page_mask); + void set_pass_page_mask(std::vector const& data_page_mask); /** * @brief Select the columns to be read based on the read mode @@ -296,11 +296,12 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * @param mode Value indicating if the data sources are read all at once or chunk by chunk * @param row_group_indices Row group indices to read * @param column_chunk_buffers Device buffers containing column chunk data + * @param data_page_mask Input data page mask from page-pruning step */ void prepare_data(read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - cudf::host_span const> data_page_mask); + std::vector const& data_page_mask); /** * @brief Create descriptors for filter column chunks and decode dictionary page headers @@ -342,7 +343,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { */ void handle_chunking(read_mode mode, std::vector column_chunk_buffers, - cudf::host_span const> data_page_mask); + std::vector const& data_page_mask); /** * @brief Setup step for the next input read pass. diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index d6a6e2e9f3e..b5948197537 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -15,8 +15,8 @@ */ #include "hybrid_scan_helpers.hpp" -#include "io/parquet/reader_impl_helpers.hpp" #include "io/parquet/stats_filter_helpers.hpp" +#include "page_index_filter_utils.hpp" #include #include @@ -27,12 +27,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -43,14 +41,11 @@ #include #include -#include #include #include -#include #include #include -#include namespace cudf::io::parquet::experimental::detail { @@ -60,187 +55,6 @@ using string_index_pair = parquet::detail::string_index_pair; namespace { -/** - * @brief Make a device vector where each row contains the index of the page it belongs to - */ -[[nodiscard]] rmm::device_uvector make_page_indices_async( - cudf::host_span page_row_counts, - cudf::host_span page_row_offsets, - cudf::size_type total_rows, - rmm::cuda_stream_view stream) -{ - auto mr = cudf::get_current_device_resource_ref(); - - // Copy page-level row counts and offsets to device - auto row_counts = cudf::detail::make_device_uvector_async(page_row_counts, stream, mr); - auto row_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - - // Make a zeroed device vector to store page indices of each row - auto page_indices = - cudf::detail::make_zeroed_device_uvector_async(total_rows, stream, mr); - - // Scatter page indices across the their first row's index - thrust::scatter_if(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - 
thrust::counting_iterator(row_counts.size()), - row_offsets.begin(), - row_counts.begin(), - page_indices.begin()); - - // Inclusive scan with maximum to replace zeros with the (increasing) page index it belongs to. - // Page indices are scattered at their first row's index. - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - page_indices.begin(), - page_indices.end(), - page_indices.begin(), - cuda::maximum()); - return page_indices; -} - -/** - * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a - * given column schema index - */ -[[nodiscard]] auto make_page_row_counts_and_offsets( - cudf::host_span per_file_metadata, - cudf::host_span const> row_group_indices, - size_type schema_idx, - rmm::cuda_stream_view stream) -{ - // Compute total number of row groups - auto const total_row_groups = - std::accumulate(row_group_indices.begin(), - row_group_indices.end(), - size_t{0}, - [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); - - // Vector to store how many rows are present in each page - set initial capacity to two data pages - // per row group - auto page_row_counts = - cudf::detail::make_empty_host_vector(2 * total_row_groups, stream); - // Vector to store the cumulative number of rows in each page - - set initial capacity to two data - // pages per row group - auto page_row_offsets = - cudf::detail::make_empty_host_vector((2 * total_row_groups) + 1, stream); - // Vector to store the cumulative number of pages in each column chunk - auto col_chunk_page_offsets = - cudf::detail::make_empty_host_vector(total_row_groups + 1, stream); - - page_row_offsets.push_back(0); - col_chunk_page_offsets.push_back(0); - - // For all data sources - std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; - // For all column chunks in this data source - std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { - auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; - // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { - return col.schema_idx == schema_idx; - }); - - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); - - // Compute page row counts and offsets if this column chunk has column and offset indexes - if (colchunk_iter->offset_index.has_value()) { - CUDF_EXPECTS(colchunk_iter->column_index.has_value(), - "Both offset and column indexes must be present"); - // Get the offset and column indexes of the column chunk - auto const& offset_index = colchunk_iter->offset_index.value(); - auto const& column_index = colchunk_iter->column_index.value(); - - // Number of pages in this column chunk - auto const row_group_num_pages = offset_index.page_locations.size(); - - CUDF_EXPECTS(column_index.min_values.size() == column_index.max_values.size(), - "page min and max values should be of same size"); - CUDF_EXPECTS(column_index.min_values.size() == row_group_num_pages, - "mismatch between size of min/max page values and the size of page " - "locations"); - // Update the cumulative number of pages in this column chunk - col_chunk_page_offsets.push_back(col_chunk_page_offsets.back() + row_group_num_pages); - - // For all pages 
in this column chunk, update page row counts and offsets. - std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_num_pages), - [&](auto const page_idx) { - int64_t const first_row_idx = offset_index.page_locations[page_idx].first_row_index; - // For the last page, this is simply the total number of rows in the column chunk - int64_t const last_row_idx = - (page_idx < row_group_num_pages - 1) - ? offset_index.page_locations[page_idx + 1].first_row_index - : row_group.num_rows; - - // Update the page row counts and offsets - page_row_counts.push_back(last_row_idx - first_row_idx); - page_row_offsets.push_back(page_row_offsets.back() + page_row_counts.back()); - }); - } - }); - }); - - return std::tuple{ - std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; -} - -/** - * @brief Compute if the page index is present in all parquet data sources for all columns - */ -[[nodiscard]] bool compute_has_page_index( - cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices) -{ - // For all output columns, check all parquet data sources - return std::all_of( - column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { - // For all parquet data sources - return std::all_of( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto const src_index) { - // For all row groups in this parquet data source - auto const& rg_indices = row_group_indices[src_index]; - return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { - auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; - auto col = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); - // Check if the offset_index and column_index are present - return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and - col->offset_index.has_value() and col->column_index.has_value(); - }); - }); - }); -} - -/** - * @brief Construct a vector of all required data pages from the page row counts - */ -[[nodiscard]] auto all_required_data_pages( - cudf::host_span const> page_row_counts) -{ - std::vector> all_required_data_pages; - all_required_data_pages.reserve(page_row_counts.size()); - std::transform( - page_row_counts.begin(), - page_row_counts.end(), - std::back_inserter(all_required_data_pages), - [&](auto const& col_page_counts) { return std::vector(col_page_counts.size(), true); }); - - return all_required_data_pages; -}; - /** * @brief Converts page-level statistics of a column to 2 device columns - min, max values. Each * column has number of rows equal to the total rows in all row groups. @@ -455,7 +269,8 @@ struct page_stats_caster : public stats_caster_base { } else { // Compute column chunk level page count offsets, and page level row counts and row offsets. 
       auto const [page_row_counts, page_row_offsets, col_chunk_page_offsets] =
-        make_page_row_counts_and_offsets(per_file_metadata, row_group_indices, schema_idx, stream);
+        compute_page_row_counts_and_offsets(
+          per_file_metadata, row_group_indices, schema_idx, stream);
 
       CUDF_EXPECTS(
         page_row_offsets.back() == total_rows,
@@ -479,7 +294,7 @@ struct page_stats_caster : public stats_caster_base {
         std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) {
           auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx];
           // Find colchunk_iter in row_group.columns. Guaranteed to be found as already verified
-          // in make_page_row_counts_and_offsets()
+          // in compute_page_row_counts_and_offsets()
           auto colchunk_iter = std::find_if(
             row_group.columns.begin(),
             row_group.columns.end(),
@@ -672,7 +487,7 @@ std::unique_ptr<cudf::column> aggregate_reader_metadata::build_row_mask_with_pag
   return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr);
 }
 
-std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask(
+std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   cudf::column_view row_mask,
   cudf::host_span<std::vector<size_type> const> row_group_indices,
   cudf::host_span<parquet::detail::input_column_info const> input_columns,
@@ -701,53 +516,50 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
     return {};  // An empty data page mask indicates all pages are required
   }
 
-  // Compute page row counts, offsets, and column chunk page offsets for each column
-  std::vector<cudf::detail::host_vector<size_type>> page_row_counts;
-  std::vector<cudf::detail::host_vector<size_type>> page_row_offsets;
-  std::vector<cudf::detail::host_vector<size_type>> col_chunk_page_offsets;
-  page_row_counts.reserve(num_columns);
-  page_row_offsets.reserve(num_columns);
-  col_chunk_page_offsets.reserve(num_columns);
+  // Compute page row offsets and column chunk page offsets for each column
+  std::vector<size_type> page_row_offsets;
+  std::vector<size_type> col_page_offsets;
+  col_page_offsets.reserve(num_columns + 1);
+  col_page_offsets.push_back(0);
+
+  size_type max_page_size = 0;
 
   if (num_columns == 1) {
-    auto const schema_idx = column_schema_indices.front();
-    auto [counts, offsets, chunk_offsets] =
-      make_page_row_counts_and_offsets(per_file_metadata, row_group_indices, schema_idx, stream);
-    page_row_counts.emplace_back(std::move(counts));
-    page_row_offsets.emplace_back(std::move(offsets));
+    auto const schema_idx   = column_schema_indices.front();
+    size_type col_num_pages = 0;
+    std::tie(page_row_offsets, col_num_pages, max_page_size) =
+      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx);
+    // Add 1 to include the 0th page's offset for each column
+    col_page_offsets.emplace_back(col_num_pages + 1);
  } else {
-    std::vector<std::future<std::tuple<cudf::detail::host_vector<size_type>,
-                                       cudf::detail::host_vector<size_type>,
-                                       cudf::detail::host_vector<size_type>>>>
-      page_row_counts_and_offsets_tasks;
-    page_row_counts_and_offsets_tasks.reserve(num_columns);
-
-    auto streams = cudf::detail::fork_streams(stream, num_columns);
+    std::vector<std::future<std::tuple<std::vector<size_type>, size_type, size_type>>>
+      page_row_offsets_tasks;
+    page_row_offsets_tasks.reserve(num_columns);
 
     std::for_each(thrust::counting_iterator<size_t>(0),
                   thrust::counting_iterator<size_t>(num_columns),
                   [&](auto const col_idx) {
-                    page_row_counts_and_offsets_tasks.emplace_back(
+                    page_row_offsets_tasks.emplace_back(
                       cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-                        return make_page_row_counts_and_offsets(per_file_metadata,
-                                                                row_group_indices,
-                                                                column_schema_indices[col_idx],
-                                                                streams[col_idx]);
+                        return compute_page_row_offsets(
+                          per_file_metadata, row_group_indices, column_schema_indices[col_idx]);
                       }));
                   });
 
     // Collect results from all tasks
-    std::for_each(page_row_counts_and_offsets_tasks.begin(),
-                  page_row_counts_and_offsets_tasks.end(),
-                  [&](auto& task) {
-                    auto [counts, offsets, chunk_offsets] = std::move(task).get();
-                    page_row_counts.emplace_back(std::move(counts));
-                    page_row_offsets.emplace_back(std::move(offsets));
-                    col_chunk_page_offsets.emplace_back(std::move(chunk_offsets));
-                  });
+    std::for_each(page_row_offsets_tasks.begin(), page_row_offsets_tasks.end(), [&](auto& task) {
+      auto [col_page_row_offsets, col_num_pages, col_max_page_size] = std::move(task).get();
+      page_row_offsets.insert(page_row_offsets.end(),
+                              std::make_move_iterator(col_page_row_offsets.begin()),
+                              std::make_move_iterator(col_page_row_offsets.end()));
+      max_page_size = std::max(max_page_size, col_max_page_size);
+      // Add 1 to include the 0th page's offset for each column
+      col_page_offsets.emplace_back(col_page_offsets.back() + col_num_pages + 1);
+    });
   }
 
-  auto const total_rows = page_row_offsets.back().back();
+  auto const total_rows  = page_row_offsets.back();
+  auto const total_pages = page_row_offsets.size() - num_columns;
 
   CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
                "Mismatch in total rows in input row mask and row groups",
               std::invalid_argument);
@@ -759,17 +571,14 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
                      row_mask.begin<bool>() + row_mask_offset,
                      row_mask.begin<bool>() + row_mask_offset + total_rows,
                      cuda::std::identity{})) {
-    return all_required_data_pages(page_row_counts);
+    return std::vector<bool>(total_pages, true);
   }
 
-  // Vector to hold data page mask for each column
-  auto data_page_mask = std::vector<std::vector<bool>>(num_columns);
-
   // Total number of surviving pages across all columns
   std::atomic<size_t> total_surviving_pages{0};
 
   // Tasks to compute data page mask for each column
-  std::vector<std::future<void>> data_page_mask_tasks;
+  std::vector<std::future<std::vector<bool>>> data_page_mask_tasks;
   data_page_mask_tasks.reserve(num_columns);
 
   // Host row mask validity and first bit offset
@@ -807,49 +616,53 @@ std::vector<std::vector<bool>> aggregate_reader_metadata::compute_data_page_mask
   // For all columns, look up which pages contain at least one required row. i.e.
   // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the
   // output list of byte ranges for the column.
-  std::for_each(
-    thrust::counting_iterator<size_t>(0),
-    thrust::counting_iterator<size_t>(num_columns),
-    [&](auto const col_idx) {
-      data_page_mask_tasks.emplace_back(
-        cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-          // Construct a row indices mapping based on page row counts and offsets
-          auto const total_pages_this_column = page_row_counts[col_idx].size();
-          auto valid_pages_this_column = std::vector<bool>(total_pages_this_column, false);
-          // Number of final filtered pages for this column
-          size_t num_surviving_pages_this_column = 0;
-          // For all rows
-          for (auto row_idx = 0; row_idx < total_rows; ++row_idx) {
-            // If this row is required or invalid, add its page index to the output list.
- if (not is_row_valid[row_idx] or is_row_required[row_idx]) { - // binary search to find the page index this row_idx belongs to and set the - // page index to true page_indices - auto const& offsets = page_row_offsets[col_idx]; - auto const page_itr = std::upper_bound(offsets.cbegin(), offsets.cend(), row_idx); - CUDF_EXPECTS(page_itr != offsets.cbegin() and page_itr != offsets.cend(), - "Invalid page index"); - auto const page_idx = std::distance(offsets.cbegin(), page_itr) - 1; - valid_pages_this_column[page_idx] = true; - num_surviving_pages_this_column++; - // Move row_idx to the last row of this page - row_idx = offsets[page_idx + 1] - 1; - } - } - - total_surviving_pages.fetch_add(num_surviving_pages_this_column); - data_page_mask[col_idx] = std::move(valid_pages_this_column); - })); - }); - - std::for_each( - data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [](auto& task) { task.get(); }); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_columns), + [&](auto const col_idx) { + data_page_mask_tasks.emplace_back( + cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { + // Construct a row indices mapping based on page row counts and offsets + auto const total_pages_this_column = + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; + auto col_data_page_mask = std::vector(total_pages_this_column, false); + // Number of final filtered pages for this column + size_t num_surviving_pages_this_column = 0; + // For all rows + for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { + // If this row is required or invalid, add its page index to the output + // list. + if (not is_row_valid[row_idx] or is_row_required[row_idx]) { + // binary search to find the page index this row_idx belongs to and set + // the page index to true page_indices + auto const offsets = cudf::host_span( + page_row_offsets.data() + col_page_offsets[col_idx], + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx]); + auto const page_itr = + std::upper_bound(offsets.begin(), offsets.end(), row_idx); + CUDF_EXPECTS(page_itr != offsets.begin() and page_itr != offsets.end(), + "Invalid page index"); + auto const page_idx = std::distance(offsets.begin(), page_itr) - 1; + col_data_page_mask[page_idx] = true; + num_surviving_pages_this_column++; + // Move row_idx to the last row of this page + row_idx = offsets[page_idx + 1] - 1; + } + } + total_surviving_pages.fetch_add(num_surviving_pages_this_column); + return col_data_page_mask; + })); + }); - // Total number of input pages across all columns - auto const total_pages = std::accumulate( - page_row_counts.cbegin(), - page_row_counts.cend(), - size_t{0}, - [](auto sum, auto const& page_row_counts) { return sum + page_row_counts.size(); }); + // Vector to hold data page mask for each column + auto data_page_mask = std::vector(); + data_page_mask.reserve(total_pages); + + std::for_each(data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [&](auto& task) { + auto col_data_page_mask = std::move(task).get(); + data_page_mask.insert(data_page_mask.end(), + std::make_move_iterator(col_data_page_mask.begin()), + std::make_move_iterator(col_data_page_mask.end())); + }); CUDF_EXPECTS( total_surviving_pages <= total_pages, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu new file mode 100644 index 00000000000..3dfb8e69d51 --- /dev/null +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -0,0 +1,235 @@ + +/* + * 
Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "page_index_filter_utils.hpp" + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::io::parquet::experimental::detail { + +bool compute_has_page_index(cudf::host_span file_metadatas, + cudf::host_span const> row_group_indices, + cudf::host_span column_schema_indices) +{ + // For all output columns, check all parquet data sources + return std::all_of( + column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { + // For all parquet data sources + return std::all_of( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_index) { + // For all row groups in this parquet data source + auto const& rg_indices = row_group_indices[src_index]; + return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { + auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; + auto col = std::find_if( + row_group.columns.begin(), + row_group.columns.end(), + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); + // Check if the offset_index and column_index are present + return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and + col->offset_index.has_value() and col->column_index.has_value(); + }); + }); + }); +} + +std::tuple, + cudf::detail::host_vector, + cudf::detail::host_vector> +compute_page_row_counts_and_offsets(cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx, + rmm::cuda_stream_view stream) +{ + // Compute total number of row groups + auto const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); + + // Vector to store how many rows are present in each page - set initial capacity to two data pages + // per row group + auto page_row_counts = + cudf::detail::make_empty_host_vector(2 * total_row_groups, stream); + // Vector to store the cumulative number of rows in each page - - set initial capacity to two data + // pages per row group + auto page_row_offsets = + cudf::detail::make_empty_host_vector((2 * total_row_groups) + 1, stream); + // Vector to store the cumulative number of pages in each column chunk + auto col_chunk_page_offsets = + cudf::detail::make_empty_host_vector(total_row_groups + 1, stream); + + page_row_offsets.push_back(0); + col_chunk_page_offsets.push_back(0); + + // For all data sources + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto src_idx) { + auto const& rg_indices = row_group_indices[src_idx]; + // For all column chunks in this data source + std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { + auto const& row_group = 
per_file_metadata[src_idx].row_groups[rg_idx]; + // Find the column chunk with the given schema index + auto colchunk_iter = std::find_if( + row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { + return col.schema_idx == schema_idx; + }); + + CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + + " not found in row group", + std::invalid_argument); + + // Compute page row counts and offsets if this column chunk has column and offset indexes + if (colchunk_iter->offset_index.has_value()) { + // Get the offset index of the column chunk + auto const& offset_index = colchunk_iter->offset_index.value(); + auto const row_group_num_pages = offset_index.page_locations.size(); + + col_chunk_page_offsets.push_back(col_chunk_page_offsets.back() + row_group_num_pages); + + // For all pages in this column chunk, update page row counts and offsets. + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_num_pages), + [&](auto const page_idx) { + int64_t const first_row_idx = offset_index.page_locations[page_idx].first_row_index; + // For the last page, this is simply the total number of rows in the column chunk + int64_t const last_row_idx = + (page_idx < row_group_num_pages - 1) + ? offset_index.page_locations[page_idx + 1].first_row_index + : row_group.num_rows; + + // Update the page row counts and offsets + page_row_counts.push_back(last_row_idx - first_row_idx); + page_row_offsets.push_back(page_row_offsets.back() + page_row_counts.back()); + }); + } + }); + }); + + return { + std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; +} + +std::tuple, size_type, size_type> compute_page_row_offsets( + cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx) +{ + // Compute total number of row groups + auto const total_row_groups = + std::accumulate(row_group_indices.begin(), + row_group_indices.end(), + size_t{0}, + [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); + + std::vector page_row_offsets; + page_row_offsets.push_back(0); + size_type max_page_size = 0; + size_type num_pages = 0; + + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_idx) { + auto const& rg_indices = row_group_indices[src_idx]; + // For all row groups in this source + std::for_each(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_idx) { + auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; + // Find the column chunk with the given schema index + auto colchunk_iter = std::find_if( + row_group.columns.begin(), + row_group.columns.end(), + [schema_idx](auto const& col) { return col.schema_idx == schema_idx; }); + CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + + " not found in row group", + std::invalid_argument); + auto const& offset_index = colchunk_iter->offset_index.value(); + auto const row_group_num_pages = offset_index.page_locations.size(); + num_pages += static_cast(row_group_num_pages); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(row_group_num_pages), + [&](auto const page_idx) { + int64_t const first_row_idx = + offset_index.page_locations[page_idx].first_row_index; + int64_t const last_row_idx = + (page_idx < row_group_num_pages - 1) + ? 
offset_index.page_locations[page_idx + 1].first_row_index
+                            : row_group.num_rows;
+                        auto const page_size = last_row_idx - first_row_idx;
+                        // page_size is an int64_t; narrow it to size_type for the running maximum
+                        max_page_size =
+                          std::max(max_page_size, static_cast<size_type>(page_size));
+                        page_row_offsets.push_back(page_row_offsets.back() + page_size);
+                      });
+    });
+  });
+
+  return {std::move(page_row_offsets), num_pages, max_page_size};
+}
+
+rmm::device_uvector<cudf::size_type> make_page_indices_async(
+  cudf::host_span<cudf::size_type const> page_row_counts,
+  cudf::host_span<cudf::size_type const> page_row_offsets,
+  cudf::size_type total_rows,
+  rmm::cuda_stream_view stream)
+{
+  auto mr = cudf::get_current_device_resource_ref();
+
+  // Copy page-level row counts and offsets to device
+  auto row_counts  = cudf::detail::make_device_uvector_async(page_row_counts, stream, mr);
+  auto row_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr);
+
+  // Make a zeroed device vector to store page indices of each row
+  auto page_indices =
+    cudf::detail::make_zeroed_device_uvector_async<size_type>(total_rows, stream, mr);
+
+  // Scatter page indices at their first row's index
+  thrust::scatter_if(rmm::exec_policy_nosync(stream),
+                     thrust::counting_iterator<size_type>(0),
+                     thrust::counting_iterator<size_type>(row_counts.size()),
+                     row_offsets.begin(),
+                     row_counts.begin(),
+                     page_indices.begin());
+
+  // Inclusive scan with maximum to replace zeros with the (increasing) page index it belongs to.
+  // Page indices are scattered at their first row's index.
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                         page_indices.begin(),
+                         page_indices.end(),
+                         page_indices.begin(),
+                         cuda::maximum<size_type>());
+  return page_indices;
+}
+
+}  // namespace cudf::io::parquet::experimental::detail
\ No newline at end of file
diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp
new file mode 100644
index 00000000000..f53bede91c7
--- /dev/null
+++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include "io/parquet/reader_impl_helpers.hpp" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::io::parquet::experimental::detail { + +using metadata_base = parquet::detail::metadata; + +/** + * @brief Compute if the page index is present in all parquet data sources for all columns + * + * @param file_metadatas Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param column_schema_indices Span of input column schema indices + * @return Boolean indicating if the page index is present in all parquet data sources for all + * columns + */ +[[nodiscard]] bool compute_has_page_index( + cudf::host_span file_metadatas, + cudf::host_span const> row_group_indices, + cudf::host_span column_schema_indices); + +/** + * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a + * given column schema index + * + * @param per_file_metadata Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param schema_idx Column's schema index + * @param stream CUDA stream + * @return Tuple of page row counts, page row offsets, and column chunk page (count) offsets + */ +[[nodiscard]] std::tuple, + cudf::detail::host_vector, + cudf::detail::host_vector> +compute_page_row_counts_and_offsets(cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx, + rmm::cuda_stream_view stream); + +/** + * @brief Compute page row offsets for a given column schema index + * + * @param per_file_metadata Span of parquet footer metadata + * @param row_group_indices Span of input row group indices + * @param schema_idx Column's schema index + * @param stream CUDA stream + * @return Tuple of page row offsets, number of pages, and the size of the largest page in this + * column + */ +[[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( + cudf::host_span per_file_metadata, + cudf::host_span const> row_group_indices, + size_type schema_idx); + +/** + * @brief Make a device vector where each row contains the index of the page it belongs to + * + * @param page_row_counts Span of page row counts + * @param page_row_offsets Span of page row offsets + * @param total_rows Total number of rows + * @param stream CUDA stream + * @return Device vector where each row contains the index of the page it belongs to + */ +[[nodiscard]] rmm::device_uvector make_page_indices_async( + cudf::host_span page_row_counts, + cudf::host_span page_row_offsets, + cudf::size_type total_rows, + rmm::cuda_stream_view stream); + +} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file From bc6f08b528ff5a77a8eb080d19b9f5edcda90f77 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 02:58:11 +0000 Subject: [PATCH 02/25] Templatize ColumnView type for compute data page mask --- .../experimental/hybrid_scan_helpers.hpp | 6 +- .../parquet/experimental/page_index_filter.cu | 79 ++++++++++--------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 1330353045e..6ad25641407 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -281,6 +281,9 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * Compute a vector of boolean vectors indicating which data pages 
need to be decoded to * construct each input column based on the row mask, one vector per column * + * @tparam ColumnView Type of the row mask column view - cudf::mutable_column_view for filter + * columns and cudf::column_view for payload columns + * * @param row_mask Boolean column indicating which rows need to be read after page-pruning * @param row_group_indices Input row groups indices * @param input_columns Input column information @@ -290,8 +293,9 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * @return Boolean vector indicating which data pages need to be decoded to produce * the output table based on the input row mask across all input columns */ + template [[nodiscard]] std::vector compute_data_page_mask( - cudf::column_view row_mask, + ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index b5948197537..937b2dcfa6e 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -487,8 +487,9 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); } +template std::vector aggregate_reader_metadata::compute_data_page_mask( - cudf::column_view row_mask, + ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, @@ -565,13 +566,26 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( "Mismatch in total rows in input row mask and row groups", std::invalid_argument); - // Return if all rows are required or all are invalid. - if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows) == total_rows or - thrust::all_of(rmm::exec_policy(stream), - row_mask.begin() + row_mask_offset, - row_mask.begin() + row_mask_offset + total_rows, + if constexpr (cuda::std::is_same_v) { + if (row_mask.nullable()) { + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(row_mask_offset), + thrust::counting_iterator(row_mask_offset + total_rows), + [row_mask = row_mask.template begin(), + null_mask = row_mask.null_mask()] __device__(auto const row_idx) { + if (not bit_is_set(null_mask, row_idx)) { row_mask[row_idx] = true; } + }); + } + } else { + CUDF_EXPECTS(not row_mask.nullable(), "Row mask must not be nullable for payload columns"); + } + + // Return an empty vector if all rows are required or all are invalid. 
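+  // (An empty data page mask tells the caller to decode all pages; see the early return
+  // above, where an empty mask indicates all pages are required.)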
+ if (thrust::all_of(rmm::exec_policy(stream), + row_mask.template begin() + row_mask_offset, + row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return std::vector(total_pages, true); + return {}; } // Total number of surviving pages across all columns @@ -581,37 +595,10 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( std::vector>> data_page_mask_tasks; data_page_mask_tasks.reserve(num_columns); - // Host row mask validity and first bit offset - auto const [host_row_mask_validity, first_bit_offset] = [&] { - if (row_mask.nullable()) { - auto const first_word_idx = word_index(row_mask_offset); - auto const last_word_idx = word_index(row_mask_offset + total_rows); - auto const num_words = last_word_idx - first_word_idx + 1; - auto const max_words = num_bitmask_words(row_mask.size()) - first_word_idx - 1; - CUDF_EXPECTS(num_words <= max_words, - "Encountered unexpected number of bitmask words to copy from the row mask"); - return std::pair{ - cudf::detail::make_host_vector( - device_span(row_mask.null_mask() + first_word_idx, num_words), - stream), - intra_word_index(row_mask_offset)}; - } else { - // Empty vector if row mask is not nullable - return std::pair{cudf::detail::make_host_vector(0, stream), 0}; - } - }(); - - // Iterator for row mask validity - auto is_row_valid = cudf::detail::make_counting_transform_iterator( - first_bit_offset, - [is_nullable = row_mask.nullable(), nullmask = host_row_mask_validity.data()](auto bit_index) { - // Always valid if row mask is not nullable or check if the corresponding bit is set - return not is_nullable or bit_is_set(nullmask, bit_index); - }); - // Host row mask data auto const is_row_required = cudf::detail::make_host_vector( - device_span(row_mask.data() + row_mask_offset, total_rows), stream); + device_span(row_mask.template data() + row_mask_offset, total_rows), + stream); // For all columns, look up which pages contain at least one required row. i.e. // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the @@ -629,9 +616,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( size_t num_surviving_pages_this_column = 0; // For all rows for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { - // If this row is required or invalid, add its page index to the output - // list. - if (not is_row_valid[row_idx] or is_row_required[row_idx]) { + // If this row is required, add its page index to the output list. 
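+                          // (Invalid rows were folded into the mask above: for filter columns,
+                          // null rows are rewritten as required before this loop runs.)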
+ if (is_row_required[row_idx]) { // binary search to find the page index this row_idx belongs to and set // the page index to true page_indices auto const offsets = cudf::host_span( @@ -671,4 +657,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( return data_page_mask; } +// Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view +template std::vector aggregate_reader_metadata::compute_data_page_mask( + cudf::column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + +template std::vector aggregate_reader_metadata::compute_data_page_mask< + cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + } // namespace cudf::io::parquet::experimental::detail From 30dbbee03aa703b2a6391c44e38e2e99b49f1180 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Wed, 15 Oct 2025 03:24:54 +0000 Subject: [PATCH 03/25] Minor refactor --- .../parquet/experimental/page_index_filter.cu | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 937b2dcfa6e..abf352d1266 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -566,6 +566,15 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( "Mismatch in total rows in input row mask and row groups", std::invalid_argument); + // Return an empty vector if all rows are invalid or all rows are required + if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or + thrust::all_of(rmm::exec_policy(stream), + row_mask.template begin() + row_mask_offset, + row_mask.template begin() + row_mask_offset + total_rows, + cuda::std::identity{})) { + return {}; + } + if constexpr (cuda::std::is_same_v) { if (row_mask.nullable()) { thrust::for_each(rmm::exec_policy_nosync(stream), @@ -577,15 +586,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( }); } } else { - CUDF_EXPECTS(not row_mask.nullable(), "Row mask must not be nullable for payload columns"); - } - - // Return an empty vector if all rows are required or all are invalid. 
-  if (thrust::all_of(rmm::exec_policy(stream),
-                     row_mask.template begin<bool>() + row_mask_offset,
-                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
-                     cuda::std::identity{})) {
-    return {};
+    CUDF_EXPECTS(not row_mask.nullable() or row_mask.null_count() == 0,
+                 "Row mask must not contain nulls for payload columns");
   }
 
   // Total number of surviving pages across all columns

From 73f64f218468c47145c4815fc71dc09a22dfcd5b Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb
Date: Wed, 15 Oct 2025 22:47:44 +0000
Subject: [PATCH 04/25] Initial impl of GPU data page mask algorithm

---
 .../parquet/experimental/page_index_filter.cu | 292 +++++++++++++-----
 .../experimental/page_index_filter_utils.cu   |  28 +++-
 .../experimental/page_index_filter_utils.hpp  |  15 +++-
 3 files changed, 255 insertions(+), 80 deletions(-)

diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu
index abf352d1266..450dd2a3de7 100644
--- a/cpp/src/io/parquet/experimental/page_index_filter.cu
+++ b/cpp/src/io/parquet/experimental/page_index_filter.cu
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -487,6 +488,147 @@ std::unique_ptr<cudf::column> aggregate_reader_metadata::build_row_mask_with_pag
   return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr);
 }
 
+/**
+ * @brief Custom CUDA kernel that builds one mask level as a paired logical OR reduction of the
+ * level below it.
+ *
+ * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction.
+ * CUB's BlockReduce is unsuitable here as it reduces a block to a single element.
+ */
+__global__ void reduce_or_kernel(cudf::device_span<bool*> level_ptrs,
+                                 size_type current_level,
+                                 size_t current_level_size)  // Size of the source level
+{
+  // Global thread index
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+
+  auto const current_level_ptr = level_ptrs[current_level];
+  auto next_level_ptr          = level_ptrs[current_level + 1];
+
+  // The next level holds one element for each (up to) two elements of the current level
+  size_t next_level_size = (current_level_size + 1) / 2;
+
+  if (tid >= next_level_size) { return; }
+
+  size_t idx1 = tid * 2;
+  size_t idx2 = tid * 2 + 1;
+
+  // Perform the logical OR reduction and write to the next level's location. When the current
+  // level has an odd size, the last output has a single input, which is carried forward as-is
+  // (equivalent to ORing it with false).
+  next_level_ptr[tid] = (idx2 < current_level_size)
+                          ? (current_level_ptr[idx1] || current_level_ptr[idx2])
+                          : current_level_ptr[idx1];
+}
+
+/**
+ * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy.
+ * One thread handles the binary decomposition and query for one range [M, N).
+ *
+ * @param level_ptrs Device span of pointers, where level_ptrs[k] points to the start of the
+ * Level k mask
+ * @param page_offsets Device span of range boundaries, where range i is
+ * [page_offsets[i], page_offsets[i+1])
+ * @param results Pointer to device memory to store one boolean result per range (true if a set
+ * bit is found in the range).
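+ *
+ * For example (illustrative): with levels L0..L2 available, the range [3, 11) decomposes into
+ * the power-of-two aligned blocks [3,4), [4,8), [10,11) and [8,10), which are probed as
+ * L0[3], L2[1], L0[10] and L1[4]; the range survives if any probe finds a set entry.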
+ */
+__global__ void probe_hierarchical_masks_kernel(
+  cudf::device_span<bool*> level_ptrs,              // Pointers to start of each mask level
+  cudf::device_span<size_type const> page_offsets,  // Range boundary array
+  bool* results)
+{
+  auto const num_ranges = page_offsets.size() - 1;
+  size_t range_idx      = cudf::detail::grid_1d::global_thread_id();
+
+  if (range_idx >= num_ranges) { return; }
+
+  // Initialize result for this range to false (assuming no set bit)
+  results[range_idx] = false;
+
+  // Retrieve M and N for the current range [M, N)
+  size_type M = page_offsets[range_idx];
+  size_type N = page_offsets[range_idx + 1];
+
+  // If the range is empty or invalid, terminate
+  if (M >= N) { return; }
+
+  // Binary Decomposition Loop
+  while (M < N) {
+    // 1. Calculate the largest power of 2 that can align M up to the boundary.
+    //    This is determined by the Least Significant Bit (LSB) of M: the expression (M & -M)
+    //    gives the value of the LSB, which is the block size (2^k). When M == 0 there is no
+    //    LSB, so fall back to the largest power-of-two block that fits within [0, N).
+    size_t m_lsb_block_size =
+      (M == 0) ? (size_t{1} << (31 - __clz(N))) : static_cast<size_t>(M & -M);
+    size_t m_next_aligned = M + m_lsb_block_size;
+
+    // 2. Calculate the largest power of 2 block that can align N down to the boundary.
+    //    The expression (N & -N) gives the block size corresponding to N's alignment.
+    size_t n_lsb_block_size = N & -N;
+
+    // --- Decision Logic: Which side to consume? ---
+
+    // Block 1: M-aligned block (from M up to m_next_aligned)
+    size_t block1_size = m_next_aligned - M;
+
+    // Block 2: N-aligned block (from N - n_lsb_block_size up to N)
+    size_t block2_size = n_lsb_block_size;
+
+    if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) {
+      // The M-aligned block is fully contained in the range [M, N)
+
+      // block1_size is derived from an LSB, so it is always a power of 2: k1 = log2(block1_size)
+      size_t k1 = __ffs(block1_size) - 1;
+
+      // Calculate mask index: the starting point M is divided by the block size
+      size_t mask_idx = M / block1_size;
+
+      // Look up the mask value
+      if (level_ptrs[k1][mask_idx]) {
+        results[range_idx] = true;
+        return;  // Found a set bit, terminate for this range
+      }
+
+      // Advance M
+      M = m_next_aligned;
+    } else if (block2_size > 0 && N - block2_size >= M) {
+      // The N-aligned block is fully contained and does not overlap M's new position
+
+      // block2_size is also a power of 2: k2 = log2(block2_size)
+      size_t k2 = __ffs(block2_size) - 1;
+
+      // Calculate mask index
+      size_t mask_idx = (N - block2_size) / block2_size;
+
+      // Look up the mask value
+      if (level_ptrs[k2][mask_idx]) {
+        results[range_idx] = true;
+        return;  // Found a set bit, terminate for this range
+      }
+
+      // Backtrack N
+      N = N - block2_size;
+    } else {
+      // The remaining range is unaligned and small (or just 1 element).
+      // This happens when M and N are close and unaligned (e.g., [11, 13]).
+
+      // Prioritize M (1-row check) or N (1-row check) until they meet.
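+      // (Level 0 is the row mask itself, so this fallback probes individual rows.)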
+
+      // Check single row at M (Level 0)
+      if (level_ptrs[0][M]) {
+        results[range_idx] = true;
+        return;
+      }
+      M++;
+    }
+  }
+}
+
 template <typename ColumnView>
 std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   ColumnView const& row_mask,
   cudf::host_span<std::vector<size_type> const> row_group_indices,
   cudf::host_span<parquet::detail::input_column_info const> input_columns,
   cudf::size_type row_mask_offset,
@@ -500,6 +642,21 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
   CUDF_EXPECTS(row_mask.type().id() == cudf::type_id::BOOL8,
                "Input row bitmask should be of type BOOL8");
 
+  auto const total_rows = total_rows_in_row_groups(row_group_indices);
+
+  // Return an empty vector if all rows are invalid or all rows are required
+  if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or
+      thrust::all_of(rmm::exec_policy(stream),
+                     row_mask.template begin<bool>() + row_mask_offset,
+                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
+                     cuda::std::identity{})) {
+    return {};
+  }
+
+  CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
+               "Mismatch in total rows in input row mask and row groups",
+               std::invalid_argument);
+
   auto const num_columns = input_columns.size();
 
   // Collect column schema indices from the input columns.
@@ -529,7 +686,7 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
     auto const schema_idx   = column_schema_indices.front();
     size_type col_num_pages = 0;
     std::tie(page_row_offsets, col_num_pages, max_page_size) =
-      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx);
+      compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx, row_mask_offset);
     // Add 1 to include the 0th page's offset for each column
     col_page_offsets.emplace_back(col_num_pages + 1);
   } else {
@@ -542,8 +699,10 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
                   [&](auto const col_idx) {
                     page_row_offsets_tasks.emplace_back(
                       cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] {
-                        return compute_page_row_offsets(
-                          per_file_metadata, row_group_indices, column_schema_indices[col_idx]);
+                        return compute_page_row_offsets(per_file_metadata,
+                                                        row_group_indices,
+                                                        column_schema_indices[col_idx],
+                                                        row_mask_offset);
                       }));
                   });
 
@@ -559,22 +718,9 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
     });
   }
 
-  auto const total_rows  = page_row_offsets.back();
   auto const total_pages = page_row_offsets.size() - num_columns;
 
-  CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(),
-               "Mismatch in total rows in input row mask and row groups",
-               std::invalid_argument);
-
-  // Return an empty vector if all rows are invalid or all rows are required
-  if (row_mask.null_count(row_mask_offset, row_mask_offset + total_rows, stream) == total_rows or
-      thrust::all_of(rmm::exec_policy(stream),
-                     row_mask.template begin<bool>() + row_mask_offset,
-                     row_mask.template begin<bool>() + row_mask_offset + total_rows,
-                     cuda::std::identity{})) {
-    return {};
-  }
-
+  // Make sure all row_mask elements contain valid values even if they are nulls
   if constexpr (cuda::std::is_same_v<ColumnView, cudf::mutable_column_view>) {
     if (row_mask.nullable()) {
       thrust::for_each(rmm::exec_policy_nosync(stream),
@@ -590,71 +736,65 @@ std::vector<bool> aggregate_reader_metadata::compute_data_page_mask(
                 "Row mask must not contain nulls for payload columns");
   }
 
+  auto const mr = cudf::get_current_device_resource_ref();
+  auto const [level_offsets, total_levels_size] =
+    compute_row_mask_levels(row_mask.size(), max_page_size);
+  auto const total_levels = level_offsets.size();
-  // Total number of surviving pages across all columns
-  std::atomic<size_t> total_surviving_pages{0};
+  auto levels_data = rmm::device_uvector<bool>(total_levels_size, stream, mr);
-  // Tasks to compute data page mask for each column
-  std::vector<std::future<std::vector<bool>>> data_page_mask_tasks;
data_page_mask_tasks; - data_page_mask_tasks.reserve(num_columns); + auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); - // Host row mask data - auto const is_row_required = cudf::detail::make_host_vector( - device_span(row_mask.template data() + row_mask_offset, total_rows), - stream); - - // For all columns, look up which pages contain at least one required row. i.e. - // !validity_it[row_idx] or is_row_required[row_idx] satisfies, and add its byte range to the - // output list of byte ranges for the column. - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_columns), - [&](auto const col_idx) { - data_page_mask_tasks.emplace_back( - cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - // Construct a row indices mapping based on page row counts and offsets - auto const total_pages_this_column = - col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; - auto col_data_page_mask = std::vector(total_pages_this_column, false); - // Number of final filtered pages for this column - size_t num_surviving_pages_this_column = 0; - // For all rows - for (auto row_idx = 0; row_idx < total_rows; ++row_idx) { - // If this row is required, add its page index to the output list. - if (is_row_required[row_idx]) { - // binary search to find the page index this row_idx belongs to and set - // the page index to true page_indices - auto const offsets = cudf::host_span( - page_row_offsets.data() + col_page_offsets[col_idx], - col_page_offsets[col_idx + 1] - col_page_offsets[col_idx]); - auto const page_itr = - std::upper_bound(offsets.begin(), offsets.end(), row_idx); - CUDF_EXPECTS(page_itr != offsets.begin() and page_itr != offsets.end(), - "Invalid page index"); - auto const page_idx = std::distance(offsets.begin(), page_itr) - 1; - col_data_page_mask[page_idx] = true; - num_surviving_pages_this_column++; - // Move row_idx to the last row of this page - row_idx = offsets[page_idx + 1] - 1; - } - } - total_surviving_pages.fetch_add(num_surviving_pages_this_column); - return col_data_page_mask; - })); + auto host_level_ptrs = cudf::detail::make_host_vector(total_levels, stream); + host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; + std::for_each(thrust::counting_iterator(1), + thrust::counting_iterator(total_levels), + [&](auto const level_idx) { + host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; }); + auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - // Vector to hold data page mask for each column - auto data_page_mask = std::vector(); - data_page_mask.reserve(total_pages); + { + auto const next_level_size = level_offsets[1]; + auto const current_level_size = row_mask.size(); + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs, 0, current_level_size); + stream.synchronize(); + } + + for (size_t current_level_idx = 1; current_level_idx < total_levels - 1; current_level_idx++) { + auto const next_level_size = + level_offsets[current_level_idx + 1] - level_offsets[current_level_idx]; + auto const current_level_size = + level_offsets[current_level_idx] - level_offsets[current_level_idx - 1]; + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs, current_level_idx, current_level_size); + stream.synchronize(); + } - std::for_each(data_page_mask_tasks.begin(), data_page_mask_tasks.end(), [&](auto& task) { - auto col_data_page_mask = 
std::move(task).get(); - data_page_mask.insert(data_page_mask.end(), - std::make_move_iterator(col_data_page_mask.begin()), - std::make_move_iterator(col_data_page_mask.end())); - }); + auto const num_ranges = page_row_offsets.size() - 1; + rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); + auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); + { + cudf::detail::grid_1d config(num_ranges, 256, 1); + probe_hierarchical_masks_kernel<<>>( + device_level_ptrs, page_offsets, device_data_page_mask.data()); + } - CUDF_EXPECTS( - total_surviving_pages <= total_pages, - "Number of surviving pages must be less than or equal to the total number of input pages"); + auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + std::vector data_page_mask{}; + data_page_mask.reserve(total_pages); + thrust::host_vector gather_mask(num_ranges, true); + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + [&](auto const range_idx) { + if (page_row_offsets[range_idx] < page_row_offsets[range_idx + 1]) { + data_page_mask.push_back(host_results[range_idx]); + } + }); return data_page_mask; } diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 3dfb8e69d51..0aa3d58a75e 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -146,7 +146,8 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil std::tuple, size_type, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx) + cudf::size_type schema_idx, + cudf::size_type row_mask_offset) { // Compute total number of row groups auto const total_row_groups = @@ -156,7 +157,7 @@ std::tuple, size_type, size_type> compute_page_row_offset [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); std::vector page_row_offsets; - page_row_offsets.push_back(0); + page_row_offsets.push_back(row_mask_offset); size_type max_page_size = 0; size_type num_pages = 0; @@ -232,4 +233,27 @@ rmm::device_uvector make_page_indices_async( return page_indices; } +std::pair, size_type> compute_row_mask_levels(cudf::size_type num_rows, + cudf::size_type max_page_size) +{ + std::vector level_offsets; + level_offsets.push_back(0); + + size_t current_size = (num_rows + 1) / 2; + size_t current_level = 1; + + while (current_size > 0) { + size_t block_size = 1ULL << current_level; + + level_offsets.push_back(current_size); + current_size += num_rows; + + if (std::cmp_greater_equal(block_size, max_page_size)) { break; } + + current_size = (current_size + 1) / 2; + current_level++; + } + return {std::move(level_offsets), current_size}; +} + } // namespace cudf::io::parquet::experimental::detail \ No newline at end of file diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index f53bede91c7..ef29b9be9a1 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -68,14 +68,15 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @param stream CUDA stream + * @param row_mask_offset 
Offset of the row mask * @return Tuple of page row offsets, number of pages, and the size of the largest page in this * column */ [[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx); + size_type schema_idx, + cudf::size_type row_mask_offset); /** * @brief Make a device vector where each row contains the index of the page it belongs to @@ -92,4 +93,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil cudf::size_type total_rows, rmm::cuda_stream_view stream); +/** + * @brief Compute the levels of the row mask + * + * @param num_rows Number of rows in the row mask + * @param max_page_size Maximum page size + * @return Pair of level offsets and total levels size + */ +[[nodiscard]] std::pair, size_type> compute_row_mask_levels( + cudf::size_type num_rows, cudf::size_type max_page_size); + } // namespace cudf::io::parquet::experimental::detail \ No newline at end of file From de651839bf3c7c064b6cf93c119e35c8b1a79f08 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 00:28:26 +0000 Subject: [PATCH 05/25] Minor improvements --- .../parquet/experimental/page_index_filter.cu | 323 +++++++++--------- 1 file changed, 154 insertions(+), 169 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 450dd2a3de7..2534a0f05e4 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -395,6 +395,139 @@ struct page_stats_caster : public stats_caster_base { } }; +/** + * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. + * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. + * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. + * Cooperative Groups is used here for robust global thread ID calculation. + */ +CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, + size_type current_level, + size_t current_level_size) // Size of the source level +{ + // Use Cooperative Groups to get the global thread index (tid) + auto tid = cudf::detail::grid_1d::global_thread_id(); + + auto const current_level_ptr = level_ptrs[current_level]; + auto next_level_ptr = level_ptrs[current_level + 1]; + + // The reduction only needs to run for half the size of the current level + size_t next_level_size = (current_level_size + 1) / 2; + + if (tid < next_level_size) { + size_t idx1 = tid * 2; + size_t idx2 = tid * 2 + 1; + + // Perform the logical OR reduction and write to the next level's location + next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; + } + + // Handle the odd-sized remaining element if current_level_size is odd + if (current_level_size % 2 != 0 && tid == next_level_size) { + // The last element is carried forward (ORed with false) + next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; + } +} + +/** + * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. + * One thread handles the binary decomposition and query for one range [M, N). + * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of + * Level k mask. + * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). + * @param num_ranges The number of ranges to process. 
+ * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in + * the range). + */ +CUDF_KERNEL void probe_hierarchical_masks_kernel( + cudf::device_span level_ptrs, // Pointers to start of each mask level + cudf::device_span page_offsets, // Range boundary array + bool* results) +{ + auto const num_ranges = page_offsets.size() - 1; + size_t range_idx = cudf::detail::grid_1d::global_thread_id(); + + if (range_idx >= num_ranges) { return; } + + // Initialize result for this range to false (assuming no set bit) + results[range_idx] = false; + + // Retrieve M and N for the current range [M, N) + size_type M = page_offsets[range_idx]; + size_type N = page_offsets[range_idx + 1]; + + // If the range is empty or invalid, terminate + if (M >= N) { return; } + + // Binary Decomposition Loop + while (M < N) { + // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. + // Block size is determined by the Least Significant Bit (LSB) of M. + // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. + // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position (k). + size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) + size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k + + // 2. N Alignment: Find the largest power-of-two block that aligns N down. + // N & -N gives the LSB block size *if* N were the start, but we use it as the largest + // possible size that evenly divides N. + size_t n_block_size = N & -N; + + // The largest block size we can consume from the current range [M, N) + size_t max_block_size = 0; + size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) + size_t mask_index = 0; + + // --- Core Decomposition Logic --- + + // Check the M side alignment block: [M, M + m_block_size) + // This is only valid if M + m_block_size <= N (the block fits). + if (M > 0 && M + m_block_size <= N) { + max_block_size = m_block_size; + mask_level = m_lsb_position - 1; + mask_index = M >> mask_level; // M / 2^k + } + + // Check the N side alignment block: [N - n_block_size, N) + // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M + // block. 
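+    // (Illustration with assumed values: for N = 44 = 0b101100, N & -N
+    //  isolates the lowest set bit, 0b100 = 4, and __ffs(44) returns its
+    //  1-based position 3, so 1 << (__ffs(44) - 1) recovers the same 4.
+    //  Note that __ffs operates on 32-bit values; a 64-bit operand would
+    //  need __ffsll, though these block sizes derive from size_type row
+    //  counts and fit comfortably in 32 bits.)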
+ if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { + // If the N block is larger or we are at the end, prioritize the N block + max_block_size = n_block_size; + mask_level = __ffs(n_block_size) - 1; + mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k + } + + // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) + // If max_block_size is 0 or too large, reduce by 1 row (Level 0) + if (max_block_size == 0 || max_block_size > N - M) { + max_block_size = 1; + mask_level = 0; + mask_index = M; // Level 0 index is just M + } + + // --- Query Mask and Advance --- + + // Look up the mask value at the determined level and index + if (level_ptrs[mask_level][mask_index]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Advance M or N based on which block was consumed (whichever has the smaller index) + if (mask_level == 0) { + // Consumed a single row + M += max_block_size; + } else if (M == mask_index * max_block_size) { + // Consumed an M-aligned block (moving M up) + M += max_block_size; + } else { + // Consumed an N-aligned block (moving N down) + N -= max_block_size; + } + } +} + } // namespace std::unique_ptr aggregate_reader_metadata::build_row_mask_with_page_index_stats( @@ -488,147 +621,6 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag return cudf::detail::compute_column(stats_table, stats_expr.get_stats_expr().get(), stream, mr); } -/** - * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. - * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. - * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. - * Cooperative Groups is used here for robust global thread ID calculation. - */ -__global__ void reduce_or_kernel(cudf::device_span level_ptrs, - size_type current_level, - size_t current_level_size) // Size of the source level -{ - // Use Cooperative Groups to get the global thread index (tid) - auto tid = cudf::detail::grid_1d::global_thread_id(); - - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; - - // The reduction only needs to run for half the size of the current level - size_t next_level_size = (current_level_size + 1) / 2; - - if (tid < next_level_size) { - size_t idx1 = tid * 2; - size_t idx2 = tid * 2 + 1; - - // Perform the logical OR reduction and write to the next level's location - next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; - } - - // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 != 0 && tid == next_level_size) { - // The last element is carried forward (ORed with false) - next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; - } -} - -/** - * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. - * One thread handles the binary decomposition and query for one range [M, N). - * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of - * Level k mask. - * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). - * @param num_ranges The number of ranges to process. - * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in - * the range). 
- */ -__global__ void probe_hierarchical_masks_kernel( - cudf::device_span level_ptrs, // Pointers to start of each mask level - cudf::device_span page_offsets, // Range boundary array - bool* results) -{ - auto const num_ranges = page_offsets.size() - 1; - size_t range_idx = cudf::detail::grid_1d::global_thread_id(); - - if (range_idx > num_ranges) { return; } - - // Initialize result for this range to false (assuming no set bit) - results[range_idx] = false; - - // Retrieve M and N for the current range [M, N) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; - - // If the range is empty or invalid, terminate - if (M >= N) { return; } - - // Binary Decomposition Loop - while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; - - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; - - // --- Decision Logic: Which side to consume? --- - - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; - - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; - - // Block 3: The remaining central range block - - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) - - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; - - // Calculate mask index: The starting point M is divided by the block size. - size_t mask_idx = M / block1_size; - - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position - - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; - - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; - - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). - - // Prioritize M (1-row check) or N (1-row check) until they meet. 
- - // Check single row at M (Level 0) - if (level_ptrs[0][M]) { - results[range_idx] = true; - return; - } - M++; - } - } -} - template std::vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, @@ -750,27 +742,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( [&](auto const level_idx) { host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; }); - auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - - { - auto const next_level_size = level_offsets[1]; - auto const current_level_size = row_mask.size(); - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs, 0, current_level_size); - stream.synchronize(); - } - for (size_t current_level_idx = 1; current_level_idx < total_levels - 1; current_level_idx++) { - auto const next_level_size = - level_offsets[current_level_idx + 1] - level_offsets[current_level_idx]; - auto const current_level_size = - level_offsets[current_level_idx] - level_offsets[current_level_idx - 1]; - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs, current_level_idx, current_level_size); - stream.synchronize(); - } + auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); + auto current_level_size = row_mask.size(); + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(total_levels - 1), + [&](auto const level) { + auto const next_level_size = level_offsets[level + 1] - level_offsets[level]; + cudf::detail::grid_1d config(next_level_size, 256, 1); + reduce_or_kernel<<>>( + device_level_ptrs.data(), level, current_level_size); + current_level_size = next_level_size; + }); auto const num_ranges = page_row_offsets.size() - 1; rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); @@ -784,18 +768,19 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_level_ptrs, page_offsets, device_data_page_mask.data()); } - auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); + auto host_results_iter = host_results.begin(); std::vector data_page_mask{}; data_page_mask.reserve(total_pages); - thrust::host_vector gather_mask(num_ranges, true); std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_ranges), - [&](auto const range_idx) { - if (page_row_offsets[range_idx] < page_row_offsets[range_idx + 1]) { - data_page_mask.push_back(host_results[range_idx]); - } + thrust::counting_iterator(num_columns), + [&](auto col_idx) { + auto const col_num_pages = + col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; + data_page_mask.insert( + data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); + std::advance(host_results_iter, col_num_pages + 1); }); - return data_page_mask; } From f6b79c797371d94432fe0d90dcd8423dccedcaa1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 00:31:53 +0000 Subject: [PATCH 06/25] Add cmake changes --- cpp/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3eb82a113f..c189575c4ed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -546,6 +546,7 @@ add_library( src/io/parquet/experimental/hybrid_scan_impl.cpp src/io/parquet/experimental/hybrid_scan_preprocess.cu src/io/parquet/experimental/page_index_filter.cu + 
src/io/parquet/experimental/page_index_filter_utils.cu src/io/parquet/page_data.cu src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu From aef9d646558d97ab3739831dbc26c1e2e48bc3b4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 01:35:07 +0000 Subject: [PATCH 07/25] Revert to old probe impl --- .../parquet/experimental/page_index_filter.cu | 129 ++++++++++-------- 1 file changed, 69 insertions(+), 60 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 2534a0f05e4..36983d706a0 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -461,69 +461,77 @@ CUDF_KERNEL void probe_hierarchical_masks_kernel( // Binary Decomposition Loop while (M < N) { - // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. - // Block size is determined by the Least Significant Bit (LSB) of M. - // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. - // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position (k). - size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) - size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k - - // 2. N Alignment: Find the largest power-of-two block that aligns N down. - // N & -N gives the LSB block size *if* N were the start, but we use it as the largest - // possible size that evenly divides N. - size_t n_block_size = N & -N; - - // The largest block size we can consume from the current range [M, N) - size_t max_block_size = 0; - size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) - size_t mask_index = 0; - - // --- Core Decomposition Logic --- - - // Check the M side alignment block: [M, M + m_block_size) - // This is only valid if M + m_block_size <= N (the block fits). - if (M > 0 && M + m_block_size <= N) { - max_block_size = m_block_size; - mask_level = m_lsb_position - 1; - mask_index = M >> mask_level; // M / 2^k - } + // 1. Calculate the largest power of 2 that can align M up to the boundary. + // This is determined by the Least Significant Bit (LSB) of M. + // If M=0, LSB is usually defined as the full size, but here M is typically > 0 + // or we handle M=0 implicitly by the full range check. + // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). + size_t m_lsb_block_size = (M == 0) ? N : (M & -M); + size_t m_next_aligned = M + m_lsb_block_size; - // Check the N side alignment block: [N - n_block_size, N) - // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M - // block. - if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { - // If the N block is larger or we are at the end, prioritize the N block - max_block_size = n_block_size; - mask_level = __ffs(n_block_size) - 1; - mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k - } + // 2. Calculate the largest power of 2 block that can align N down to the boundary. + // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. + // The expression (N & -N) gives the block size corresponding to N's alignment. + // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). 
+ size_t n_lsb_block_size = N & -N; - // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) - // If max_block_size is 0 or too large, reduce by 1 row (Level 0) - if (max_block_size == 0 || max_block_size > N - M) { - max_block_size = 1; - mask_level = 0; - mask_index = M; // Level 0 index is just M - } + // --- Decision Logic: Which side to consume? --- - // --- Query Mask and Advance --- + // Block 1: M-aligned block (from M up to m_next_aligned) + size_t block1_size = m_next_aligned - M; - // Look up the mask value at the determined level and index - if (level_ptrs[mask_level][mask_index]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } + // Block 2: N-aligned block (from N - n_lsb_block_size up to N) + size_t block2_size = n_lsb_block_size; + + // Block 3: The remaining central range block + + if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { + // If the M-aligned block is fully contained in the range [M, N) + + // Check if block1_size is 2^k. k = log2(block1_size). + // Since block1_size is based on LSB, it is always a power of 2. + size_t k1 = __ffs(block1_size) - 1; + + // Calculate mask index: The starting point M is divided by the block size. + size_t mask_idx = M / block1_size; + + // Look up the mask value + if (level_ptrs[k1][mask_idx]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Advance M + M = m_next_aligned; + } else if (block2_size > 0 && N - block2_size >= M) { + // If the N-aligned block is fully contained and does not overlap M's new position - // Advance M or N based on which block was consumed (whichever has the smaller index) - if (mask_level == 0) { - // Consumed a single row - M += max_block_size; - } else if (M == mask_index * max_block_size) { - // Consumed an M-aligned block (moving M up) - M += max_block_size; + // Check if block2_size is 2^k. k = log2(block2_size). + size_t k2 = __ffs(block2_size) - 1; + + // Calculate mask index + size_t mask_idx = (N - block2_size) / block2_size; + + // Look up the mask value + if (level_ptrs[k2][mask_idx]) { + results[range_idx] = true; + return; // Found a set bit, terminate for this range + } + + // Backtrack N + N = N - block2_size; } else { - // Consumed an N-aligned block (moving N down) - N -= max_block_size; + // The remaining range is unaligned and small (or just 1 element). + // This happens when M and N are close and unaligned (e.g., [11, 13]). + + // Prioritize M (1-row check) or N (1-row check) until they meet. 
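+      // (Tracing [11, 13): the M-aligned branch above probes level 0 at
+      //  index 11 and advances M to 12; the N-aligned branch then probes
+      //  level 0 at index 12 and lowers N to 12, ending the loop. The
+      //  single-row walk below is a safety net for any remainder the two
+      //  aligned branches cannot consume.)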
+ + // Check single row at M (Level 0) + if (level_ptrs[0][M]) { + results[range_idx] = true; + return; + } + M++; } } } @@ -768,10 +776,11 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_level_ptrs, page_offsets, device_data_page_mask.data()); } - auto host_results = cudf::detail::make_host_vector(device_data_page_mask, stream); - auto host_results_iter = host_results.begin(); + auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; data_page_mask.reserve(total_pages); + stream.synchronize(); + auto host_results_iter = host_results.begin(); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { @@ -779,7 +788,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; data_page_mask.insert( data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); - std::advance(host_results_iter, col_num_pages + 1); + host_results_iter += col_num_pages + 1; }); return data_page_mask; } From ebf460a620c463f805c4ce2c6d14406a4d777ff5 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Thu, 16 Oct 2025 01:43:26 +0000 Subject: [PATCH 08/25] style fix --- cpp/src/io/parquet/experimental/page_index_filter_utils.cu | 2 +- cpp/src/io/parquet/experimental/page_index_filter_utils.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 0aa3d58a75e..ee12761dcd2 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -256,4 +256,4 @@ std::pair, size_type> compute_row_mask_levels(cudf::size_ return {std::move(level_offsets), current_size}; } -} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file +} // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index ef29b9be9a1..03c857c2533 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -103,4 +103,4 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil [[nodiscard]] std::pair, size_type> compute_row_mask_levels( cudf::size_type num_rows, cudf::size_type max_page_size); -} // namespace cudf::io::parquet::experimental::detail \ No newline at end of file +} // namespace cudf::io::parquet::experimental::detail From c62f6b794b8af95cbe338a1364b5f53967cfaedf Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:10:14 +0000 Subject: [PATCH 09/25] Use functors --- .../parquet/experimental/page_index_filter.cu | 224 ++++++++---------- .../experimental/page_index_filter_utils.cu | 7 +- 2 files changed, 108 insertions(+), 123 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 36983d706a0..cba29876774 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -401,33 +402,28 @@ struct page_stats_caster : public stats_caster_base { * CUB's BlockReduce is unsuitable here as it 
reduces a block to a single element. * Cooperative Groups is used here for robust global thread ID calculation. */ -CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, - size_type current_level, - size_t current_level_size) // Size of the source level -{ - // Use Cooperative Groups to get the global thread index (tid) - auto tid = cudf::detail::grid_1d::global_thread_id(); - - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; - - // The reduction only needs to run for half the size of the current level - size_t next_level_size = (current_level_size + 1) / 2; - - if (tid < next_level_size) { - size_t idx1 = tid * 2; - size_t idx2 = tid * 2 + 1; +struct compute_next_level_functor { + cudf::device_span const level_ptrs; + cudf::size_type const current_level; + cudf::size_type const current_level_size; + cudf::size_type const next_level_size; - // Perform the logical OR reduction and write to the next level's location - next_level_ptr[tid] = current_level_ptr[idx1] || current_level_ptr[idx2]; - } + __device__ void operator()(cudf::size_type next_level_index) const noexcept + { + auto const current_level_ptr = level_ptrs.begin()[current_level]; + auto next_level_ptr = level_ptrs.begin()[current_level + 1]; - // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 != 0 && tid == next_level_size) { - // The last element is carried forward (ORed with false) - next_level_ptr[tid] = current_level_ptr[current_level_size - 1]; + // Handle the odd-sized remaining element if current_level_size is odd + if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { + // The last element is carried forward (ORed with false) + next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; + } else { + // Perform the logical OR reduction and write to the next level's location + next_level_ptr[next_level_index] = + current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; + } } -} +}; /** * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. @@ -439,102 +435,94 @@ CUDF_KERNEL void reduce_or_kernel(bool** const level_ptrs, * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in * the range). */ -CUDF_KERNEL void probe_hierarchical_masks_kernel( - cudf::device_span level_ptrs, // Pointers to start of each mask level - cudf::device_span page_offsets, // Range boundary array - bool* results) -{ - auto const num_ranges = page_offsets.size() - 1; - size_t range_idx = cudf::detail::grid_1d::global_thread_id(); +struct probe_masks_functor { + cudf::device_span level_ptrs; + cudf::device_span page_offsets; - if (range_idx >= num_ranges) { return; } + __device__ bool operator()(cudf::size_type range_idx) const noexcept + { + auto const num_ranges = page_offsets.size() - 1; - // Initialize result for this range to false (assuming no set bit) - results[range_idx] = false; + // Retrieve M and N for the current range [M, N) + size_type M = page_offsets[range_idx]; + size_type N = page_offsets[range_idx + 1]; - // Retrieve M and N for the current range [M, N) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; + // If the range is empty or invalid, terminate + if (M >= N) { return false; } - // If the range is empty or invalid, terminate - if (M >= N) { return; } + // Binary Decomposition Loop + while (M < N) { + // 1. 
Calculate the largest power of 2 that can align M up to the boundary. + // This is determined by the Least Significant Bit (LSB) of M. + // If M=0, LSB is usually defined as the full size, but here M is typically > 0 + // or we handle M=0 implicitly by the full range check. + // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). + size_t m_lsb_block_size = (M == 0) ? N : (M & -M); + size_t m_next_aligned = M + m_lsb_block_size; - // Binary Decomposition Loop - while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; + // 2. Calculate the largest power of 2 block that can align N down to the boundary. + // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. + // The expression (N & -N) gives the block size corresponding to N's alignment. + // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). + size_t n_lsb_block_size = N & -N; - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; + // --- Decision Logic: Which side to consume? --- - // --- Decision Logic: Which side to consume? --- + // Block 1: M-aligned block (from M up to m_next_aligned) + size_t block1_size = m_next_aligned - M; - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; + // Block 2: N-aligned block (from N - n_lsb_block_size up to N) + size_t block2_size = n_lsb_block_size; - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; + // Block 3: The remaining central range block - // Block 3: The remaining central range block + if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { + // If the M-aligned block is fully contained in the range [M, N) - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) + // Check if block1_size is 2^k. k = log2(block1_size). + // Since block1_size is based on LSB, it is always a power of 2. + size_t k1 = __ffs(block1_size) - 1; - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; + // Calculate mask index: The starting point M is divided by the block size. + size_t mask_idx = M / block1_size; - // Calculate mask index: The starting point M is divided by the block size. 
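+        // (e.g., M = 24 with block1_size = 8 = 2^3 gives k1 = 3 and
+        //  mask_idx = 24 / 8 = 3, the level-3 entry summarizing rows [24, 32))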
- size_t mask_idx = M / block1_size; + // Look up the mask value + if (level_ptrs[k1][mask_idx]) { + return true; // Found a set bit, terminate for this range + } - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } + // Advance M + M = m_next_aligned; + } else if (block2_size > 0 && N - block2_size >= M) { + // If the N-aligned block is fully contained and does not overlap M's new position - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position + // Check if block2_size is 2^k. k = log2(block2_size). + size_t k2 = __ffs(block2_size) - 1; - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; + // Calculate mask index + size_t mask_idx = (N - block2_size) / block2_size; - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; + // Look up the mask value + if (level_ptrs[k2][mask_idx]) { + return true; // Found a set bit, terminate for this range + } - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - results[range_idx] = true; - return; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). + // Backtrack N + N = N - block2_size; + } else { + // The remaining range is unaligned and small (or just 1 element). + // This happens when M and N are close and unaligned (e.g., [11, 13]). - // Prioritize M (1-row check) or N (1-row check) until they meet. + // Prioritize M (1-row check) or N (1-row check) until they meet. 
- // Check single row at M (Level 0) - if (level_ptrs[0][M]) { - results[range_idx] = true; - return; + // Check single row at M (Level 0) + if (level_ptrs[0][M]) { return true; } + M++; } - M++; } + return false; } -} +}; } // namespace @@ -739,42 +727,38 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const mr = cudf::get_current_device_resource_ref(); auto const [level_offsets, total_levels_size] = compute_row_mask_levels(row_mask.size(), max_page_size); - auto const total_levels = level_offsets.size(); + auto const num_levels = static_cast(level_offsets.size()); auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); - auto host_level_ptrs = cudf::detail::make_host_vector(total_levels, stream); + auto host_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; - std::for_each(thrust::counting_iterator(1), - thrust::counting_iterator(total_levels), - [&](auto const level_idx) { - host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; - }); + std::for_each( + thrust::counting_iterator(1), thrust::counting_iterator(num_levels), [&](auto const level_idx) { + host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; + }); auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); auto current_level_size = row_mask.size(); std::for_each( - thrust::counting_iterator(0), - thrust::counting_iterator(total_levels - 1), - [&](auto const level) { - auto const next_level_size = level_offsets[level + 1] - level_offsets[level]; - cudf::detail::grid_1d config(next_level_size, 256, 1); - reduce_or_kernel<<>>( - device_level_ptrs.data(), level, current_level_size); + thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { + auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + compute_next_level_functor{device_level_ptrs, level, current_level_size, next_level_size}); current_level_size = next_level_size; }); - auto const num_ranges = page_row_offsets.size() - 1; + auto const num_ranges = static_cast(page_row_offsets.size() - 1); rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - { - cudf::detail::grid_1d config(num_ranges, 256, 1); - probe_hierarchical_masks_kernel<<>>( - device_level_ptrs, page_offsets, device_data_page_mask.data()); - } + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + device_data_page_mask.begin(), + probe_masks_functor{device_level_ptrs, page_offsets}); auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index ee12761dcd2..d64d3eaf313 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -18,6 +18,7 @@ #include "page_index_filter_utils.hpp" #include +#include #include #include @@ -239,18 +240,18 @@ std::pair, size_type> compute_row_mask_levels(cudf::size_ std::vector level_offsets; 
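   // (e.g., with num_rows = 10 and max_page_size = 4 the loop below records
   //  the cumulative offsets [0, 5, 8]: level 1 holds 5 pair-wise ORs of the
   //  10 rows, level 2 holds ceil(5 / 2) = 3, and iteration stops once the
   //  block size reaches max_page_size.)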
level_offsets.push_back(0); - size_t current_size = (num_rows + 1) / 2; + size_t current_size = cudf::util::div_rounding_up_unsafe(num_rows, 2); size_t current_level = 1; while (current_size > 0) { - size_t block_size = 1ULL << current_level; + size_t block_size = size_t{1} << current_level; level_offsets.push_back(current_size); current_size += num_rows; if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - current_size = (current_size + 1) / 2; + current_size = cudf::util::div_rounding_up_unsafe(current_size, 2); current_level++; } return {std::move(level_offsets), current_size}; From f9e8f85d4100b7dff87a2021d431bafd8f5541e4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:12:48 +0000 Subject: [PATCH 10/25] Optimize for the chunked reader --- cpp/src/io/parquet/experimental/page_index_filter.cu | 12 +++++------- .../parquet/experimental/page_index_filter_utils.hpp | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index cba29876774..35a47c3f3c4 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -674,7 +674,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const schema_idx = column_schema_indices.front(); size_type col_num_pages = 0; std::tie(page_row_offsets, col_num_pages, max_page_size) = - compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx, row_mask_offset); + compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); // Add 1 to include the the 0th page's offset for each column col_page_offsets.emplace_back(col_num_pages + 1); } else { @@ -687,10 +687,8 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( [&](auto const col_idx) { page_row_offsets_tasks.emplace_back( cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - return compute_page_row_offsets(per_file_metadata, - row_group_indices, - column_schema_indices[col_idx], - row_mask_offset); + return compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx]); })); }); @@ -726,7 +724,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( auto const mr = cudf::get_current_device_resource_ref(); auto const [level_offsets, total_levels_size] = - compute_row_mask_levels(row_mask.size(), max_page_size); + compute_row_mask_levels(total_rows, max_page_size); auto const num_levels = static_cast(level_offsets.size()); auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); @@ -739,7 +737,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( }); auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); - auto current_level_size = row_mask.size(); + auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 03c857c2533..c84746e07ac 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -76,7 +76,7 @@ compute_page_row_counts_and_offsets(cudf::host_span 
per_fil cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, size_type schema_idx, - cudf::size_type row_mask_offset); + cudf::size_type row_mask_offset = 0); /** * @brief Make a device vector where each row contains the index of the page it belongs to From 9ea83f5fc4ff22b98e2c038f95ab95abf792c8e4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:56:10 +0000 Subject: [PATCH 11/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 35a47c3f3c4..6b948e8f3ea 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -403,15 +403,15 @@ struct page_stats_caster : public stats_caster_base { * Cooperative Groups is used here for robust global thread ID calculation. */ struct compute_next_level_functor { - cudf::device_span const level_ptrs; + bool** const level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; cudf::size_type const next_level_size; __device__ void operator()(cudf::size_type next_level_index) const noexcept { - auto const current_level_ptr = level_ptrs.begin()[current_level]; - auto next_level_ptr = level_ptrs.begin()[current_level + 1]; + auto const current_level_ptr = level_ptrs[current_level]; + auto next_level_ptr = level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { @@ -436,13 +436,12 @@ struct compute_next_level_functor { * the range). 
*/ struct probe_masks_functor { - cudf::device_span level_ptrs; - cudf::device_span page_offsets; + bool** const level_ptrs; + cudf::size_type const* const page_offsets; + cudf::size_type const num_ranges; __device__ bool operator()(cudf::size_type range_idx) const noexcept { - auto const num_ranges = page_offsets.size() - 1; - // Retrieve M and N for the current range [M, N) size_type M = page_offsets[range_idx]; size_type N = page_offsets[range_idx + 1]; @@ -741,11 +740,11 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); - thrust::for_each( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(next_level_size), - compute_next_level_functor{device_level_ptrs, level, current_level_size, next_level_size}); + thrust::for_each(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + compute_next_level_functor{ + device_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); @@ -756,7 +755,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( thrust::counting_iterator(0), thrust::counting_iterator(num_ranges), device_data_page_mask.begin(), - probe_masks_functor{device_level_ptrs, page_offsets}); + probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); std::vector data_page_mask{}; From 38a640bc70b1ea44deefc2778e1bf54a7d8e5183 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 05:24:40 +0000 Subject: [PATCH 12/25] Small optimizations --- .../experimental/hybrid_scan_chunking.cu | 10 +++--- .../experimental/hybrid_scan_helpers.hpp | 2 +- .../parquet/experimental/hybrid_scan_impl.cpp | 16 ++++----- .../parquet/experimental/hybrid_scan_impl.hpp | 12 +++---- .../experimental/hybrid_scan_preprocess.cu | 2 +- .../parquet/experimental/page_index_filter.cu | 36 +++++++++---------- .../experimental/page_index_filter_utils.cu | 34 +++++++----------- .../experimental/page_index_filter_utils.hpp | 4 +-- 8 files changed, 51 insertions(+), 65 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu index 7e824a89c60..41129c6b5b8 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_chunking.cu @@ -41,9 +41,10 @@ namespace cudf::io::parquet::experimental::detail { using parquet::detail::ColumnChunkDesc; using parquet::detail::pass_intermediate_data; -void hybrid_scan_reader_impl::handle_chunking(read_mode mode, - std::vector column_chunk_buffers, - std::vector const& data_page_mask) +void hybrid_scan_reader_impl::handle_chunking( + read_mode mode, + std::vector&& column_chunk_buffers, + cudf::host_span data_page_mask) { // if this is our first time in here, setup the first pass. 
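   // (On the signature above: the chunk buffers arrive by rvalue reference and
   //  are consumed by this call, while the page mask is only read and is
   //  borrowed as a non-owning host_span, so neither is copied per pass.)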
if (!_pass_itm_data) { @@ -87,7 +88,8 @@ void hybrid_scan_reader_impl::handle_chunking(read_mode mode, setup_next_subpass(mode); } -void hybrid_scan_reader_impl::setup_next_pass(std::vector column_chunk_buffers) +void hybrid_scan_reader_impl::setup_next_pass( + std::vector&& column_chunk_buffers) { auto const num_passes = _file_itm_data.num_passes(); CUDF_EXPECTS(num_passes == 1, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 6ad25641407..eaa8860ec98 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -294,7 +294,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * the output table based on the input row mask across all input columns */ template - [[nodiscard]] std::vector compute_data_page_mask( + [[nodiscard]] cudf::detail::host_vector compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 1915cce1073..266f6288424 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -455,7 +455,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, stream); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -485,7 +485,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, stream); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -524,7 +524,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, _stream); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -575,7 +575,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : std::vector{}; + : cudf::detail::make_empty_host_vector(0, _stream); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -656,7 +656,7 @@ void hybrid_scan_reader_impl::prepare_data( read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - std::vector const& data_page_mask) + cudf::host_span data_page_mask) { // if we have not preprocessed at the whole-file level, do that now if (not _file_preprocessed) { @@ -885,7 +885,7 @@ table_with_metadata hybrid_scan_reader_impl::finalize_output( } } -void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_page_mask) +void hybrid_scan_reader_impl::set_pass_page_mask(cudf::host_span data_page_mask) { auto const& pass = _pass_itm_data; auto const& chunks = pass->chunks; @@ -926,10 +926,6 @@ void hybrid_scan_reader_impl::set_pass_page_mask(std::vector const& data_p } }); - // Make sure we inserted exactly the number of data pages for this column - CUDF_EXPECTS(num_inserted_data_pages == data_page_mask.size(), - "Encountered mismatch in number of data pages and page mask size"); - // Make sure we inserted exactly the number of pages for this pass CUDF_EXPECTS(_pass_page_mask.size() == pass->pages.size(), "Encountered mismatch in number of pass pages and page mask size"); diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp index 9fbeea9a58d..9c6fd5cca45 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.hpp @@ -266,7 +266,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param data_page_mask Input data page mask from page-pruning step */ - void set_pass_page_mask(std::vector const& data_page_mask); + void set_pass_page_mask(cudf::host_span data_page_mask); /** * @brief Select the columns to be read based on the read mode @@ -301,7 +301,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { void prepare_data(read_mode mode, cudf::host_span const> row_group_indices, std::vector&& column_chunk_buffers, - std::vector const& data_page_mask); + cudf::host_span data_page_mask); /** * @brief Create descriptors for filter column chunks and decode dictionary page headers @@ -342,8 +342,8 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * @param data_page_mask Input data page mask from page-pruning step for the current pass */ void handle_chunking(read_mode mode, - std::vector column_chunk_buffers, - std::vector const& data_page_mask); + std::vector&& column_chunk_buffers, + cudf::host_span data_page_mask); /** * @brief Setup step for the next input read pass. @@ -353,7 +353,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param column_chunk_buffers Device buffers containing column chunk data */ - void setup_next_pass(std::vector column_chunk_buffers); + void setup_next_pass(std::vector&& column_chunk_buffers); /** * @brief Setup pointers to columns chunks to be processed for this pass. 
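The switch from std::vector<bool> to cudf::detail::host_vector<bool> in these interfaces also makes the mask storage pinned, which is what allows the device-to-host copy of the probe results to be issued asynchronously. A sketch of the pattern, reusing the calls that appear elsewhere in this series (the surrounding scaffolding is illustrative):

  // Pinned host buffer filled by an async D2H copy; valid only after sync
  auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream);
  // Host-side setup that does not touch host_results can overlap the copy
  auto data_page_mask = cudf::detail::make_empty_host_vector<bool>(total_pages, stream);
  stream.synchronize();  // host_results may be read from here on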
@@ -369,7 +369,7 @@ class hybrid_scan_reader_impl : public parquet::detail::reader_impl { * * @param column_chunk_buffers Device buffers containing column chunk data */ - void setup_compressed_data(std::vector column_chunk_buffers); + void setup_compressed_data(std::vector&& column_chunk_buffers); /** * @brief Reset the internal state of the reader. diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu index 5892cfb0fa9..2798827a234 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu +++ b/cpp/src/io/parquet/experimental/hybrid_scan_preprocess.cu @@ -183,7 +183,7 @@ bool hybrid_scan_reader_impl::setup_column_chunks() } void hybrid_scan_reader_impl::setup_compressed_data( - std::vector column_chunk_buffers) + std::vector&& column_chunk_buffers) { auto& pass = *_pass_itm_data; diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6b948e8f3ea..c28731dc689 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -539,8 +539,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag if (row_group_indices.empty()) { return cudf::make_empty_column(cudf::type_id::BOOL8); } // Check if we have page index for all columns in all row groups - auto const has_page_index = - compute_has_page_index(per_file_metadata, row_group_indices, output_column_schemas); + auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return if page index is not present CUDF_EXPECTS(has_page_index, @@ -617,7 +616,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag } template -std::vector aggregate_reader_metadata::compute_data_page_mask( +cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, @@ -637,7 +636,7 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( row_mask.template begin() + row_mask_offset, row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return {}; + return cudf::detail::make_empty_host_vector(0, stream); } CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(), @@ -652,13 +651,13 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { return col.schema_idx; }); - auto const has_page_index = - compute_has_page_index(per_file_metadata, row_group_indices, column_schema_indices); + auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return early if page index is not present if (not has_page_index) { CUDF_LOG_WARN("Encountered missing Parquet page index for one or more output columns"); - return {}; // An empty data page mask indicates all pages are required + return cudf::detail::make_empty_host_vector( + 0, stream); // An empty data page mask indicates all pages are required } // Compute page row offsets and column chunk page offsets for each column @@ -757,11 +756,10 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( device_data_page_mask.begin(), probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); - auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); - std::vector data_page_mask{}; - 
data_page_mask.reserve(total_pages); - stream.synchronize(); + auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); + auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); + stream.synchronize(); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { @@ -775,14 +773,14 @@ std::vector aggregate_reader_metadata::compute_data_page_mask( } // Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view -template std::vector aggregate_reader_metadata::compute_data_page_mask( - cudf::column_view const& row_mask, - cudf::host_span const> row_group_indices, - cudf::host_span input_columns, - cudf::size_type row_mask_offset, - rmm::cuda_stream_view stream) const; - -template std::vector aggregate_reader_metadata::compute_data_page_mask< +template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< + cudf::column_view>(cudf::column_view const& row_mask, + cudf::host_span const> row_group_indices, + cudf::host_span input_columns, + cudf::size_type row_mask_offset, + rmm::cuda_stream_view stream) const; + +template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index d64d3eaf313..59c2e76c523 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -35,30 +35,22 @@ namespace cudf::io::parquet::experimental::detail { bool compute_has_page_index(cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices) + cudf::host_span const> row_group_indices) { - // For all output columns, check all parquet data sources + // For all parquet data sources return std::all_of( - column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { - // For all parquet data sources - return std::all_of( - thrust::counting_iterator(0), - thrust::counting_iterator(row_group_indices.size()), - [&](auto const src_index) { - // For all row groups in this parquet data source - auto const& rg_indices = row_group_indices[src_index]; - return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { - auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; - auto col = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; }); - // Check if the offset_index and column_index are present - return col != file_metadatas[src_index].row_groups[rg_index].columns.end() and - col->offset_index.has_value() and col->column_index.has_value(); + thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_index) { + // For all row groups in this parquet data source + auto const& rg_indices = row_group_indices[src_index]; + return std::all_of(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_index) { + auto const& row_group = file_metadatas[src_index].row_groups[rg_index]; + return std::any_of( + row_group.columns.begin(), row_group.columns.end(), [&](auto const& col) { + return 
col.offset_index.has_value() and col.column_index.has_value(); }); - }); + }); }); } diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index c84746e07ac..299639e4172 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -35,14 +35,12 @@ using metadata_base = parquet::detail::metadata; * * @param file_metadatas Span of parquet footer metadata * @param row_group_indices Span of input row group indices - * @param column_schema_indices Span of input column schema indices * @return Boolean indicating if the page index is present in all parquet data sources for all * columns */ [[nodiscard]] bool compute_has_page_index( cudf::host_span file_metadatas, - cudf::host_span const> row_group_indices, - cudf::host_span column_schema_indices); + cudf::host_span const> row_group_indices); /** * @brief Compute page row counts and page row offsets and column chunk page (count) offsets for a From 0071dc738565f13c03d018adcd88240fae364388 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 16 Oct 2025 21:09:34 +0000 Subject: [PATCH 13/25] CPU optimizations --- .../parquet/experimental/page_index_filter.cu | 74 +++++++++++-------- .../experimental/page_index_filter_utils.cu | 70 +++++++++++------- .../experimental/page_index_filter_utils.hpp | 8 +- 3 files changed, 88 insertions(+), 64 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index c28731dc689..6a8534f5c41 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include +#include namespace cudf::io::parquet::experimental::detail { @@ -668,38 +668,48 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas size_type max_page_size = 0; - if (num_columns == 1) { - auto const schema_idx = column_schema_indices.front(); - size_type col_num_pages = 0; - std::tie(page_row_offsets, col_num_pages, max_page_size) = - compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); - // Add 1 to include the the 0th page's offset for each column - col_page_offsets.emplace_back(col_num_pages + 1); + if (num_columns <= 2) { + std::for_each( + column_schema_indices.begin(), column_schema_indices.end(), [&](auto const schema_idx) { + auto [col_page_row_offsets, col_max_page_size] = + compute_page_row_offsets(per_file_metadata, row_group_indices, schema_idx); + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + col_page_offsets.emplace_back(page_row_offsets.size()); + }); } else { - std::vector, size_type, size_type>>> - page_row_offsets_tasks; - page_row_offsets_tasks.reserve(num_columns); - - std::for_each(thrust::counting_iterator(0), - thrust::counting_iterator(num_columns), - [&](auto const col_idx) { - page_row_offsets_tasks.emplace_back( - cudf::detail::host_worker_pool().submit_task([&, col_idx = col_idx] { - return compute_page_row_offsets( - per_file_metadata, row_group_indices, column_schema_indices[col_idx]); - })); - }); - - // Collect results from all tasks - std::for_each(page_row_offsets_tasks.begin(), 
page_row_offsets_tasks.end(), [&](auto& task) { - auto [col_page_row_offsets, col_num_pages, col_max_page_size] = std::move(task).get(); - page_row_offsets.insert(page_row_offsets.end(), - std::make_move_iterator(col_page_row_offsets.begin()), - std::make_move_iterator(col_page_row_offsets.end())); - max_page_size = std::max(max_page_size, col_max_page_size); - // Add 1 to include the the 0th page's offset for each column - col_page_offsets.emplace_back(col_page_offsets.back() + col_num_pages + 1); - }); + auto constexpr max_data_page_mask_threads = 2; + auto const num_threads = std::min(max_data_page_mask_threads, num_columns); + std::vector threads{}; + threads.reserve(num_threads); + std::vector, size_type>>> thread_results( + num_threads); + auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); + std::for_each( + thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { + threads.emplace_back([&, tid = tid]() { + auto const start_col = std::min(tid * cols_per_thread, num_columns); + auto const end_col = std::min(start_col + cols_per_thread, num_columns); + std::for_each(thrust::counting_iterator(start_col), + thrust::counting_iterator(end_col), + [&](auto const col_idx) { + thread_results[tid].emplace_back(compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx])); + }); + }); + }); + + std::for_each( + thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { + threads[tid].join(); + for (auto& [col_page_row_offsets, col_max_page_size] : thread_results[tid]) { + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + col_page_offsets.emplace_back(page_row_offsets.size()); + } + }); } auto const total_pages = page_row_offsets.size() - num_columns; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 59c2e76c523..aacbc860dee 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -34,6 +34,30 @@ namespace cudf::io::parquet::experimental::detail { +namespace { + +/** + * @brief Find the offset of the column chunk with the given schema index in the row group + * + * @param row_group Row group + * @param schema_idx Schema index + * @return Offset of the column chunk iterator + */ +[[nodiscard]] auto find_colchunk_iter_offset(RowGroup const& row_group, size_type schema_idx) +{ + auto const& colchunk_iter = + std::find_if(row_group.columns.begin(), row_group.columns.end(), [schema_idx](auto const& col) { + return col.schema_idx == schema_idx; + }); + CUDF_EXPECTS( + colchunk_iter != row_group.columns.end(), + "Column chunk with schema index " + std::to_string(schema_idx) + " not found in row group", + std::invalid_argument); + return std::distance(row_group.columns.begin(), colchunk_iter); +} + +} // namespace + bool compute_has_page_index(cudf::host_span file_metadatas, cudf::host_span const> row_group_indices) { @@ -89,20 +113,16 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil thrust::counting_iterator(0), thrust::counting_iterator(row_group_indices.size()), [&](auto src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; // For all column chunks in this data source + auto const& rg_indices = row_group_indices[src_idx]; + std::optional 
colchunk_iter_offset{}; std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto rg_idx) { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; - // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), row_group.columns.end(), [schema_idx](ColumnChunk const& col) { - return col.schema_idx == schema_idx; - }); - - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); + if (not colchunk_iter_offset.has_value() or + row_group.columns[colchunk_iter_offset.value()].schema_idx != schema_idx) { + colchunk_iter_offset = find_colchunk_iter_offset(row_group, schema_idx); + } + auto const& colchunk_iter = row_group.columns.begin() + colchunk_iter_offset.value(); // Compute page row counts and offsets if this column chunk has column and offset indexes if (colchunk_iter->offset_index.has_value()) { @@ -136,11 +156,10 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil std::move(page_row_counts), std::move(page_row_offsets), std::move(col_chunk_page_offsets)}; } -std::tuple, size_type, size_type> compute_page_row_offsets( +std::pair, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - cudf::size_type schema_idx, - cudf::size_type row_mask_offset) + cudf::size_type schema_idx) { // Compute total number of row groups auto const total_row_groups = @@ -150,29 +169,26 @@ std::tuple, size_type, size_type> compute_page_row_offset [](auto sum, auto const& rg_indices) { return sum + rg_indices.size(); }); std::vector page_row_offsets; - page_row_offsets.push_back(row_mask_offset); + page_row_offsets.push_back(0); size_type max_page_size = 0; - size_type num_pages = 0; std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(row_group_indices.size()), [&](auto const src_idx) { - auto const& rg_indices = row_group_indices[src_idx]; // For all row groups in this source + auto const& rg_indices = row_group_indices[src_idx]; + std::optional colchunk_iter_offset{}; std::for_each(rg_indices.begin(), rg_indices.end(), [&](auto const& rg_idx) { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; // Find the column chunk with the given schema index - auto colchunk_iter = std::find_if( - row_group.columns.begin(), - row_group.columns.end(), - [schema_idx](auto const& col) { return col.schema_idx == schema_idx; }); - CUDF_EXPECTS(colchunk_iter != row_group.columns.end(), - "Column chunk with schema index " + std::to_string(schema_idx) + - " not found in row group", - std::invalid_argument); + if (not colchunk_iter_offset.has_value() or + row_group.columns[colchunk_iter_offset.value()].schema_idx != schema_idx) { + colchunk_iter_offset = find_colchunk_iter_offset(row_group, schema_idx); + } + auto const& colchunk_iter = + row_group.columns.begin() + colchunk_iter_offset.value(); auto const& offset_index = colchunk_iter->offset_index.value(); auto const row_group_num_pages = offset_index.page_locations.size(); - num_pages += static_cast(row_group_num_pages); std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(row_group_num_pages), [&](auto const page_idx) { @@ -189,7 +205,7 @@ std::tuple, size_type, size_type> compute_page_row_offset }); }); - return {std::move(page_row_offsets), num_pages, max_page_size}; + return {std::move(page_row_offsets), max_page_size}; } rmm::device_uvector 
make_page_indices_async( diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 299639e4172..42fdde40aaa 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -66,15 +66,13 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @param row_mask_offset Offset of the row mask - * @return Tuple of page row offsets, number of pages, and the size of the largest page in this + * @return Pair of page row offsets and the size of the largest page in this * column */ -[[nodiscard]] std::tuple, size_type, size_type> compute_page_row_offsets( +[[nodiscard]] std::pair, size_type> compute_page_row_offsets( cudf::host_span per_file_metadata, cudf::host_span const> row_group_indices, - size_type schema_idx, - cudf::size_type row_mask_offset = 0); + size_type schema_idx); /** * @brief Make a device vector where each row contains the index of the page it belongs to From 06dfca49e176c31fb9911e65fc1dc536270b05cb Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 02:47:36 +0000 Subject: [PATCH 14/25] Bug fixing in fenwick tree search --- .../parquet/experimental/page_index_filter.cu | 192 +++++++++--------- .../experimental/page_index_filter_utils.cu | 22 +- .../experimental/page_index_filter_utils.hpp | 13 +- 3 files changed, 111 insertions(+), 116 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6a8534f5c41..433b96d8b94 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -402,7 +402,7 @@ struct page_stats_caster : public stats_caster_base { * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. * Cooperative Groups is used here for robust global thread ID calculation. */ -struct compute_next_level_functor { +struct build_fenwick_tree_level_functor { bool** const level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; @@ -414,11 +414,9 @@ struct compute_next_level_functor { auto next_level_ptr = level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 and next_level_index == (next_level_size - 1)) { - // The last element is carried forward (ORed with false) + if (current_level_size % 2 and next_level_index == next_level_size - 1) { next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; } else { - // Perform the logical OR reduction and write to the next level's location next_level_ptr[next_level_index] = current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; } @@ -435,89 +433,75 @@ struct compute_next_level_functor { * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in * the range). 
*/ -struct probe_masks_functor { +struct search_fenwick_tree_functor { bool** const level_ptrs; cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; __device__ bool operator()(cudf::size_type range_idx) const noexcept { - // Retrieve M and N for the current range [M, N) + // Retrieve start and end for the current range [start, end) size_type M = page_offsets[range_idx]; size_type N = page_offsets[range_idx + 1]; - // If the range is empty or invalid, terminate - if (M >= N) { return false; } + // Return early if the range is empty or invalid + if (M >= N or range_idx >= num_ranges) { return false; } // Binary Decomposition Loop while (M < N) { - // 1. Calculate the largest power of 2 that can align M up to the boundary. - // This is determined by the Least Significant Bit (LSB) of M. - // If M=0, LSB is usually defined as the full size, but here M is typically > 0 - // or we handle M=0 implicitly by the full range check. - // The expression (M & -M) gives the value of the LSB, which is the block size (2^k). - size_t m_lsb_block_size = (M == 0) ? N : (M & -M); - size_t m_next_aligned = M + m_lsb_block_size; - - // 2. Calculate the largest power of 2 block that can align N down to the boundary. - // This is determined by the LSB of (N - M), but simpler to use N's alignment for the end. - // The expression (N & -N) gives the block size corresponding to N's alignment. - // We ensure N_lsb_block_size does not exceed the remaining range size (N-M). - size_t n_lsb_block_size = N & -N; - - // --- Decision Logic: Which side to consume? --- - - // Block 1: M-aligned block (from M up to m_next_aligned) - size_t block1_size = m_next_aligned - M; - - // Block 2: N-aligned block (from N - n_lsb_block_size up to N) - size_t block2_size = n_lsb_block_size; - - // Block 3: The remaining central range block - - if (block1_size > 0 && M < m_next_aligned && m_next_aligned <= N) { - // If the M-aligned block is fully contained in the range [M, N) - - // Check if block1_size is 2^k. k = log2(block1_size). - // Since block1_size is based on LSB, it is always a power of 2. - size_t k1 = __ffs(block1_size) - 1; - - // Calculate mask index: The starting point M is divided by the block size. - size_t mask_idx = M / block1_size; - - // Look up the mask value - if (level_ptrs[k1][mask_idx]) { - return true; // Found a set bit, terminate for this range - } - - // Advance M - M = m_next_aligned; - } else if (block2_size > 0 && N - block2_size >= M) { - // If the N-aligned block is fully contained and does not overlap M's new position - - // Check if block2_size is 2^k. k = log2(block2_size). - size_t k2 = __ffs(block2_size) - 1; - - // Calculate mask index - size_t mask_idx = (N - block2_size) / block2_size; - - // Look up the mask value - if (level_ptrs[k2][mask_idx]) { - return true; // Found a set bit, terminate for this range - } - - // Backtrack N - N = N - block2_size; - } else { - // The remaining range is unaligned and small (or just 1 element). - // This happens when M and N are close and unaligned (e.g., [11, 13]). + // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. + // Block size is determined by the Least Significant Bit (LSB) of M. + // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. + // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position + // (k). 
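+      // For example, [M, N) = [88, 96): __ffs(88) = 4, so the block size is
+      // 2^3 = 8 and the whole range is answered by one level-3 mask entry at
+      // index 88 / 8 = 11 (level k stores the OR of blocks of 2^k rows). An
+      // unaligned range such as [11, 13) instead decomposes into two level-0
+      // probes at indices 11 and 12.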
+ size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) + size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k + + // 2. N Alignment: Find the largest power-of-two block that aligns N down. + // N & -N gives the LSB block size *if* N were the start, but we use it as the largest + // possible size that evenly divides N. + size_t n_block_size = N & -N; + + // The largest block size we can consume from the current range [M, N) + size_t max_block_size = 0; + size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) + size_t mask_index = 0; + + // --- Core Decomposition Logic --- + + // Check the M side alignment block: [M, M + m_block_size) + // This is only valid if M + m_block_size <= N (the block fits). + if (M > 0 && M + m_block_size <= N) { + max_block_size = m_block_size; + mask_level = m_lsb_position - 1; + mask_index = M >> mask_level; // M / 2^k + M += max_block_size; + } - // Prioritize M (1-row check) or N (1-row check) until they meet. + // Check the N side alignment block: [N - n_block_size, N) + // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M + // block. + else if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { + // If the N block is larger or we are at the end, prioritize the N block + max_block_size = n_block_size; + mask_level = __ffs(n_block_size) - 1; + mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k + N -= max_block_size; + } - // Check single row at M (Level 0) - if (level_ptrs[0][M]) { return true; } + // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) + // If max_block_size is 0 or too large, reduce by 1 row (Level 0) + else if (max_block_size == 0 || max_block_size > N - M) { + max_block_size = 1; + mask_level = 0; + mask_index = M; // Level 0 index is just M M++; } + + // --- Query Mask and Advance --- + + // Look up the mask value at the determined level and index + if (level_ptrs[mask_level][mask_index]) { return true; } } return false; } @@ -643,14 +627,6 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas "Mismatch in total rows in input row mask and row groups", std::invalid_argument); - auto const num_columns = input_columns.size(); - - // Collect column schema indices from the input columns. - auto column_schema_indices = std::vector(input_columns.size()); - std::transform( - input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { - return col.schema_idx; - }); auto const has_page_index = compute_has_page_index(per_file_metadata, row_group_indices); // Return early if page index is not present @@ -660,7 +636,15 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas 0, stream); // An empty data page mask indicates all pages are required } + // Collect column schema indices from the input columns. 
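+  // (Page row offsets are computed per column below and concatenated. For
+  // example, two columns over rows [0, 250) with pages [0, 100), [100, 250)
+  // and [0, 120), [120, 250) give page_row_offsets = {0, 100, 250, 0, 120, 250}
+  // and total_pages = 6 - 2 = 4; the empty (250, 0) range at the column seam
+  // is rejected by the search functor (start >= end) and discarded when the
+  // results are copied out.)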
+ auto column_schema_indices = std::vector(input_columns.size()); + std::transform( + input_columns.begin(), input_columns.end(), column_schema_indices.begin(), [](auto const& col) { + return col.schema_idx; + }); + // Compute page row offsets and column chunk page offsets for each column + auto const num_columns = input_columns.size(); std::vector page_row_offsets; std::vector col_page_offsets; col_page_offsets.reserve(num_columns + 1); @@ -731,45 +715,57 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas } auto const mr = cudf::get_current_device_resource_ref(); - auto const [level_offsets, total_levels_size] = - compute_row_mask_levels(total_rows, max_page_size); - auto const num_levels = static_cast(level_offsets.size()); - auto levels_data = rmm::device_uvector(total_levels_size, stream, mr); + // Compute fenwick tree level offsets and total size (level 1 and higher) + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); + auto const num_levels = static_cast(tree_level_offsets.size()); + // Buffer to store Fenwick tree levels (level 1 and higher) data + auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); - auto host_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); - host_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; + // Pointers to each Fenwick tree level data + auto host_tree_level_ptrs = cudf::detail::make_host_vector(num_levels, stream); + // Zeroth level is just the row mask itself + host_tree_level_ptrs[0] = const_cast(row_mask.template begin()) + row_mask_offset; std::for_each( thrust::counting_iterator(1), thrust::counting_iterator(num_levels), [&](auto const level_idx) { - host_level_ptrs[level_idx] = levels_data.data() + level_offsets[level_idx - 1]; + host_tree_level_ptrs[level_idx] = tree_levels_data.data() + tree_level_offsets[level_idx - 1]; }); - auto device_level_ptrs = cudf::detail::make_device_uvector_async(host_level_ptrs, stream, mr); + auto fenwick_tree_level_ptrs = + cudf::detail::make_device_uvector_async(host_tree_level_ptrs, stream, mr); + + // Build Fenwick tree levels auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { auto const next_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(next_level_size), - compute_next_level_functor{ - device_level_ptrs.data(), level, current_level_size, next_level_size}); + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(next_level_size), + build_fenwick_tree_level_functor{ + fenwick_tree_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); + // Search the Fenwick tree to see if there's a surviving row in each page's row range auto const num_ranges = static_cast(page_row_offsets.size() - 1); rmm::device_uvector device_data_page_mask(num_ranges, stream, mr); auto page_offsets = cudf::detail::make_device_uvector_async(page_row_offsets, stream, mr); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(num_ranges), - device_data_page_mask.begin(), - probe_masks_functor{device_level_ptrs.data(), page_offsets.data(), num_ranges}); - + thrust::transform( + rmm::exec_policy_nosync(stream), 
+ thrust::counting_iterator(0), + thrust::counting_iterator(num_ranges), + device_data_page_mask.begin(), + search_fenwick_tree_functor{fenwick_tree_level_ptrs.data(), page_offsets.data(), num_ranges}); + + // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); stream.synchronize(); + // Discard results for invalid ranges. i.e. ranges starting at the last page of a column and + // ending at the first page of the next column std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index aacbc860dee..be16cd46d09 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -242,27 +242,25 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::pair, size_type> compute_row_mask_levels(cudf::size_type num_rows, - cudf::size_type max_page_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, + cudf::size_type max_page_size) { - std::vector level_offsets; - level_offsets.push_back(0); + std::vector tree_level_offsets; + tree_level_offsets.push_back(0); - size_t current_size = cudf::util::div_rounding_up_unsafe(num_rows, 2); - size_t current_level = 1; + cudf::size_type current_level_size = cudf::util::div_rounding_up_unsafe(level0_size, 2); + cudf::size_type current_level = 1; - while (current_size > 0) { + while (current_level_size > 0) { size_t block_size = size_t{1} << current_level; - - level_offsets.push_back(current_size); - current_size += num_rows; + tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - current_size = cudf::util::div_rounding_up_unsafe(current_size, 2); + current_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); current_level++; } - return {std::move(level_offsets), current_size}; + return tree_level_offsets; } } // namespace cudf::io::parquet::experimental::detail diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 42fdde40aaa..cd0ef0f06b7 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,13 +90,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Compute the levels of the row mask + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree + * level's block size is greater than the maximum page size (tree search range) * - * @param num_rows Number of rows in the row mask - * @param max_page_size Maximum page size - * @return Pair of level offsets and total levels size + * @param level0_size Size of the zeroth tree level (the row mask) + * @param max_page_size Max page (tree search range) size + * @return Fenwick tree level offsets */ -[[nodiscard]] std::pair, size_type> compute_row_mask_levels( - cudf::size_type num_rows, cudf::size_type max_page_size); +[[nodiscard]] std::vector compute_fenwick_tree_level_offsets( + cudf::size_type level0_size, cudf::size_type max_page_size); } // 
namespace cudf::io::parquet::experimental::detail From 8ac9c3a20f6ca98ab3ef1503b22b042231476442 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 05:50:27 +0000 Subject: [PATCH 15/25] Bug fixing in fenwick tree --- .../parquet/experimental/page_index_filter.cu | 139 +++++++++++------- .../experimental/page_index_filter_utils.cu | 10 +- .../experimental/page_index_filter_utils.hpp | 6 +- 3 files changed, 88 insertions(+), 67 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 433b96d8b94..00676b1f56f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -434,74 +434,101 @@ struct build_fenwick_tree_level_functor { * the range). */ struct search_fenwick_tree_functor { - bool** const level_ptrs; + bool** const tree_level_ptrs; cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; + /** + * @brief Checks if a value is a power of two + * + * @param value Value to check + * @return Boolean indicating if the value is a power of two + */ + __device__ bool constexpr is_power_of_two(size_type value) const noexcept + { + return (value & (value - 1)) == 0; + } + + /** + * @brief Finds the largest power of two in the range (start, end]; 0 otherwise + * + * @param start Range start + * @param end Range end + * @return Largest power of two in the range (start, end]; 0 otherwise + */ + __device__ size_type constexpr largest_power_of_two_in_range(size_type start, + size_type end) const noexcept + { + auto constexpr nbits = cudf::detail::size_in_bits() - 1; + auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); + return result > start ? result : 0; + } + + /** + * @brief Searches the Fenwick tree to find a `true` value in range [start, end) + * + * @param range_idx Index of the range to search + * @return Boolean indicating if a `true` value is found in the range + */ __device__ bool operator()(cudf::size_type range_idx) const noexcept { // Retrieve start and end for the current range [start, end) - size_type M = page_offsets[range_idx]; - size_type N = page_offsets[range_idx + 1]; + size_type start = page_offsets[range_idx]; + size_type end = page_offsets[range_idx + 1]; // Return early if the range is empty or invalid - if (M >= N or range_idx >= num_ranges) { return false; } - - // Binary Decomposition Loop - while (M < N) { - // 1. M Alignment: Find the largest power-of-two block that starts at M and aligns M up. - // Block size is determined by the Least Significant Bit (LSB) of M. - // If M=0, the LSB is the full range N, but we handle the LSB only for M>0. - // The __ffs intrinsic (Find First Set, 1-based) is the fastest way to get the LSB position - // (k). - size_t m_lsb_position = __ffs(M); // Position is 1-based (k+1) - size_t m_block_size = 1ULL << (m_lsb_position - 1); // Size is 2^k - - // 2. N Alignment: Find the largest power-of-two block that aligns N down. - // N & -N gives the LSB block size *if* N were the start, but we use it as the largest - // possible size that evenly divides N. - size_t n_block_size = N & -N; - - // The largest block size we can consume from the current range [M, N) - size_t max_block_size = 0; - size_t mask_level = 0; // k (k=0 is 1 row, k=1 is 2 rows, etc.) 
- size_t mask_index = 0; - - // --- Core Decomposition Logic --- - - // Check the M side alignment block: [M, M + m_block_size) - // This is only valid if M + m_block_size <= N (the block fits). - if (M > 0 && M + m_block_size <= N) { - max_block_size = m_block_size; - mask_level = m_lsb_position - 1; - mask_index = M >> mask_level; // M / 2^k - M += max_block_size; + if (start >= end or range_idx >= num_ranges) { return false; } + + // Binary search decomposition loop + while (start < end) { + // Base case: start is zero or start and/or end are power(s) of two + if (start == 0 or is_power_of_two(start)) { + auto const block_size = largest_power_of_two_in_range(start, end); + if (block_size) { + auto const mask_level = cuda::std::countr_zero(block_size); + auto const mask_index = start >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + start += block_size; + } + } else if (is_power_of_two(end)) { + auto const block_size = largest_power_of_two_in_range(start, end); + if (block_size) { + auto const mask_level = cuda::std::countr_zero(block_size); + auto const mask_index = (end - block_size) >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + end -= block_size; + } } - // Check the N side alignment block: [N - n_block_size, N) - // This is only valid if N - n_block_size >= M and the N block is larger or equal to the M - // block. - else if (n_block_size > 0 && N - n_block_size >= M && n_block_size >= max_block_size) { - // If the N block is larger or we are at the end, prioritize the N block - max_block_size = n_block_size; - mask_level = __ffs(n_block_size) - 1; - mask_index = (N - n_block_size) >> mask_level; // (N - 2^k) / 2^k - N -= max_block_size; - } + // Return early if start >= end already + if (start >= end) { return false; } - // Fallback for small, unaligned ranges (e.g., [11, 13) where M and N are close) - // If max_block_size is 0 or too large, reduce by 1 row (Level 0) - else if (max_block_size == 0 || max_block_size > N - M) { - max_block_size = 1; - mask_level = 0; - mask_index = M; // Level 0 index is just M - M++; - } + // Find the largest power-of-two block that begins and `start` and aligns it up + size_type const start_mask_level = cuda::std::countr_zero(start); + size_type const start_block_size = size_t{1} << (start_mask_level); - // --- Query Mask and Advance --- + // Find the largest power-of-two block that aligns `end` down. + size_type const end_block_size = end & -end; - // Look up the mask value at the determined level and index - if (level_ptrs[mask_level][mask_index]) { return true; } + // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger block + if (start + start_block_size <= end and start_block_size >= end_block_size) { + auto const mask_level = start_mask_level; + auto const mask_index = start >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + start += start_block_size; + } + // Otherwise, check the `end` side alignment block: [end - end_block_size, end) + else if (end - end_block_size >= start) { + auto const mask_level = cuda::std::countr_zero(end_block_size); + auto const mask_index = (end - end_block_size) >> mask_level; + if (tree_level_ptrs[mask_level][mask_index]) { return true; } + end -= end_block_size; + } + // Fallback for small, unaligned ranges. 
e.g., [11, 13) + else { + if (tree_level_ptrs[0][start]) { return true; } + start++; + } } return false; } @@ -717,7 +744,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto const mr = cudf::get_current_device_resource_ref(); // Compute fenwick tree level offsets and total size (level 1 and higher) - auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows); auto const num_levels = static_cast(tree_level_offsets.size()); // Buffer to store Fenwick tree levels (level 1 and higher) data auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index be16cd46d09..75be66d7865 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -242,8 +242,7 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, - cudf::size_type max_page_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size) { std::vector tree_level_offsets; tree_level_offsets.push_back(0); @@ -252,12 +251,9 @@ std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0 cudf::size_type current_level = 1; while (current_level_size > 0) { - size_t block_size = size_t{1} << current_level; tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); - - if (std::cmp_greater_equal(block_size, max_page_size)) { break; } - - current_level_size = cudf::util::div_rounding_up_unsafe(current_level_size, 2); + current_level_size = + current_level_size == 1 ? 
0 : cudf::util::div_rounding_up_unsafe(current_level_size, 2); current_level++; } return tree_level_offsets; diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index cd0ef0f06b7..f3f51e3768f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,14 +90,12 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree - * level's block size is greater than the maximum page size (tree search range) + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) * * @param level0_size Size of the zeroth tree level (the row mask) - * @param max_page_size Max page (tree search range) size * @return Fenwick tree level offsets */ [[nodiscard]] std::vector compute_fenwick_tree_level_offsets( - cudf::size_type level0_size, cudf::size_type max_page_size); + cudf::size_type level0_size); } // namespace cudf::io::parquet::experimental::detail From e5e12ecb4cb87c31a84967ad367a2f7f30bbc6a0 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 06:12:18 +0000 Subject: [PATCH 16/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 52 +++++-------------- 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 00676b1f56f..98e1781da47 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -438,17 +438,6 @@ struct search_fenwick_tree_functor { cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; - /** - * @brief Checks if a value is a power of two - * - * @param value Value to check - * @return Boolean indicating if the value is a power of two - */ - __device__ bool constexpr is_power_of_two(size_type value) const noexcept - { - return (value & (value - 1)) == 0; - } - /** * @brief Finds the largest power of two in the range (start, end]; 0 otherwise * @@ -456,8 +445,8 @@ struct search_fenwick_tree_functor { * @param end Range end * @return Largest power of two in the range (start, end]; 0 otherwise */ - __device__ size_type constexpr largest_power_of_two_in_range(size_type start, - size_type end) const noexcept + __device__ size_type inline constexpr largest_power_of_two_in_range(size_type start, + size_type end) const noexcept { auto constexpr nbits = cudf::detail::size_in_bits() - 1; auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); @@ -481,36 +470,23 @@ struct search_fenwick_tree_functor { // Binary search decomposition loop while (start < end) { - // Base case: start is zero or start and/or end are power(s) of two - if (start == 0 or is_power_of_two(start)) { - auto const block_size = largest_power_of_two_in_range(start, end); - if (block_size) { - auto const mask_level = cuda::std::countr_zero(block_size); - auto const mask_index = start >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } - start += block_size; - } - } else if (is_power_of_two(end)) { - auto const block_size = largest_power_of_two_in_range(start, end); - if (block_size) { + // Find the largest power-of-two block that begins and `start` and aligns it up + auto const 
[start_mask_level, start_block_size] = [&]() { + if (start == 0) { + auto const block_size = largest_power_of_two_in_range(start, end); auto const mask_level = cuda::std::countr_zero(block_size); - auto const mask_index = (end - block_size) >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } - end -= block_size; + return cuda::std::pair{mask_level, block_size}; + } else { + auto const mask_level = cuda::std::countr_zero(start); + return cuda::std::pair{mask_level, size_type{1} << mask_level}; } - } - - // Return early if start >= end already - if (start >= end) { return false; } - - // Find the largest power-of-two block that begins and `start` and aligns it up - size_type const start_mask_level = cuda::std::countr_zero(start); - size_type const start_block_size = size_t{1} << (start_mask_level); + }(); // Find the largest power-of-two block that aligns `end` down. - size_type const end_block_size = end & -end; + auto const end_block_size = end & -end; - // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger block + // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger + // block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const mask_level = start_mask_level; auto const mask_index = start >> mask_level; From 3d056ed4bed561cba34cff1723438efa93f7ddfa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 06:20:34 +0000 Subject: [PATCH 17/25] Documentation and style --- .../parquet/experimental/page_index_filter.cu | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 98e1781da47..ed77605f9ab 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -397,41 +397,47 @@ struct page_stats_caster : public stats_caster_base { }; /** - * @brief Custom CUDA kernel using Cooperative Groups to perform the paired logical OR reduction. - * * NOTE: This operation is a map/stride-2-read, not a true block-to-global reduction. - * CUB's BlockReduce is unsuitable here as it reduces a block to a single element. - * Cooperative Groups is used here for robust global thread ID calculation. + * @brief Functor to build the NEXT Fenwick tree level from the current level data + * + * @param tree_level_ptrs Pointers to the start of Fenwick tree level data + * @param current_level Current tree level + * @param current_level_size Size of the current tree level + * @param next_level_size Size of the next tree level */ -struct build_fenwick_tree_level_functor { - bool** const level_ptrs; +struct build_next_fenwick_tree_level_functor { + bool** const tree_level_ptrs; cudf::size_type const current_level; cudf::size_type const current_level_size; cudf::size_type const next_level_size; - __device__ void operator()(cudf::size_type next_level_index) const noexcept + /** + * @brief Builds the next Fenwick tree level from the current level data + * by ORing the two children of the current level. 
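+   * For example, a current level of {1, 0, 0, 1, 0} (size 5) yields the next
+   * level {1, 1, 0} (size 3): (1|0) = 1, (0|1) = 1, and the odd trailing 0 is
+   * carried forward unchanged.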
+ * + * @param next_level_idx Next tree level element index + */ + __device__ void operator()(cudf::size_type next_level_idx) const noexcept { - auto const current_level_ptr = level_ptrs[current_level]; - auto next_level_ptr = level_ptrs[current_level + 1]; + auto const current_level_ptr = tree_level_ptrs[current_level]; + auto next_level_ptr = tree_level_ptrs[current_level + 1]; // Handle the odd-sized remaining element if current_level_size is odd - if (current_level_size % 2 and next_level_index == next_level_size - 1) { - next_level_ptr[next_level_index] = current_level_ptr[current_level_size - 1]; + if (current_level_size % 2 and next_level_idx == next_level_size - 1) { + next_level_ptr[next_level_idx] = current_level_ptr[current_level_size - 1]; } else { - next_level_ptr[next_level_index] = - current_level_ptr[(next_level_index * 2)] or current_level_ptr[(next_level_index * 2) + 1]; + next_level_ptr[next_level_idx] = + current_level_ptr[(next_level_idx * 2)] or current_level_ptr[(next_level_idx * 2) + 1]; } } }; /** - * @brief CUDA kernel to probe multiple ranges against the pre-calculated mask hierarchy. - * One thread handles the binary decomposition and query for one range [M, N). - * * @param d_level_ptrs Device array of pointers, where d_level_ptrs[k] points to the start of - * Level k mask. - * @param d_range_offsets Device array where range i is [d_range_offsets[i], d_range_offsets[i+1]). - * @param num_ranges The number of ranges to process. - * @param d_results Pointer to device memory to store the boolean result (true if a '1' is found in - * the range). + * @brief Functor to binary search a `true` value in the Fenwick tree in range [start, end) + * + * @param tree_level_ptrs Pointers to the start of Fenwick tree level data + * @param page_offsets Pointer to page offsets describing each search range i as [page_offsets[i], + * page_offsets[i+1)) + * @param num_ranges Number of search ranges */ struct search_fenwick_tree_functor { bool** const tree_level_ptrs; @@ -737,7 +743,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto fenwick_tree_level_ptrs = cudf::detail::make_device_uvector_async(host_tree_level_ptrs, stream, mr); - // Build Fenwick tree levels + // Build Fenwick tree levels (zeroth level is just the row mask itself) auto current_level_size = total_rows; std::for_each( thrust::counting_iterator(0), thrust::counting_iterator(num_levels - 1), [&](auto const level) { @@ -746,7 +752,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), thrust::counting_iterator(next_level_size), - build_fenwick_tree_level_functor{ + build_next_fenwick_tree_level_functor{ fenwick_tree_level_ptrs.data(), level, current_level_size, next_level_size}); current_level_size = next_level_size; }); From 55948f09253a60651e6f441f0fd3abb88d0d2b55 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 07:14:22 +0000 Subject: [PATCH 18/25] Small optimization --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- cpp/src/io/parquet/experimental/page_index_filter_utils.cu | 6 ++++-- cpp/src/io/parquet/experimental/page_index_filter_utils.hpp | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index ed77605f9ab..6c5a9425754 100644 --- 
a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -709,7 +709,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Make sure all row_mask elements contain valid values even if they are nulls if constexpr (cuda::std::is_same_v) { - if (row_mask.nullable()) { + if (row_mask.nullable() and row_mask.null_count() > 0) { thrust::for_each(rmm::exec_policy_nosync(stream), thrust::counting_iterator(row_mask_offset), thrust::counting_iterator(row_mask_offset + total_rows), @@ -726,7 +726,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas auto const mr = cudf::get_current_device_resource_ref(); // Compute fenwick tree level offsets and total size (level 1 and higher) - auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows); + auto const tree_level_offsets = compute_fenwick_tree_level_offsets(total_rows, max_page_size); auto const num_levels = static_cast(tree_level_offsets.size()); // Buffer to store Fenwick tree levels (level 1 and higher) data auto tree_levels_data = rmm::device_uvector(tree_level_offsets.back(), stream, mr); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 75be66d7865..8750449fc24 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -1,4 +1,3 @@ - /* * Copyright (c) 2025, NVIDIA CORPORATION. * @@ -242,7 +241,8 @@ rmm::device_uvector make_page_indices_async( return page_indices; } -std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size) +std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0_size, + cudf::size_type max_page_size) { std::vector tree_level_offsets; tree_level_offsets.push_back(0); @@ -251,6 +251,8 @@ std::vector compute_fenwick_tree_level_offsets(cudf::size_type level0 cudf::size_type current_level = 1; while (current_level_size > 0) { + auto const block_size = 1 << current_level; + if (std::cmp_greater(block_size, max_page_size)) { break; } tree_level_offsets.push_back(tree_level_offsets.back() + current_level_size); current_level_size = current_level_size == 1 ? 
0 : cudf::util::div_rounding_up_unsafe(current_level_size, 2); diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index f3f51e3768f..adb2b3b1a0f 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -90,12 +90,14 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) + * @brief Computes the offsets of the Fenwick tree levels (level 1 and higher) until the tree level + * block size becomes larger than the maximum page (search range) size * * @param level0_size Size of the zeroth tree level (the row mask) + * @param max_page_size Maximum page (search range) size * @return Fenwick tree level offsets */ [[nodiscard]] std::vector compute_fenwick_tree_level_offsets( - cudf::size_type level0_size); + cudf::size_type level0_size, cudf::size_type max_page_size); } // namespace cudf::io::parquet::experimental::detail From a2221a55ffd75b983e21404f804aada0fc8af845 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 09:46:35 +0000 Subject: [PATCH 19/25] Minor optimizations --- .../parquet/experimental/page_index_filter.cu | 97 +++++++++++++++---- 1 file changed, 80 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 6c5a9425754..d762c9bed3a 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -444,6 +444,45 @@ struct search_fenwick_tree_functor { cudf::size_type const* const page_offsets; cudf::size_type const num_ranges; + /** + * @brief Enum class to represent which range boundary to align + */ + enum class alignment : uint8_t { + START = 0, + END = 1, + }; + + /** + * @brief Checks if a value is a power of two + * + * @param value Value to check + * @return Boolean indicating if the value is a power of two + */ + __device__ bool inline constexpr is_power_of_two(cudf::size_type value) const noexcept + { + return (value & (value - 1)) == 0; + } + + /** + * @brief Finds the smallest power of two in the range [start, end); 0 otherwise + * + * @param start Range start + * @param end Range end + * @return Smallest power of two in the range [start, end); 0 otherwise + */ + __device__ cudf::size_type inline constexpr smallest_power_of_two_in_range( + cudf::size_type start, cudf::size_type end) const noexcept + { + start--; + start |= start >> 1; + start |= start >> 2; + start |= start >> 4; + start |= start >> 8; + start |= start >> 16; + auto const result = start + 1; + return result < end ? result : 0; + } + /** * @brief Finds the largest power of two in the range (start, end]; 0 otherwise * * @param start Range start * @param end Range end * @return Largest power of two in the range (start, end]; 0 otherwise */ - __device__ size_type inline constexpr largest_power_of_two_in_range(size_type start, - size_type end) const noexcept + __device__ cudf::size_type inline constexpr largest_power_of_two_in_range( + cudf::size_type start, cudf::size_type end) const noexcept { auto constexpr nbits = cudf::detail::size_in_bits() - 1; auto const result = size_type{1} << (nbits - cuda::std::countl_zero(end)); return result > start ? 
result : 0; } + /** + * @brief Aligns a range boundary to the next power-of-two block + * + * @tparam Alignment The bounday to align + * @param start Range start + * @param end Range end + * @return A pair of the tree level and block size + */ + template + __device__ + cuda::std::pair inline constexpr align_range_boundary( + cudf::size_type start, cudf::size_type end) const noexcept + { + if constexpr (Alignment == alignment::START) { + if (start == 0 or is_power_of_two(start)) { + auto const block_size = largest_power_of_two_in_range(start, end); + auto const tree_level = cuda::std::countr_zero(block_size); + return cuda::std::pair{tree_level, block_size}; + } else { + auto const tree_level = cuda::std::countr_zero(start); + return cuda::std::pair{tree_level, size_type{1} << tree_level}; + } + } else { + auto block_size = end & -end; + if (start > 0 and is_power_of_two(end)) { + block_size = end - smallest_power_of_two_in_range(start, end); + } + return cuda::std::pair{cuda::std::countr_zero(block_size), block_size}; + } + } + /** * @brief Searches the Fenwick tree to find a `true` value in range [start, end) * @@ -477,33 +547,26 @@ struct search_fenwick_tree_functor { // Binary search decomposition loop while (start < end) { // Find the largest power-of-two block that begins at `start` and aligns it up - auto const [start_mask_level, start_block_size] = [&]() { - if (start == 0) { - auto const block_size = largest_power_of_two_in_range(start, end); - auto const mask_level = cuda::std::countr_zero(block_size); - return cuda::std::pair{mask_level, block_size}; - } else { - auto const mask_level = cuda::std::countr_zero(start); - return cuda::std::pair{mask_level, size_type{1} << mask_level}; - } - }(); + auto const [start_tree_level, start_block_size] = + align_range_boundary(start, end); // Find the largest power-of-two block that aligns `end` down. - auto const end_block_size = end & -end; + auto const [end_tree_level, end_block_size] = + align_range_boundary(start, end); // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger // block if (start + start_block_size <= end and start_block_size >= end_block_size) { - auto const mask_level = start_mask_level; - auto const mask_index = start >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } + auto const tree_level = start_tree_level; + auto const mask_index = start >> tree_level; + if (tree_level_ptrs[tree_level][mask_index]) { return true; } start += start_block_size; } // Otherwise, check the `end` side alignment block: [end - end_block_size, end) else if (end - end_block_size >= start) { - auto const mask_level = cuda::std::countr_zero(end_block_size); - auto const mask_index = (end - end_block_size) >> mask_level; - if (tree_level_ptrs[mask_level][mask_index]) { return true; } + auto const tree_level = end_tree_level; + auto const mask_index = (end - end_block_size) >> tree_level; + if (tree_level_ptrs[tree_level][mask_index]) { return true; } end -= end_block_size; } // Fallback for small, unaligned ranges.
e.g., [11, 13) From fc6c5347912da9530f661a37054e90648489ecf8 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:13:45 +0000 Subject: [PATCH 20/25] Style check --- cpp/examples/hybrid_scan_io/common_utils.cpp | 21 ++++++++++++++----- .../parquet/experimental/page_index_filter.cu | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cpp/examples/hybrid_scan_io/common_utils.cpp b/cpp/examples/hybrid_scan_io/common_utils.cpp index 0eac78d18da..96611dd961e 100644 --- a/cpp/examples/hybrid_scan_io/common_utils.cpp +++ b/cpp/examples/hybrid_scan_io/common_utils.cpp @@ -81,13 +81,24 @@ void check_tables_equal(cudf::table_view const& lhs_table, cudf::filtered_join join_obj( lhs_table, cudf::null_equality::EQUAL, cudf::set_as_build_table::RIGHT, stream); auto const indices = join_obj.anti_join(rhs_table, stream); - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Tables identical: " << std::boolalpha << valid << "\n\n"; + auto const tables_equal = indices->size() == 0; + if (tables_equal) { + std::cout << "Tables identical: " << std::boolalpha << tables_equal << "\n\n"; + } else { + // Helper to write parquet data for inspection + auto const write_parquet = + [](cudf::table_view table, std::string filepath, rmm::cuda_stream_view stream) { + auto sink_info = cudf::io::sink_info(filepath); + auto opts = cudf::io::parquet_writer_options::builder(sink_info, table).build(); + cudf::io::write_parquet(opts, stream); + }; + write_parquet(lhs_table, "lhs_table.parquet", stream); + write_parquet(rhs_table, "rhs_table.parquet", stream); + throw std::logic_error("Tables identical: false\n\n"); + } } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - throw std::runtime_error("Tables identical: false\n\n"); + std::cout << e.what() << std::endl; } } diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index d762c9bed3a..81a2d5a3c69 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -501,7 +501,7 @@ struct search_fenwick_tree_functor { /** * @brief Aligns a range boundary to the next power-of-two block * - * @tparam Alignment The bounday to align + * @tparam Alignment The boundary (start or end) to align * @param start Range start * @param end Range end * @return A pair of the tree level and block size From 488b1de6c61335237d5cff69a6be8a99ce6605d2 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:35:10 +0000 Subject: [PATCH 21/25] Use host worker pool instead of c++ threads --- .../parquet/experimental/page_index_filter.cu | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 81a2d5a3c69..07deb58d834 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -47,7 +48,6 @@ #include #include -#include namespace cudf::io::parquet::experimental::detail { @@ -405,10 +405,10 @@ struct page_stats_caster : public stats_caster_base { * @param next_level_size Size of the next tree level */ struct build_next_fenwick_tree_level_functor { - bool** const 
tree_level_ptrs; - cudf::size_type const current_level; - cudf::size_type const current_level_size; - cudf::size_type const next_level_size; + bool** tree_level_ptrs; + cudf::size_type current_level; + cudf::size_type current_level_size; + cudf::size_type next_level_size; /** * @brief Builds the next Fenwick tree level from the current level data @@ -440,9 +440,9 @@ struct build_next_fenwick_tree_level_functor { * @param num_ranges Number of search ranges */ struct search_fenwick_tree_functor { - bool** const tree_level_ptrs; - cudf::size_type const* const page_offsets; - cudf::size_type const num_ranges; + bool** tree_level_ptrs; + cudf::size_type const* page_offsets; + cudf::size_type num_ranges; /** * @brief Enum class to represent which range boundary to align @@ -735,41 +735,45 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas col_page_offsets.emplace_back(page_row_offsets.size()); }); } else { - auto constexpr max_data_page_mask_threads = 2; - auto const num_threads = std::min(max_data_page_mask_threads, num_columns); - std::vector threads{}; - threads.reserve(num_threads); - std::vector, size_type>>> thread_results( - num_threads); + auto constexpr num_threads = 2; + std::vector, size_type>>>> + page_row_offset_tasks{}; + page_row_offset_tasks.reserve(num_threads); auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); - std::for_each( - thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { - threads.emplace_back([&, tid = tid]() { + + // Submit page row offset compute tasks + std::transform( + thrust::counting_iterator(0), + thrust::counting_iterator(num_threads), + std::back_inserter(page_row_offset_tasks), + [&](auto const tid) { + return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { auto const start_col = std::min(tid * cols_per_thread, num_columns); auto const end_col = std::min(start_col + cols_per_thread, num_columns); - std::for_each(thrust::counting_iterator(start_col), - thrust::counting_iterator(end_col), - [&](auto const col_idx) { - thread_results[tid].emplace_back(compute_page_row_offsets( - per_file_metadata, row_group_indices, column_schema_indices[col_idx])); - }); + std::vector, size_type>> thread_page_row_offsets{}; + thread_page_row_offsets.reserve(end_col - start_col); + std::transform(thrust::counting_iterator(start_col), + thrust::counting_iterator(end_col), + std::back_inserter(thread_page_row_offsets), + [&](auto const col_idx) { + return compute_page_row_offsets( + per_file_metadata, row_group_indices, column_schema_indices[col_idx]); + }); + return thread_page_row_offsets; }); }); - std::for_each( - thrust::counting_iterator(0), thrust::counting_iterator(num_threads), [&](auto const tid) { - threads[tid].join(); - for (auto& [col_page_row_offsets, col_max_page_size] : thread_results[tid]) { - page_row_offsets.insert( - page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); - max_page_size = std::max(max_page_size, col_max_page_size); - col_page_offsets.emplace_back(page_row_offsets.size()); - } - }); + std::for_each(page_row_offset_tasks.begin(), page_row_offset_tasks.end(), [&](auto& task) { + auto const& thread_page_row_offsets = task.get(); + for (auto& [col_page_row_offsets, col_max_page_size] : thread_page_row_offsets) { + page_row_offsets.insert( + page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); + max_page_size = std::max(max_page_size, col_max_page_size); + 
col_page_offsets.emplace_back(page_row_offsets.size()); + } + }); } - auto const total_pages = page_row_offsets.size() - num_columns; - // Make sure all row_mask elements contain valid values even if they are nulls if constexpr (cuda::std::is_same_v) { if (row_mask.nullable() and row_mask.null_count() > 0) { @@ -833,6 +837,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); + auto const total_pages = page_row_offsets.size() - num_columns; auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); auto host_results_iter = host_results.begin(); stream.synchronize(); From 33489df7e0932a12b41b31cfbbc7e351f1140f0f Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:41:10 -0700 Subject: [PATCH 22/25] Apply suggestion from @mhaseeb123 --- cpp/src/io/parquet/experimental/page_index_filter.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 07deb58d834..3676dedc0d9 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -554,7 +554,7 @@ struct search_fenwick_tree_functor { auto const [end_tree_level, end_block_size] = align_range_boundary(start, end); - // Check the `start` side alignment block: [M, M + m_block_size) and if it's the larger + // Check the `start` side alignment block: [start, start + start_block_size) and if it's the larger // block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const tree_level = start_tree_level; From ffacb9355ddb41f6c8cecb51b11116b544c27d5e Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:41:32 +0000 Subject: [PATCH 23/25] style pls --- cpp/src/io/parquet/experimental/page_index_filter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 3676dedc0d9..9eccbcbc1fe 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -554,8 +554,8 @@ struct search_fenwick_tree_functor { auto const [end_tree_level, end_block_size] = align_range_boundary(start, end); - // Check the `start` side alignment block: [start, start + start_block_size) and if it's the larger - // block + // Check the `start` side alignment block: [start, start + start_block_size) and if it's the + // larger block if (start + start_block_size <= end and start_block_size >= end_block_size) { auto const tree_level = start_tree_level; auto const mask_index = start >> tree_level; From f9e6848f953616f64cd84dae82ca902ab2baa4f1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 29 Oct 2025 01:26:58 +0000 Subject: [PATCH 24/25] Update headers to SPDX --- .../experimental/page_index_filter_utils.cu | 15 ++------------- .../experimental/page_index_filter_utils.hpp | 15 ++------------- 2 files changed, 4 insertions(+), 26 deletions(-) diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index 8750449fc24..ff4c4d2ae58 100644 --- 
a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 */ #include "page_index_filter_utils.hpp" diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index adb2b3b1a0f..873cdfd8ec7 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -1,17 +1,6 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 */ #pragma once From bc14621c1d5314d15e270e5f51bbfd33edb48285 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Nov 2025 03:06:00 +0000 Subject: [PATCH 25/25] Address reviews --- .../experimental/hybrid_scan_helpers.hpp | 2 +- .../parquet/experimental/hybrid_scan_impl.cpp | 8 +-- .../parquet/experimental/page_index_filter.cu | 68 ++++++++++--------- .../experimental/page_index_filter_utils.cu | 2 +- .../experimental/page_index_filter_utils.hpp | 9 +-- cpp/src/io/parquet/reader_impl.cpp | 2 - cpp/src/io/parquet/reader_impl.hpp | 6 +- 7 files changed, 50 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp index 4eefa293449..d25bd70964b 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_helpers.hpp @@ -283,7 +283,7 @@ class aggregate_reader_metadata : public aggregate_reader_metadata_base { * the output table based on the input row mask across all input columns */ template - [[nodiscard]] cudf::detail::host_vector compute_data_page_mask( + [[nodiscard]] thrust::host_vector compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp index 8e06535b9f0..5a919ae2a0f 100644 --- a/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp +++ b/cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp @@ -444,7 +444,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : cudf::detail::make_empty_host_vector(0, stream); + : thrust::host_vector(0); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -474,7 +474,7 @@ table_with_metadata hybrid_scan_reader_impl::materialize_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, stream) - : cudf::detail::make_empty_host_vector(0, stream); + : thrust::host_vector(0); prepare_data( read_mode::READ_ALL, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -513,7 +513,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_filter_columns( (mask_data_pages == use_data_page_mask::YES) ? _extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : cudf::detail::make_empty_host_vector(0, _stream); + : thrust::host_vector(0); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); @@ -564,7 +564,7 @@ void hybrid_scan_reader_impl::setup_chunking_for_payload_columns( (mask_data_pages == use_data_page_mask::YES) ? 
_extended_metadata->compute_data_page_mask( row_mask, row_group_indices, _input_columns, _rows_processed_so_far, _stream) - : cudf::detail::make_empty_host_vector(0, _stream); + : thrust::host_vector(0); prepare_data( read_mode::CHUNKED_READ, row_group_indices, std::move(column_chunk_buffers), data_page_mask); diff --git a/cpp/src/io/parquet/experimental/page_index_filter.cu b/cpp/src/io/parquet/experimental/page_index_filter.cu index 51b15cf69a4..3d142475eeb 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter.cu @@ -381,7 +381,7 @@ struct page_stats_caster : public stats_caster_base { // Construct a row indices mapping based on page row counts and offsets auto const page_indices = - make_page_indices_async(page_row_counts, page_row_offsets, total_rows, stream); + compute_page_indices_async(page_row_counts, page_row_offsets, total_rows, stream); // For non-strings columns, directly gather the page-level column data and bitmask to the // row-level. @@ -786,7 +786,7 @@ std::unique_ptr aggregate_reader_metadata::build_row_mask_with_pag } template -cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask( +thrust::host_vector aggregate_reader_metadata::compute_data_page_mask( ColumnView const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, @@ -806,7 +806,7 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas row_mask.template begin() + row_mask_offset, row_mask.template begin() + row_mask_offset + total_rows, cuda::std::identity{})) { - return cudf::detail::make_empty_host_vector(0, stream); + return thrust::host_vector(0); } CUDF_EXPECTS(row_mask_offset + total_rows <= row_mask.size(), @@ -818,8 +818,8 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Return early if page index is not present if (not has_page_index) { CUDF_LOG_WARN("Encountered missing Parquet page index for one or more output columns"); - return cudf::detail::make_empty_host_vector( - 0, stream); // An empty data page mask indicates all pages are required + return thrust::host_vector( + 0); // An empty data page mask indicates all pages are required } // Collect column schema indices from the input columns.
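For context on the hunk below: it fans the per-column page-row-offset computation out over cudf::detail::host_worker_pool() and joins the resulting futures in submission order, which keeps the concatenated offsets column-ordered. A minimal host-only sketch of the same submit/get pattern, assuming only that the pool behind host_worker_pool() exposes a submit_task returning a std::future (the names fan_out and compute_one are illustrative, not part of the patch):

#include <algorithm>
#include <future>
#include <vector>

// Fan `num_columns` work items out across a fixed number of pool tasks,
// then join the futures in submission order so results stay column-ordered.
template <typename Pool, typename F>
std::vector<int> fan_out(Pool& pool, int num_columns, F compute_one)
{
  int constexpr max_tasks = 2;
  int const cols_per_task = (num_columns + max_tasks - 1) / max_tasks;
  std::vector<std::future<std::vector<int>>> tasks;
  tasks.reserve(max_tasks);
  for (int tid = 0; tid < max_tasks; ++tid) {
    tasks.push_back(pool.submit_task([&, tid] {
      auto const start_col = std::min(tid * cols_per_task, num_columns);
      auto const end_col   = std::min(start_col + cols_per_task, num_columns);
      std::vector<int> part;
      part.reserve(end_col - start_col);
      for (int col = start_col; col < end_col; ++col) { part.push_back(compute_one(col)); }
      return part;
    }));
  }
  std::vector<int> results;
  results.reserve(num_columns);
  for (auto& task : tasks) {
    auto part = task.get();  // blocks until this task finishes
    results.insert(results.end(), part.begin(), part.end());
  }
  return results;
}

Unlike the previous std::thread version that joined into a shared results array, the future-based form also propagates any exception from a worker to the caller at get().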
@@ -849,37 +849,37 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas col_page_offsets.emplace_back(page_row_offsets.size()); }); } else { - auto constexpr num_threads = 2; - std::vector, size_type>>>> - page_row_offset_tasks{}; - page_row_offset_tasks.reserve(num_threads); - auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, num_threads); + auto constexpr max_tasks = 2; + using task_page_row_offsets_type = std::vector, size_type>>; + std::vector> page_row_offset_tasks{}; + page_row_offset_tasks.reserve(max_tasks); + auto const cols_per_thread = cudf::util::div_rounding_up_unsafe(num_columns, max_tasks); // Submit page row offset compute tasks - std::transform( - thrust::counting_iterator(0), - thrust::counting_iterator(num_threads), - std::back_inserter(page_row_offset_tasks), - [&](auto const tid) { - return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { - auto const start_col = std::min(tid * cols_per_thread, num_columns); - auto const end_col = std::min(start_col + cols_per_thread, num_columns); - std::vector, size_type>> thread_page_row_offsets{}; - thread_page_row_offsets.reserve(end_col - start_col); - std::transform(thrust::counting_iterator(start_col), + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(max_tasks), + std::back_inserter(page_row_offset_tasks), + [&](auto const tid) { + return cudf::detail::host_worker_pool().submit_task([&, tid = tid]() { + auto const start_col = std::min(tid * cols_per_thread, num_columns); + auto const end_col = std::min(start_col + cols_per_thread, num_columns); + task_page_row_offsets_type task_page_row_offsets{}; + task_page_row_offsets.reserve(end_col - start_col); + std::transform( + thrust::counting_iterator(start_col), thrust::counting_iterator(end_col), - std::back_inserter(thread_page_row_offsets), + std::back_inserter(task_page_row_offsets), [&](auto const col_idx) { return compute_page_row_offsets( per_file_metadata, row_group_indices, column_schema_indices[col_idx]); }); - return thread_page_row_offsets; - }); - }); + return task_page_row_offsets; + }); + }); std::for_each(page_row_offset_tasks.begin(), page_row_offset_tasks.end(), [&](auto& task) { - auto const& thread_page_row_offsets = task.get(); - for (auto& [col_page_row_offsets, col_max_page_size] : thread_page_row_offsets) { + auto const& task_page_row_offsets = task.get(); + for (auto& [col_page_row_offsets, col_max_page_size] : task_page_row_offsets) { page_row_offsets.insert( page_row_offsets.end(), col_page_row_offsets.begin(), col_page_row_offsets.end()); max_page_size = std::max(max_page_size, col_max_page_size); @@ -952,32 +952,36 @@ cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mas // Copy over search results to host auto host_results = cudf::detail::make_host_vector_async(device_data_page_mask, stream); auto const total_pages = page_row_offsets.size() - num_columns; - auto data_page_mask = cudf::detail::make_empty_host_vector(total_pages, stream); + auto data_page_mask = thrust::host_vector(0); + data_page_mask.reserve(total_pages); auto host_results_iter = host_results.begin(); stream.synchronize(); + // Discard results for invalid ranges, i.e.
ranges starting at the last page of a column and // ending at the first page of the next column + auto num_pages_inserted = 0; std::for_each(thrust::counting_iterator(0), thrust::counting_iterator(num_columns), [&](auto col_idx) { auto const col_num_pages = col_page_offsets[col_idx + 1] - col_page_offsets[col_idx] - 1; - data_page_mask.insert( - data_page_mask.end(), host_results_iter, host_results_iter + col_num_pages); + data_page_mask.insert(data_page_mask.begin() + num_pages_inserted, + host_results_iter, + host_results_iter + col_num_pages); host_results_iter += col_num_pages + 1; + num_pages_inserted += col_num_pages; }); return data_page_mask; } // Instantiate the templates with ColumnView as cudf::column_view and cudf::mutable_column_view -template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< +template thrust::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::column_view>(cudf::column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, cudf::size_type row_mask_offset, rmm::cuda_stream_view stream) const; -template cudf::detail::host_vector aggregate_reader_metadata::compute_data_page_mask< +template thrust::host_vector aggregate_reader_metadata::compute_data_page_mask< cudf::mutable_column_view>(cudf::mutable_column_view const& row_mask, cudf::host_span const> row_group_indices, cudf::host_span input_columns, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu index ff4c4d2ae58..5a4ccc8dea8 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.cu +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.cu @@ -196,7 +196,7 @@ std::pair, size_type> compute_page_row_offsets( return {std::move(page_row_offsets), max_page_size}; } -rmm::device_uvector make_page_indices_async( +rmm::device_uvector compute_page_indices_async( cudf::host_span page_row_counts, cudf::host_span page_row_offsets, cudf::size_type total_rows, diff --git a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp index 873cdfd8ec7..f10ad3f2654 100644 --- a/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp +++ b/cpp/src/io/parquet/experimental/page_index_filter_utils.hpp @@ -50,12 +50,13 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil rmm::cuda_stream_view stream); /** - * @brief Compute page row offsets for a given column schema index + * @brief Computes page row offsets and the size (number of rows) of the largest page for a given + * column schema index * * @param per_file_metadata Span of parquet footer metadata * @param row_group_indices Span of input row group indices * @param schema_idx Column's schema index - * @return Pair of page row offsets and the size of the largest page in this + * @return A pair of page row offsets and the size of the largest page in this * column */ [[nodiscard]] std::pair, size_type> compute_page_row_offsets( @@ -64,7 +65,7 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil size_type schema_idx); /** - * @brief Make a device vector where each row contains the index of the page it belongs to + * @brief Computes a device vector where each row contains the index of the page it belongs to * * @param page_row_counts Span of page row counts * @param page_row_offsets Span of page row offsets @@ -72,7 +73,7 @@ compute_page_row_counts_and_offsets(cudf::host_span per_fil * @param stream CUDA stream * 
@return Device vector where each row contains the index of the page it belongs to */ -[[nodiscard]] rmm::device_uvector make_page_indices_async( +[[nodiscard]] rmm::device_uvector compute_page_indices_async( cudf::host_span page_row_counts, cudf::host_span page_row_offsets, cudf::size_type total_rows, diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index e193b84fb78..470420a6775 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -473,7 +473,6 @@ void reader_impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_ reader_impl::reader_impl() : _options{}, - _pass_page_mask{cudf::detail::make_host_vector(0, cudf::get_default_stream())}, _subpass_page_mask{cudf::detail::hostdevice_vector(0, cudf::get_default_stream())} { } @@ -507,7 +506,6 @@ reader_impl::reader_impl(std::size_t chunk_read_limit, options.get_row_groups(), options.is_enabled_use_jit_filter()}, _sources{std::move(sources)}, - _pass_page_mask{cudf::detail::make_host_vector(0, _stream)}, _subpass_page_mask{cudf::detail::hostdevice_vector(0, _stream)}, _output_chunk_read_limit{chunk_read_limit}, _input_pass_read_limit{pass_read_limit} diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 324df2f46b7..38204bdb37e 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -435,10 +435,10 @@ class reader_impl { // _output_buffers associated schema indices std::vector _output_column_schemas; - // Page mask for filtering out pass data pages - cudf::detail::host_vector _pass_page_mask; + // Page mask for filtering out pass data pages (Not copied to the device) + thrust::host_vector _pass_page_mask; - // Page mask for filtering out subpass data pages + // Page mask for filtering out subpass data pages (Copied to the device) cudf::detail::hostdevice_vector _subpass_page_mask; // _output_buffers associated metadata
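For readers following the series end to end, here is a compact host-side sketch of the structure all of these patches manipulate, with illustrative names (build_levels, any_in_range); it uses a simpler start-side-only decomposition than the two-sided align_range_boundary in the device functor, and assumes 0 <= start < end <= row_mask.size():

#include <algorithm>
#include <bit>
#include <cstddef>
#include <vector>

// Level 0 is the row mask; level k ORs adjacent pairs of level k - 1.
// Construction stops once a level's block size would exceed max_page_size,
// mirroring the cutoff added to compute_fenwick_tree_level_offsets: queries
// span at most one page, so deeper levels would never be consulted.
std::vector<std::vector<bool>> build_levels(std::vector<bool> const& row_mask,
                                            int max_page_size)
{
  std::vector<std::vector<bool>> levels{row_mask};
  while (levels.back().size() > 1 && (1 << levels.size()) <= max_page_size) {
    auto const& prev = levels.back();
    std::vector<bool> next((prev.size() + 1) / 2, false);
    for (std::size_t i = 0; i < next.size(); ++i) {
      next[i] = prev[2 * i] || (2 * i + 1 < prev.size() && prev[2 * i + 1]);
    }
    levels.push_back(std::move(next));
  }
  return levels;
}

// True iff any element of row_mask[start, end) is set: each step consumes
// the largest power-of-two block that is aligned at `start`, fits in the
// remaining range, and has a precomputed level, so a page-sized query
// touches O(log(end - start)) blocks instead of end - start elements.
bool any_in_range(std::vector<std::vector<bool>> const& levels, int start, int end)
{
  int const max_level = static_cast<int>(levels.size()) - 1;
  while (start < end) {
    int level = (start == 0) ? max_level : std::countr_zero(static_cast<unsigned>(start));
    level     = std::min(level, max_level);
    while ((1 << level) > end - start) { --level; }
    if (levels[level][static_cast<std::size_t>(start) >> level]) { return true; }
    start += 1 << level;
  }
  return false;
}

A page covering rows [page_row_offsets[i], page_row_offsets[i + 1]) is kept exactly when any_in_range is true for that interval; the device-side search_fenwick_tree_functor answers the same per-range question in parallel over all pages.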