Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,27 @@ inline void CheckNumberDecoded(int64_t number_decoded, int64_t expected) {
constexpr std::string_view kErrorRepDefLevelNotMatchesNumValues =
"Number of decoded rep / def levels do not match num_values in page header";

// Tallies, over levels[0 .. num_levels), the number of bits set in the bitmaps
// produced by internal::GreaterThanBitmap for the threshold max_def_level - 1.
//
// levels         pointer to the first of num_levels int16_t level values.
// num_levels     number of entries to examine; values <= 0 yield 0.
// max_def_level  the threshold passed on is max_def_level - 1.
//
// NOTE(review): internal::GreaterThanBitmap is defined elsewhere; presumably it
// sets bit i when levels[i] > rhs, which would make this a count of levels
// >= max_def_level. That equals an exact-match count only if no level exceeds
// max_def_level -- confirm against the helper and the caller's expectations.
inline int64_t CountMaxDefLevels(const int16_t* levels, int64_t num_levels,
int16_t max_def_level) {
// Nothing to count for an empty (or negative) length.
if (num_levels <= 0) {
return 0;
}
// max_def_level - 1 is computed in int, so it is narrowed back to int16_t.
const int16_t rhs = static_cast<int16_t>(max_def_level - 1);
int64_t count = 0;
int64_t offset = 0;
// Full chunks: the bitmap is a uint64_t, so at most 64 levels go into each
// GreaterThanBitmap call.
for (; offset + 64 <= num_levels; offset += 64) {
const uint64_t bitmap =
internal::GreaterThanBitmap(levels + offset, 64, rhs);
count += static_cast<int64_t>(bit_util::PopCount(bitmap));
}
// Tail: the remaining num_levels - offset levels (fewer than 64), if any.
if (offset < num_levels) {
const uint64_t bitmap =
internal::GreaterThanBitmap(levels + offset, num_levels - offset, rhs);
count += static_cast<int64_t>(bit_util::PopCount(bitmap));
}
Comment on lines +103 to +113
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- int64_t offset = 0;
- for (; offset + 64 <= num_levels; offset += 64) {
- const uint64_t bitmap =
- internal::GreaterThanBitmap(levels + offset, 64, rhs);
- count += static_cast<int64_t>(bit_util::PopCount(bitmap));
- }
- if (offset < num_levels) {
- const uint64_t bitmap =
- internal::GreaterThanBitmap(levels + offset, num_levels - offset, rhs);
- count += static_cast<int64_t>(bit_util::PopCount(bitmap));
- }
+ for (int64_t offset = 0; offset < num_levels; offset += 64) {
+ const int64_t chunk_size = std::min<int64_t>(64, num_levels - offset);
+ const uint64_t bitmap =
+ internal::GreaterThanBitmap(levels + offset, static_cast<int>(chunk_size), rhs);
+ count += static_cast<int64_t>(bit_util::PopCount(bitmap));
+ }

I suggest simplifying the two loops into one.

return count;
}

} // namespace

LevelDecoder::LevelDecoder() : num_values_remaining_(0) {}
Expand Down Expand Up @@ -1010,10 +1031,8 @@ class TypedColumnReaderImpl : public TypedColumnReader<DType>,
if (ARROW_PREDICT_FALSE(*num_def_levels != batch_size)) {
throw ParquetException(kErrorRepDefLevelNotMatchesNumValues);
}
// TODO(wesm): this tallying of values-to-decode can be performed with better
// cache-efficiency if fused with the level decoding.
*non_null_values_to_read +=
std::count(def_levels, def_levels + *num_def_levels, this->max_def_level_);
CountMaxDefLevels(def_levels, *num_def_levels, this->max_def_level_);
} else {
// Required field, read all values
if (num_def_levels != nullptr) {
Expand Down