-
Notifications
You must be signed in to change notification settings - Fork 1.4k
[ntuple] Account for aliased pages in on-disk size calculation #19460
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,6 +93,7 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const | |
std::uint64_t nBytesOnStorage = 0; | ||
std::uint64_t nBytesInMemory = 0; | ||
std::uint64_t nPages = 0; | ||
std::unordered_set<std::uint64_t> seenPages{}; | ||
int compression = -1; | ||
for (const auto &column : fColumnDescriptors) { | ||
// Alias columns (columns of projected fields) don't contribute to the storage consumption. Count them | ||
|
@@ -124,15 +125,22 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const | |
} | ||
const auto &pageRange = cluster.second.GetPageRange(column.second.GetPhysicalId()); | ||
auto idx = cluster2Idx[cluster.first]; | ||
std::uint64_t locatorOffset; | ||
for (const auto &page : pageRange.GetPageInfos()) { | ||
nBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
nBytesInMemory += page.GetNElements() * elementSize; | ||
clusters[idx].fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
clusters[idx].fNBytesInMemory += page.GetNElements() * elementSize; | ||
++clusters[idx].fNPages; | ||
info.fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
++info.fNPages; | ||
++nPages; | ||
locatorOffset = page.GetLocator().GetType() == ROOT::RNTupleLocator::ELocatorType::kTypeDAOS | ||
? page.GetLocator().GetPosition<RNTupleLocatorObject64>().GetLocation() | ||
: page.GetLocator().GetPosition<std::uint64_t>(); | ||
auto [_, pageAdded] = seenPages.emplace(locatorOffset); | ||
if (pageAdded) { | ||
nBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
nBytesInMemory += page.GetNElements() * elementSize; | ||
clusters[idx].fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
clusters[idx].fNBytesInMemory += page.GetNElements() * elementSize; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here, I think I'd always sum up the in-memory size |
||
++clusters[idx].fNPages; | ||
info.fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here it gets tricky. As is, deduplicated pages count for none of the columns. I'd rather err the other way and account for all of the pages (aliased or not) for the per-column statistics. If we want to do it correctly, I think we need to keep a |
||
++info.fNPages; | ||
++nPages; | ||
} | ||
} | ||
} | ||
columns.emplace_back(info); | ||
|
@@ -168,8 +176,11 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const | |
<< clusters[i].fFirstEntry + clusters[i].fNEntries - 1 << "] -- " << clusters[i].fNEntries << "\n"; | ||
output << " " << " # Pages: " << clusters[i].fNPages << "\n"; | ||
output << " " << " Size on storage: " << clusters[i].fNBytesOnStorage << " B\n"; | ||
output << " " << " Compression: " << std::fixed << std::setprecision(2) | ||
<< float(clusters[i].fNBytesInMemory) / float(float(clusters[i].fNBytesOnStorage)) << std::endl; | ||
output << " " << " Compression: " << std::fixed << std::setprecision(2); | ||
if (clusters[i].fNPages > 0) | ||
output << float(clusters[i].fNBytesInMemory) / float(float(clusters[i].fNBytesOnStorage)) << std::endl; | ||
else | ||
output << "N/A" << std::endl; | ||
} | ||
|
||
output << "------------------------------------------------------------\n"; | ||
|
@@ -198,8 +209,11 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const | |
output << " Avg elements / page: " << avgElementsPerPage << "\n"; | ||
output << " Avg page size: " << avgPageSize << " B\n"; | ||
output << " Size on storage: " << col.fNBytesOnStorage << " B\n"; | ||
output << " Compression: " << std::fixed << std::setprecision(2) | ||
<< float(col.fElementSize * col.fNElements) / float(col.fNBytesOnStorage) << "\n"; | ||
output << " Compression: " << std::fixed << std::setprecision(2); | ||
if (col.fNPages > 0) | ||
output << float(col.fElementSize * col.fNElements) / float(col.fNBytesOnStorage) << std::endl; | ||
else | ||
output << "N/A" << std::endl; | ||
output << "............................................................" << std::endl; | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think for the in-memory summary, we should not take into account the page deduplication (because we don't deduplicate in memory).