diff --git a/tree/ntuple/src/RNTupleDescriptorFmt.cxx b/tree/ntuple/src/RNTupleDescriptorFmt.cxx index a603dba909efb..6a4d86e352598 100644 --- a/tree/ntuple/src/RNTupleDescriptorFmt.cxx +++ b/tree/ntuple/src/RNTupleDescriptorFmt.cxx @@ -25,7 +25,8 @@ namespace { struct ClusterInfo { std::uint64_t fFirstEntry = 0; - std::uint32_t fNPages = 0; + std::uint64_t fNPhysicalPages = 0; + std::uint64_t fNAliasedPages = 0; std::uint32_t fNEntries = 0; std::uint32_t fNBytesOnStorage = 0; std::uint32_t fNBytesInMemory = 0; @@ -40,7 +41,8 @@ struct ColumnInfo { ROOT::DescriptorId_t fLogicalColumnId = 0; ROOT::DescriptorId_t fFieldId = 0; std::uint64_t fNElements = 0; - std::uint64_t fNPages = 0; + std::uint64_t fNPhysicalPages = 0; + std::uint64_t fNAliasedPages = 0; std::uint64_t fNBytesOnStorage = 0; std::uint32_t fElementSize = 0; std::uint32_t fColumnIndex = 0; @@ -92,7 +94,9 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const std::uint64_t nBytesOnStorage = 0; std::uint64_t nBytesInMemory = 0; - std::uint64_t nPages = 0; + std::uint64_t nPhysicalPages = 0; + std::uint64_t nAliasedPages = 0; + std::unordered_set<std::uint64_t> seenPages{}; int compression = -1; for (const auto &column : fColumnDescriptors) { // Alias columns (columns of projected fields) don't contribute to the storage consumption. 
Count them @@ -124,15 +128,26 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const } const auto &pageRange = cluster.second.GetPageRange(column.second.GetPhysicalId()); auto idx = cluster2Idx[cluster.first]; + std::uint64_t locatorOffset; for (const auto &page : pageRange.GetPageInfos()) { - nBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); - nBytesInMemory += page.GetNElements() * elementSize; - clusters[idx].fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); - clusters[idx].fNBytesInMemory += page.GetNElements() * elementSize; - ++clusters[idx].fNPages; - info.fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); - ++info.fNPages; - ++nPages; + locatorOffset = page.GetLocator().GetType() == ROOT::RNTupleLocator::ELocatorType::kTypeDAOS + ? page.GetLocator().GetPosition<ROOT::RNTupleLocatorObject64>().GetLocation() + : page.GetLocator().GetPosition<std::uint64_t>(); + auto [_, pageAdded] = seenPages.emplace(locatorOffset); + if (pageAdded) { + nBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); + nBytesInMemory += page.GetNElements() * elementSize; + clusters[idx].fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); + clusters[idx].fNBytesInMemory += page.GetNElements() * elementSize; + ++clusters[idx].fNPhysicalPages; + info.fNBytesOnStorage += page.GetLocator().GetNBytesOnStorage(); + ++info.fNPhysicalPages; + ++nPhysicalPages; + } else { + ++clusters[idx].fNAliasedPages; + ++info.fNAliasedPages; + ++nAliasedPages; + } } } columns.emplace_back(info); @@ -147,7 +162,8 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const output << " # Fields: " << GetNFields() << "\n"; output << " # Columns: " << GetNPhysicalColumns() << "\n"; output << " # Alias Columns: " << GetNLogicalColumns() - GetNPhysicalColumns() << "\n"; - output << " # Pages: " << nPages << "\n"; + output << " # Physical Pages: " << nPhysicalPages << "\n"; + output << " # Aliased Pages: " << nAliasedPages << "\n"; output << " # Clusters: " << GetNClusters() << "\n"; output << 
" Size on storage: " << nBytesOnStorage << " B" << "\n"; output << " Compression rate: " << std::fixed << std::setprecision(2) @@ -164,12 +180,16 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const std::sort(clusters.begin(), clusters.end()); for (unsigned int i = 0; i < clusters.size(); ++i) { - output << " # " << std::setw(5) << i << " Entry range: [" << clusters[i].fFirstEntry << ".." + output << " # " << std::setw(5) << i << " Entry range: [" << clusters[i].fFirstEntry << ".." << clusters[i].fFirstEntry + clusters[i].fNEntries - 1 << "] -- " << clusters[i].fNEntries << "\n"; - output << " " << " # Pages: " << clusters[i].fNPages << "\n"; - output << " " << " Size on storage: " << clusters[i].fNBytesOnStorage << " B\n"; - output << " " << " Compression: " << std::fixed << std::setprecision(2) - << float(clusters[i].fNBytesInMemory) / float(float(clusters[i].fNBytesOnStorage)) << std::endl; + output << " " << " # Physical Pages: " << clusters[i].fNPhysicalPages << "\n"; + output << " " << " # Aliased Pages: " << clusters[i].fNAliasedPages << "\n"; + output << " " << " Size on storage: " << clusters[i].fNBytesOnStorage << " B\n"; + output << " " << " Compression: " << std::fixed << std::setprecision(2); + if (clusters[i].fNPhysicalPages > 0) + output << float(clusters[i].fNBytesInMemory) / float(float(clusters[i].fNBytesOnStorage)) << std::endl; + else + output << "N/A" << std::endl; } output << "------------------------------------------------------------\n"; @@ -181,8 +201,8 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const } std::sort(columns.begin(), columns.end()); for (const auto &col : columns) { - auto avgPageSize = (col.fNPages == 0) ? 0 : (col.fNBytesOnStorage / col.fNPages); - auto avgElementsPerPage = (col.fNPages == 0) ? 0 : (col.fNElements / col.fNPages); + auto avgPageSize = (col.fNPhysicalPages == 0) ? 0 : (col.fNBytesOnStorage / col.fNPhysicalPages); + auto avgElementsPerPage = (col.fNPhysicalPages == 0) ? 
0 : (col.fNElements / col.fNPhysicalPages); std::string nameAndType = std::string(" ") + col.fFieldName + " [#" + std::to_string(col.fColumnIndex); if (col.fRepresentationIndex > 0) nameAndType += " / R." + std::to_string(col.fRepresentationIndex); @@ -194,12 +214,16 @@ void ROOT::RNTupleDescriptor::PrintInfo(std::ostream &output) const if (!col.fFieldDescription.empty()) output << " Description: " << col.fFieldDescription << "\n"; output << " # Elements: " << col.fNElements << "\n"; - output << " # Pages: " << col.fNPages << "\n"; + output << " # Physical Pages: " << col.fNPhysicalPages << "\n"; + output << " # Aliased Pages: " << col.fNAliasedPages << "\n"; output << " Avg elements / page: " << avgElementsPerPage << "\n"; output << " Avg page size: " << avgPageSize << " B\n"; output << " Size on storage: " << col.fNBytesOnStorage << " B\n"; - output << " Compression: " << std::fixed << std::setprecision(2) - << float(col.fElementSize * col.fNElements) / float(col.fNBytesOnStorage) << "\n"; + output << " Compression: " << std::fixed << std::setprecision(2); + if (col.fNPhysicalPages > 0) + output << float(col.fElementSize * col.fNElements) / float(col.fNBytesOnStorage) << std::endl; + else + output << "N/A" << std::endl; output << "............................................................" 
<< std::endl; } } diff --git a/tree/ntuple/test/ntuple_multi_column.cxx b/tree/ntuple/test/ntuple_multi_column.cxx index b62a16e27e958..0874bd3263289 100644 --- a/tree/ntuple/test/ntuple_multi_column.cxx +++ b/tree/ntuple/test/ntuple_multi_column.cxx @@ -95,7 +95,8 @@ TEST(RNTuple, MultiColumnRepresentationSimple) " # Fields: 2\n" " # Columns: 2\n" " # Alias Columns: 0\n" - " # Pages: 3\n" + " # Physical Pages: 3\n" + " # Aliased Pages: 0\n" " # Clusters: 3\n" " Size on storage: .* B\n" " Compression rate: .*\n" @@ -105,24 +106,28 @@ TEST(RNTuple, MultiColumnRepresentationSimple) "------------------------------------------------------------\n" "CLUSTER DETAILS\n" "------------------------------------------------------------\n" - " # 0 Entry range: .0..0. -- 1\n" - " # Pages: 1\n" - " Size on storage: 4 B\n" - " Compression: 1.00\n" - " # 1 Entry range: .1..1. -- 1\n" - " # Pages: 1\n" - " Size on storage: 2 B\n" - " Compression: 2.00\n" - " # 2 Entry range: .2..2. -- 1\n" - " # Pages: 1\n" - " Size on storage: 4 B\n" - " Compression: 1.00\n" + " # 0 Entry range: .0..0. -- 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 0\n" + " Size on storage: 4 B\n" + " Compression: 1.00\n" + " # 1 Entry range: .1..1. -- 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 0\n" + " Size on storage: 2 B\n" + " Compression: 2.00\n" + " # 2 Entry range: .2..2. -- 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 0\n" + " Size on storage: 4 B\n" + " Compression: 1.00\n" "------------------------------------------------------------\n" "COLUMN DETAILS\n" "------------------------------------------------------------\n" " px .#0. -- Real32 .id:0.\n" " # Elements: 2\n" - " # Pages: 2\n" + " # Physical Pages: 2\n" + " # Aliased Pages: 0\n" " Avg elements / page: 1\n" " Avg page size: 4 B\n" " Size on storage: 8 B\n" @@ -130,7 +135,8 @@ TEST(RNTuple, MultiColumnRepresentationSimple) "............................................................\n" " px .#0 / R.1. 
-- Real16 .id:1.\n" " # Elements: 1\n" - " # Pages: 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 0\n" " Avg elements / page: 1\n" " Avg page size: 2 B\n" " Size on storage: 2 B\n" diff --git a/tree/ntuple/test/ntuple_print.cxx b/tree/ntuple/test/ntuple_print.cxx index 1ac73392ff0d5..f499eb2c8202d 100644 --- a/tree/ntuple/test/ntuple_print.cxx +++ b/tree/ntuple/test/ntuple_print.cxx @@ -40,9 +40,10 @@ TEST(RNtuplePrint, FullString) " # Fields: 4\n" " # Columns: 2\n" " # Alias Columns: 1\n" - " # Pages: 2\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 1\n" " # Clusters: 1\n" - " Size on storage: 8 B\n" + " Size on storage: 4 B\n" " Compression rate: 1.00\n" " Header size: .* B\n" " Footer size: .* B\n" @@ -50,24 +51,27 @@ TEST(RNtuplePrint, FullString) "------------------------------------------------------------\n" "CLUSTER DETAILS\n" "------------------------------------------------------------\n" - " # 0 Entry range: .0..0. -- 1\n" - " # Pages: 2\n" - " Size on storage: 8 B\n" - " Compression: 1.00\n" + " # 0 Entry range: .0..0. -- 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 1\n" + " Size on storage: 4 B\n" + " Compression: 1.00\n" "------------------------------------------------------------\n" "COLUMN DETAILS\n" "------------------------------------------------------------\n" " px .#0. -- Real32 .id:0.\n" " # Elements: 1\n" - " # Pages: 1\n" - " Avg elements / page: 1\n" - " Avg page size: 4 B\n" - " Size on storage: 4 B\n" - " Compression: 1.00\n" + " # Physical Pages: 0\n" + " # Aliased Pages: 1\n" + " Avg elements / page: 0\n" + " Avg page size: 0 B\n" + " Size on storage: 0 B\n" + " Compression: N/A\n" "............................................................\n" " py .#0. 
-- Real32 .id:1.\n" " # Elements: 1\n" - " # Pages: 1\n" + " # Physical Pages: 1\n" + " # Aliased Pages: 0\n" " Avg elements / page: 1\n" " Avg page size: 4 B\n" " Size on storage: 4 B\n" diff --git a/tree/ntupleutil/v7/src/RNTupleInspector.cxx b/tree/ntupleutil/v7/src/RNTupleInspector.cxx index 3809f5af3eee1..34250d09727cc 100644 --- a/tree/ntupleutil/v7/src/RNTupleInspector.cxx +++ b/tree/ntupleutil/v7/src/RNTupleInspector.cxx @@ -60,6 +60,7 @@ void ROOT::Experimental::RNTupleInspector::CollectColumnInfo() // to report the size _in memory_ of column elements. std::uint32_t elemSize = RColumnElementBase::Generate(colDesc.GetType())->GetSize(); std::uint64_t nElems = 0; + std::unordered_set<std::uint64_t> seenPages{}; std::vector<std::uint64_t> compressedPageSizes{}; for (const auto &clusterDescriptor : fDescriptor.GetClusterIterable()) { @@ -88,8 +89,16 @@ void ROOT::Experimental::RNTupleInspector::CollectColumnInfo() const auto &pageRange = clusterDescriptor.GetPageRange(colId); + std::uint64_t locatorOffset; for (const auto &page : pageRange.GetPageInfos()) { - compressedPageSizes.emplace_back(page.GetLocator().GetNBytesOnStorage()); + locatorOffset = page.GetLocator().GetType() == ROOT::RNTupleLocator::ELocatorType::kTypeDAOS + ? 
page.GetLocator().GetPosition<ROOT::RNTupleLocatorObject64>().GetLocation() + : page.GetLocator().GetPosition<std::uint64_t>(); + auto [_, pageAdded] = seenPages.emplace(locatorOffset); + if (pageAdded) { + compressedPageSizes.emplace_back(page.GetLocator().GetNBytesOnStorage()); + } + // For the moment, we actually load and decompress aliased pages multiple times fUncompressedSize += page.GetNElements() * elemSize; } } diff --git a/tree/ntupleutil/v7/test/ntuple_inspector.cxx b/tree/ntupleutil/v7/test/ntuple_inspector.cxx index 0ce07842c45c4..fca286391da41 100644 --- a/tree/ntupleutil/v7/test/ntuple_inspector.cxx +++ b/tree/ntupleutil/v7/test/ntuple_inspector.cxx @@ -263,6 +263,30 @@ TEST(RNTupleInspector, SizeProjectedFields) EXPECT_EQ(inspector->GetFieldTreeInspector("muonPt").GetCompressedSize(), inspector->GetCompressedSize()); } +TEST(RNTupleInspector, SizeSamePageMerging) +{ + FileRaii fileGuard("test_ntuple_inspector_size_same_page_merging.root"); + { + auto model = RNTupleModel::Create(); + auto nFldInt = model->MakeField<std::int32_t>("int"); + + auto writeOptions = RNTupleWriteOptions(); + writeOptions.SetCompression(0); + writeOptions.SetInitialUnzippedPageSize(16); + writeOptions.SetMaxUnzippedPageSize(16); + auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", fileGuard.GetPath(), writeOptions); + + for (int32_t i = 0; i < 64; ++i) { + *nFldInt = 0; + ntuple->Fill(); + } + } + + auto inspector = RNTupleInspector::Create("ntuple", fileGuard.GetPath()); + EXPECT_EQ(inspector->GetUncompressedSize(), 256); + EXPECT_EQ(inspector->GetCompressedSize(), 16); +} + TEST(RNTupleInspector, ColumnInfoCompressed) { FileRaii fileGuard("test_ntuple_inspector_column_info_compressed.root");