From 826971284c5d15ec40fe130e98011821b2e30339 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Fri, 18 Apr 2025 07:58:41 +0200 Subject: [PATCH 01/36] Include arrow as a dependency --- CMakeLists.txt | 8 +++++++- vcpkg.json | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fa3d277..da595254 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ option(REFLECTCPP_CAPNPROTO "Enable Cap’n Proto support" OFF) option(REFLECTCPP_CBOR "Enable CBOR support" OFF) option(REFLECTCPP_FLEXBUFFERS "Enable flexbuffers support" OFF) option(REFLECTCPP_MSGPACK "Enable msgpack support" OFF) +option(REFLECTCPP_PARQUET "Enable parquet support" OFF) option(REFLECTCPP_XML "Enable XML support" OFF) option(REFLECTCPP_TOML "Enable TOML support" OFF) option(REFLECTCPP_UBJSON "Enable UBJSON support" OFF) @@ -53,7 +54,8 @@ endif() if (REFLECTCPP_BUILD_TESTS OR REFLECTCPP_BUILD_BENCHMARKS OR (REFLECTCPP_JSON AND NOT REFLECTCPP_USE_BUNDLED_DEPENDENCIES) OR REFLECTCPP_AVRO OR REFLECTCPP_BSON OR REFLECTCPP_CAPNPROTO OR REFLECTCPP_CBOR OR REFLECTCPP_FLEXBUFFERS OR - REFLECTCPP_MSGPACK OR REFLECTCPP_XML OR REFLECTCPP_TOML OR REFLECTCPP_UBJSON OR REFLECTCPP_YAML) + REFLECTCPP_MSGPACK OR REFLECTCPP_PARQUET OR REFLECTCPP_XML OR + REFLECTCPP_TOML OR REFLECTCPP_UBJSON OR REFLECTCPP_YAML) # enable vcpkg per default if features other than JSON are required set(REFLECTCPP_USE_VCPKG_DEFAULT ON) endif() @@ -101,6 +103,10 @@ if (REFLECTCPP_USE_VCPKG) list(APPEND VCPKG_MANIFEST_FEATURES "msgpack") endif() + if (REFLECTCPP_PARQUET) + list(APPEND VCPKG_MANIFEST_FEATURES "parquet") + endif() + if (REFLECTCPP_TOML) list(APPEND VCPKG_MANIFEST_FEATURES "toml") endif() diff --git a/vcpkg.json b/vcpkg.json index f42e3521..466ae79d 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -110,6 +110,16 @@ } ] }, + "parquet": { + "description": "Enable parquet support", + "dependencies": [ + { + "name": "arrow", + "version>=": "19.0.1", + 
"features": ["parquet"] + } + ] + }, "tests": { "description": "Compile the tests", "dependencies": [ From e83e8893930ea6564ea2d48556723934e6c24459 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Fri, 18 Apr 2025 08:57:45 +0200 Subject: [PATCH 02/36] Began writing the parquet interface --- include/rfl/parquet/Parser.hpp | 15 ++++ include/rfl/parquet/Reader.hpp | 146 +++++++++++++++++++++++++++++++++ include/rfl/parquet/Writer.hpp | 60 ++++++++++++++ include/rfl/parquet/load.hpp | 20 +++++ include/rfl/parquet/read.hpp | 55 +++++++++++++ include/rfl/parquet/save.hpp | 24 ++++++ include/rfl/parquet/write.hpp | 44 ++++++++++ 7 files changed, 364 insertions(+) create mode 100644 include/rfl/parquet/Parser.hpp create mode 100644 include/rfl/parquet/Reader.hpp create mode 100644 include/rfl/parquet/Writer.hpp create mode 100644 include/rfl/parquet/load.hpp create mode 100644 include/rfl/parquet/read.hpp create mode 100644 include/rfl/parquet/save.hpp create mode 100644 include/rfl/parquet/write.hpp diff --git a/include/rfl/parquet/Parser.hpp b/include/rfl/parquet/Parser.hpp new file mode 100644 index 00000000..c4149b97 --- /dev/null +++ b/include/rfl/parquet/Parser.hpp @@ -0,0 +1,15 @@ +#ifndef RFL_PARQUET_PARSER_HPP_ +#define RFL_PARQUET_PARSER_HPP_ + +#include "../parsing/Parser.hpp" +#include "Reader.hpp" +#include "Writer.hpp" + +namespace rfl::parquet { + +template +using Parser = parsing::Parser; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/Reader.hpp b/include/rfl/parquet/Reader.hpp new file mode 100644 index 00000000..dfce8595 --- /dev/null +++ b/include/rfl/parquet/Reader.hpp @@ -0,0 +1,146 @@ +#ifndef RFL_PARQUET_READER_HPP_ +#define RFL_PARQUET_READER_HPP_ + +#include + +#include +#include +#include +#include +#include + +#include "../Bytestring.hpp" +#include "../Result.hpp" +#include "../always_false.hpp" +#include "../internal/ptr_cast.hpp" + +namespace rfl::parquet { + +struct Reader { + using InputArrayType = 
parquet_object_array; + using InputObjectType = parquet_object_map; + using InputVarType = parquet_object; + + template + static constexpr bool has_custom_constructor = + (requires(InputVarType var) { T::from_parquet_obj(var); }); + + rfl::Result get_field_from_array( + const size_t _idx, const InputArrayType _arr) const noexcept { + if (_idx >= _arr.size) { + return error("Index " + std::to_string(_idx) + " of of bounds."); + } + return _arr.ptr[_idx]; + } + + rfl::Result get_field_from_object( + const std::string& _name, const InputObjectType& _obj) const noexcept { + for (uint32_t i = 0; i < _obj.size; ++i) { + const auto& key = _obj.ptr[i].key; + if (key.type != PARQUET_OBJECT_STR) { + return error("Key in element " + std::to_string(i) + + " was not a string."); + } + const auto current_name = + std::string_view(key.via.str.ptr, key.via.str.size); + if (_name == current_name) { + return _obj.ptr[i].val; + } + } + return error("No field named '" + _name + "' was found."); + } + + bool is_empty(const InputVarType& _var) const noexcept { + return _var.type == PARQUET_OBJECT_NIL; + } + + template + rfl::Result to_basic_type(const InputVarType& _var) const noexcept { + const auto type = _var.type; + if constexpr (std::is_same, std::string>()) { + if (type != PARQUET_OBJECT_STR) { + return error("Could not cast to string."); + } + const auto str = _var.via.str; + return std::string(str.ptr, str.size); + + } else if constexpr (std::is_same, + rfl::Bytestring>()) { + if (type != PARQUET_OBJECT_BIN) { + return error("Could not cast to a bytestring."); + } + const auto bin = _var.via.bin; + const auto data = internal::ptr_cast(bin.ptr); + return rfl::Bytestring(data, data + bin.size); + + } else if constexpr (std::is_same, bool>()) { + if (type != PARQUET_OBJECT_BOOLEAN) { + return error("Could not cast to boolean."); + } + return _var.via.boolean; + + } else if constexpr (std::is_floating_point>()) { + if (type == PARQUET_OBJECT_FLOAT32 || type == PARQUET_OBJECT_FLOAT64 
|| + type == PARQUET_OBJECT_FLOAT) { + return static_cast(_var.via.f64); + } + return error( + "Could not cast to numeric value. The type must be float " + "or double."); + + } else if constexpr (std::is_integral>()) { + if (type == PARQUET_OBJECT_POSITIVE_INTEGER) { + return static_cast(_var.via.u64); + } else if (type == PARQUET_OBJECT_NEGATIVE_INTEGER) { + return static_cast(_var.via.i64); + } + return error( + "Could not cast to numeric value. The type must be integral, float " + "or double."); + } else { + static_assert(rfl::always_false_v, "Unsupported type."); + } + } + + template + std::optional read_array(const ArrayReader& _array_reader, + const InputArrayType& _arr) const noexcept { + for (uint32_t i = 0; i < _arr.size; ++i) { + const auto err = _array_reader.read(_arr.ptr[i]); + if (err) { + return err; + } + } + return std::nullopt; + } + + template + std::optional read_object(const ObjectReader& _object_reader, + const InputObjectType& _obj) const noexcept { + for (uint32_t i = 0; i < _obj.size; ++i) { + const auto& key = _obj.ptr[i].key; + const auto& val = _obj.ptr[i].val; + if (key.type != PARQUET_OBJECT_STR) { + return rfl::Error("Key in element " + std::to_string(i) + + " was not a string."); + } + const auto name = std::string_view(key.via.str.ptr, key.via.str.size); + _object_reader.read(name, val); + } + return std::nullopt; + } + + template + rfl::Result use_custom_constructor( + const InputVarType& _var) const noexcept { + try { + return T::from_parquet_obj(_var); + } catch (std::exception& e) { + return error(e.what()); + } + } +}; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/Writer.hpp b/include/rfl/parquet/Writer.hpp new file mode 100644 index 00000000..d39e1b67 --- /dev/null +++ b/include/rfl/parquet/Writer.hpp @@ -0,0 +1,60 @@ +#ifndef RFL_PARQUET_WRITER_HPP_ +#define RFL_PARQUET_WRITER_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"../Box.hpp" +#include "../Bytestring.hpp" +#include "../Ref.hpp" +#include "../Result.hpp" +#include "../always_false.hpp" + +namespace rfl::parquet { + +class Writer { + public: + using OutputArrayType = MsgpackOutputArray; + using OutputObjectType = MsgpackOutputObject; + using OutputVarType = MsgpackOutputVar; + + Writer(); + + ~Writer(); + + OutputArrayType array_as_root(const size_t _size) const noexcept; + + OutputObjectType add_object_to_array(const size_t _size, + OutputArrayType* _parent) const noexcept; + + OutputObjectType add_object_to_object( + const std::string_view& _name, const size_t _size, + OutputObjectType* _parent) const noexcept; + + template + OutputVarType add_value_to_object(const std::string_view& _name, + const T& _var, + OutputObjectType* _parent) const noexcept; + + OutputVarType add_null_to_object(const std::string_view& _name, + OutputObjectType* _parent) const noexcept; + + void end_array(OutputArrayType* _arr) const noexcept; + + void end_object(OutputObjectType* _obj) const noexcept; + + private: +}; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/load.hpp b/include/rfl/parquet/load.hpp new file mode 100644 index 00000000..4d4490d8 --- /dev/null +++ b/include/rfl/parquet/load.hpp @@ -0,0 +1,20 @@ +#ifndef RFL_PARQUET_LOAD_HPP_ +#define RFL_PARQUET_LOAD_HPP_ + +#include "../Result.hpp" +#include "../io/load_bytes.hpp" +#include "read.hpp" + +namespace rfl ::parquet { + +template +Result load(const std::string& _fname) { + const auto read_bytes = [](const auto& _bytes) { + return read(_bytes); + }; + return rfl::io::load_bytes(_fname).and_then(read_bytes); +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp new file mode 100644 index 00000000..7a241081 --- /dev/null +++ b/include/rfl/parquet/read.hpp @@ -0,0 +1,55 @@ +#ifndef RFL_PARQUET_READ_HPP_ +#define RFL_PARQUET_READ_HPP_ + +#include + +#include +#include + +#include "../Processors.hpp" 
+#include "../internal/wrap_in_rfl_array_t.hpp" +#include "Parser.hpp" +#include "Reader.hpp" + +namespace rfl::parquet { + +using InputObjectType = typename Reader::InputObjectType; +using InputVarType = typename Reader::InputVarType; + +/// Parses an object from a PARQUET var. +template +auto read(const InputVarType& _obj) { + const auto r = Reader(); + return Parser>::read(r, _obj); +} + +/// Parses an object from PARQUET using reflection. +template +Result> read(const char* _bytes, + const size_t _size) { + parquet_zone mempool; + parquet_zone_init(&mempool, 2048); + parquet_object deserialized; + parquet_unpack(_bytes, _size, NULL, &mempool, &deserialized); + auto r = read(deserialized); + parquet_zone_destroy(&mempool); + return r; +} + +/// Parses an object from PARQUET using reflection. +template +auto read(const std::vector& _bytes) { + return read(_bytes.data(), _bytes.size()); +} + +/// Parses an object from a stream. +template +auto read(std::istream& _stream) { + std::istreambuf_iterator begin(_stream), end; + auto bytes = std::vector(begin, end); + return read(bytes.data(), bytes.size()); +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/save.hpp b/include/rfl/parquet/save.hpp new file mode 100644 index 00000000..092abe64 --- /dev/null +++ b/include/rfl/parquet/save.hpp @@ -0,0 +1,24 @@ +#ifndef RFL_PARQUET_SAVE_HPP_ +#define RFL_PARQUET_SAVE_HPP_ + +#include +#include +#include + +#include "../Result.hpp" +#include "../io/save_bytes.hpp" +#include "write.hpp" + +namespace rfl::parquet { + +template +Result save(const std::string& _fname, const auto& _obj) { + const auto write_func = [](const auto& _obj, auto& _stream) -> auto& { + return write(_obj, _stream); + }; + return rfl::io::save_bytes(_fname, _obj, write_func); +} + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp new file mode 100644 index 00000000..c5220fab --- /dev/null +++ 
b/include/rfl/parquet/write.hpp @@ -0,0 +1,44 @@ +#ifndef RFL_PARQUET_WRITE_HPP_ +#define RFL_PARQUET_WRITE_HPP_ + +#include + +#include +#include +#include +#include +#include + +#include "../Processors.hpp" +#include "../parsing/Parent.hpp" +#include "Parser.hpp" + +namespace rfl::parquet { + +/// Returns parquet bytes. +template +std::vector write(const auto& _obj) noexcept { + using T = std::remove_cvref_t; + using ParentType = parsing::Parent; + parquet_sbuffer sbuf; + parquet_sbuffer_init(&sbuf); + parquet_packer pk; + parquet_packer_init(&pk, &sbuf, parquet_sbuffer_write); + auto w = Writer(&pk); + Parser>::write(w, _obj, typename ParentType::Root{}); + auto bytes = std::vector(sbuf.data, sbuf.data + sbuf.size); + parquet_sbuffer_destroy(&sbuf); + return bytes; +} + +/// Writes a PARQUET into an ostream. +template +std::ostream& write(const auto& _obj, std::ostream& _stream) noexcept { + auto buffer = write(_obj); + _stream.write(buffer.data(), buffer.size()); + return _stream; +} + +} // namespace rfl::parquet + +#endif From 617ec90b5c61169bc2cb7ff2a296c1d6dc01e79b Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sat, 16 Aug 2025 16:53:52 +0200 Subject: [PATCH 03/36] Used more idiomatic solutions for has_reflection_method_v and has_reflection_type_v --- .../rfl/internal/has_reflection_method_v.hpp | 28 ++++------------ .../rfl/internal/has_reflection_type_v.hpp | 33 ++++++------------- 2 files changed, 17 insertions(+), 44 deletions(-) diff --git a/include/rfl/internal/has_reflection_method_v.hpp b/include/rfl/internal/has_reflection_method_v.hpp index 228239a5..963253b1 100644 --- a/include/rfl/internal/has_reflection_method_v.hpp +++ b/include/rfl/internal/has_reflection_method_v.hpp @@ -1,29 +1,15 @@ #ifndef RFL_INTERNAL_HASREFLECTIONMETHODV_HPP_ #define RFL_INTERNAL_HASREFLECTIONMETHODV_HPP_ -#include +#include -namespace rfl { -namespace internal { +namespace rfl::internal { -template -using reflection_method_t = - decltype(std::declval().reflection()); +template +constexpr bool has_reflection_method_v = requires(T t) { + { t.reflection() } -> std::convertible_to; +}; -template > -struct has_refl_m : std::false_type {}; - -template -struct has_refl_m>> - : std::true_type {}; - -/// Utility parameter for named tuple parsing, can be used by the -/// parsers to determine whether a class or struct has a method -/// called "reflection". 
-template -constexpr bool has_reflection_method_v = has_refl_m::value; - -} // namespace internal -} // namespace rfl +} // namespace rfl::internal #endif diff --git a/include/rfl/internal/has_reflection_type_v.hpp b/include/rfl/internal/has_reflection_type_v.hpp index f6cd70a9..c6020138 100644 --- a/include/rfl/internal/has_reflection_type_v.hpp +++ b/include/rfl/internal/has_reflection_type_v.hpp @@ -1,33 +1,20 @@ #ifndef RFL_HASREFLECTIONTYPEV_HPP_ #define RFL_HASREFLECTIONTYPEV_HPP_ -#include -#include +#include -namespace rfl { -namespace internal { +namespace rfl::internal { -template -class HasReflectionType { - private: - template - static std::int64_t foo(...); +template +struct ReflectionTypeWrapper {}; - template - static std::int32_t foo(typename U::ReflectionType*); - - public: - static constexpr bool value = - sizeof(foo(nullptr)) == sizeof(std::int32_t); +template +constexpr bool has_reflection_type_v = requires() { + { + ReflectionTypeWrapper{} + } -> std::same_as>; }; -/// Utility parameter for named tuple parsing, can be used by the -/// parsers to determine whether a class or struct defines a type -/// called "ReflectionType". -template -constexpr bool has_reflection_type_v = HasReflectionType::value; - -} // namespace internal -} // namespace rfl +} // namespace rfl::internal #endif // RFL_HASNAMEDTUPLETYPEV_HPP_ From 9e83f61465665bcac0770e052a34a30dda04410d Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 17 Aug 2025 15:39:55 +0200 Subject: [PATCH 04/36] Basic parquet write appears to work --- CMakeLists.txt | 14 + include/rfl/parquet.hpp | 13 + include/rfl/parquet/Writer.hpp | 73 +++- include/rfl/parquet/write.hpp | 55 ++- include/rfl/parsing/tabular/ArrowWriter.hpp | 101 ++++++ .../rfl/parsing/tabular/add_to_builder.hpp | 19 + .../rfl/parsing/tabular/arrow_builders_t.hpp | 333 ++++++++++++++++++ .../parsing/tabular/make_arrow_data_types.hpp | 18 + .../rfl/parsing/tabular/make_arrow_schema.hpp | 18 + src/reflectcpp_parquet.cpp | 31 ++ src/rfl/parquet/Writer.cpp | 78 ++++ tests/CMakeLists.txt | 4 + tests/parquet/CMakeLists.txt | 21 ++ tests/parquet/test_readme_example.cpp | 41 +++ tests/parquet/write_and_read.hpp | 21 ++ 15 files changed, 817 insertions(+), 23 deletions(-) create mode 100644 include/rfl/parquet.hpp create mode 100644 include/rfl/parsing/tabular/ArrowWriter.hpp create mode 100644 include/rfl/parsing/tabular/add_to_builder.hpp create mode 100644 include/rfl/parsing/tabular/arrow_builders_t.hpp create mode 100644 include/rfl/parsing/tabular/make_arrow_data_types.hpp create mode 100644 include/rfl/parsing/tabular/make_arrow_schema.hpp create mode 100644 src/reflectcpp_parquet.cpp create mode 100644 src/rfl/parquet/Writer.cpp create mode 100644 tests/parquet/CMakeLists.txt create mode 100644 tests/parquet/test_readme_example.cpp create mode 100644 tests/parquet/write_and_read.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 622f5c1c..ff3a99b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -266,6 +266,20 @@ if (REFLECTCPP_MSGPACK) target_link_libraries(reflectcpp PUBLIC msgpack-c) endif () +if (REFLECTCPP_PARQUET) + list(APPEND REFLECT_CPP_SOURCES + src/reflectcpp_parquet.cpp + ) + if (NOT TARGET Arrow) + find_package(Arrow CONFIG REQUIRED) + endif() + if (NOT TARGET Parquet) + find_package(Parquet CONFIG REQUIRED) + endif() + target_link_libraries(reflectcpp PUBLIC "$,Arrow::arrow_static,Arrow::arrow_shared>") + 
target_link_libraries(reflectcpp PUBLIC "$,Parquet::parquet_static,Parquet::parquet_shared>") +endif () + if (REFLECTCPP_TOML) list(APPEND REFLECT_CPP_SOURCES src/reflectcpp_toml.cpp diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp new file mode 100644 index 00000000..8570378a --- /dev/null +++ b/include/rfl/parquet.hpp @@ -0,0 +1,13 @@ +#ifndef RFL_PARQUET_HPP_ +#define RFL_PARQUET_HPP_ + +#include "../rfl.hpp" +// #include "parquet/Parser.hpp" +// #include "parquet/Reader.hpp" +// #include "parquet/Writer.hpp" +// #include "parquet/load.hpp" +// #include "parquet/read.hpp" +// #include "parquet/save.hpp" +#include "parquet/write.hpp" + +#endif diff --git a/include/rfl/parquet/Writer.hpp b/include/rfl/parquet/Writer.hpp index d39e1b67..cb5bc118 100644 --- a/include/rfl/parquet/Writer.hpp +++ b/include/rfl/parquet/Writer.hpp @@ -1,5 +1,5 @@ -#ifndef RFL_PARQUET_WRITER_HPP_ -#define RFL_PARQUET_WRITER_HPP_ +#ifndef RFL_MSGPACK_WRITER_HPP_ +#define RFL_MSGPACK_WRITER_HPP_ #include @@ -17,21 +17,33 @@ #include "../Bytestring.hpp" #include "../Ref.hpp" #include "../Result.hpp" +#include "../Vectorstring.hpp" #include "../always_false.hpp" namespace rfl::parquet { class Writer { public: - using OutputArrayType = MsgpackOutputArray; - using OutputObjectType = MsgpackOutputObject; - using OutputVarType = MsgpackOutputVar; + struct ParquetOutputArray {}; - Writer(); + struct ParquetOutputObject {}; + + struct ParquetOutputVar {}; + + using OutputArrayType = ParquetOutputArray; + using OutputObjectType = ParquetOutputObject; + using OutputVarType = ParquetOutputVar; + + Writer(parquet_packer* _pk); ~Writer(); - OutputArrayType array_as_root(const size_t _size) const noexcept; + OutputArrayType add_array_to_array(const size_t _size, + OutputArrayType* _parent) const noexcept; + + OutputArrayType add_array_to_object(const std::string_view& _name, + const size_t _size, + OutputObjectType* _parent) const noexcept; OutputObjectType add_object_to_array(const size_t 
_size, OutputArrayType* _parent) const noexcept; @@ -40,10 +52,22 @@ class Writer { const std::string_view& _name, const size_t _size, OutputObjectType* _parent) const noexcept; + template + OutputVarType add_value_to_array(const T& _var, + OutputArrayType* _parent) const noexcept { + return new_value(_var); + } + template OutputVarType add_value_to_object(const std::string_view& _name, const T& _var, - OutputObjectType* _parent) const noexcept; + OutputObjectType* _parent) const noexcept { + parquet_pack_str(pk_, _name.size()); + parquet_pack_str_body(pk_, _name.data(), _name.size()); + return new_value(_var); + } + + OutputVarType add_null_to_array(OutputArrayType* _parent) const noexcept; OutputVarType add_null_to_object(const std::string_view& _name, OutputObjectType* _parent) const noexcept; @@ -53,6 +77,39 @@ class Writer { void end_object(OutputObjectType* _obj) const noexcept; private: + OutputArrayType new_array(const size_t _size) const noexcept; + + OutputObjectType new_object(const size_t _size) const noexcept; + + template + OutputVarType new_value(const T& _var) const noexcept { + using Type = std::remove_cvref_t; + if constexpr (std::is_same()) { + parquet_pack_str(pk_, _var.size()); + parquet_pack_str_body(pk_, _var.c_str(), _var.size()); + } else if constexpr (std::is_same() || + std::is_same()) { + parquet_pack_bin(pk_, _var.size()); + parquet_pack_bin_body(pk_, _var.data(), _var.size()); + } else if constexpr (std::is_same()) { + if (_var) { + parquet_pack_true(pk_); + } else { + parquet_pack_false(pk_); + } + } else if constexpr (std::is_floating_point()) { + parquet_pack_double(pk_, static_cast(_var)); + } else if constexpr (std::is_integral()) { + parquet_pack_int64(pk_, static_cast(_var)); + } else { + static_assert(rfl::always_false_v, "Unsupported type."); + } + return OutputVarType{}; + } + + private: + /// The underlying packer. 
+ parquet_packer* pk_; }; } // namespace rfl::parquet diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index c5220fab..7f522b4e 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -1,34 +1,59 @@ #ifndef RFL_PARQUET_WRITE_HPP_ #define RFL_PARQUET_WRITE_HPP_ -#include +#include +#include #include #include #include +#include #include #include #include "../Processors.hpp" -#include "../parsing/Parent.hpp" -#include "Parser.hpp" +#include "../parsing/tabular/ArrowWriter.hpp" namespace rfl::parquet { /// Returns parquet bytes. template -std::vector write(const auto& _obj) noexcept { - using T = std::remove_cvref_t; - using ParentType = parsing::Parent; - parquet_sbuffer sbuf; - parquet_sbuffer_init(&sbuf); - parquet_packer pk; - parquet_packer_init(&pk, &sbuf, parquet_sbuffer_write); - auto w = Writer(&pk); - Parser>::write(w, _obj, typename ParentType::Root{}); - auto bytes = std::vector(sbuf.data, sbuf.data + sbuf.size); - parquet_sbuffer_destroy(&sbuf); - return bytes; +std::vector write(const auto& _arr) { + /// TODO: Support processors + using T = std::remove_cvref_t; + const auto table = + parsing::tabular::ArrowWriter(/*chunksize=*/2000).to_table(_arr); + + const auto props = ::parquet::WriterProperties::Builder() + .compression(arrow::Compression::SNAPPY) + ->build(); + + const auto arrow_props = + ::parquet::ArrowWriterProperties::Builder().store_schema()->build(); + + const auto output_buffer = arrow::io::BufferOutputStream::Create(); + + if (!output_buffer.ok()) { + throw std::runtime_error(output_buffer.status().message()); + } + + const auto status = ::parquet::arrow::WriteTable( + *table.get(), arrow::default_memory_pool(), output_buffer.ValueOrDie(), + /*chunk_size=*/2000, props, arrow_props); + + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + + const auto buffer = output_buffer.ValueOrDie()->Finish(); + + if (!buffer.ok()) { + throw 
std::runtime_error(output_buffer.status().message()); + } + + const auto view = std::string_view(*buffer.ValueOrDie()); + + return std::vector(view.begin(), view.end()); } /// Writes a PARQUET into an ostream. diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp new file mode 100644 index 00000000..bfa3656e --- /dev/null +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -0,0 +1,101 @@ +#ifndef RFL_PARSING_TABULAR_ARROWWRITER_HPP_ +#define RFL_PARSING_TABULAR_ARROWWRITER_HPP_ + +#include + +#include +#include +#include +#include + +#include "../../Tuple.hpp" +#include "../../get.hpp" +#include "../../named_tuple_t.hpp" +#include "../../to_view.hpp" +#include "add_to_builder.hpp" +#include "arrow_builders_t.hpp" +#include "make_arrow_data_types.hpp" +#include "make_arrow_schema.hpp" + +namespace rfl::parsing::tabular { + +template +class ArrowWriter { + public: + using ValueType = typename std::remove_cvref_t; + + ArrowWriter(const size_t _chunksize) : chunksize_(_chunksize) {} + + ~ArrowWriter() = default; + + std::shared_ptr to_table(const VecType& _data) const { + return arrow::Table::Make(make_arrow_schema(), + to_chunked_arrays(_data)); + } + + private: + std::vector> to_chunked_arrays( + const VecType& _data) const; + + private: + size_t chunksize_; +}; + +template +std::vector> +ArrowWriter::to_chunked_arrays(const VecType& _data) const { + using BuildersType = arrow_builders_t; + BuildersType builders; + + constexpr size_t size = tuple_size_v; + + std::vector>> array_chunks(size); + + auto it = _data.begin(); + + while (it != _data.end()) { + size_t i = 0; + + for (; it != _data.end() && (i < chunksize_ || chunksize_ == 0); + ++i, ++it) { + const auto view = to_view(*it); + + [&](const auto& _v, auto* _b, + std::integer_sequence) { + (add_to_builder(*get<_is>(_v), &(_b->template get<_is>())), ...); + }(view, &builders, std::make_integer_sequence()); + } + + if (i != 0) { + std::vector> chunks(size); + + 
const auto finish_builder = [](auto* _b, auto* _c) { + const auto status = _b->Finish(_c); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + }; + + [&](auto* _b, auto* _c, + std::integer_sequence) { + (finish_builder(&_b->template get<_is>(), &_c->at(_is)), ...); + }(&builders, &chunks, std::make_integer_sequence()); + + for (size_t j = 0; j < size; ++j) { + array_chunks.at(j).emplace_back(std::move(chunks.at(j))); + } + } + } + + const auto data_types = make_arrow_data_types(); + + return [&](std::integer_sequence) { + return std::vector>( + {std::make_shared(array_chunks.at(_is), + std::get<_is>(data_types))...}); + }(std::make_integer_sequence()); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/add_to_builder.hpp b/include/rfl/parsing/tabular/add_to_builder.hpp new file mode 100644 index 00000000..1279a2b5 --- /dev/null +++ b/include/rfl/parsing/tabular/add_to_builder.hpp @@ -0,0 +1,19 @@ +#ifndef RFL_PARSING_TABULAR_ADD_TO_BUILDER_HPP_ +#define RFL_PARSING_TABULAR_ADD_TO_BUILDER_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "arrow_builders_t.hpp" + +namespace rfl::parsing::tabular { + +template +inline void add_to_builder(const ValueType& _val, BuilderType* _builder) { + ArrowBuilderType>::add_to_builder(_val, + _builder); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/arrow_builders_t.hpp b/include/rfl/parsing/tabular/arrow_builders_t.hpp new file mode 100644 index 00000000..b5874fab --- /dev/null +++ b/include/rfl/parsing/tabular/arrow_builders_t.hpp @@ -0,0 +1,333 @@ +#ifndef RFL_PARSING_TABULAR_ARROWBUILDERST_HPP_ +#define RFL_PARSING_TABULAR_ARROWBUILDERST_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../Box.hpp" +#include "../../NamedTuple.hpp" +#include "../../Ref.hpp" +#include "../../Rename.hpp" +#include "../../Timestamp.hpp" +#include "../../Tuple.hpp" 
+#include "../../internal/StringLiteral.hpp" +#include "../../internal/has_reflection_type_v.hpp" +#include "../../named_tuple_t.hpp" + +namespace rfl::parsing::tabular { + +template +struct ArrowBuilderType; + +template <> +struct ArrowBuilderType { + using Type = arrow::UInt8Builder; + + static auto data_type() { return arrow::uint8(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::UInt16Builder; + + static auto data_type() { return arrow::uint16(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::UInt32Builder; + + static auto data_type() { return arrow::uint32(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::UInt64Builder; + + static auto data_type() { return arrow::uint64(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::Int8Builder; + + static auto data_type() { return arrow::int8(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::Int16Builder; + + static auto data_type() { return arrow::int16(); } + + static void 
add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::Int32Builder; + + static auto data_type() { return arrow::int32(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::Int64Builder; + + static auto data_type() { return arrow::int64(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::FloatBuilder; + + static auto data_type() { return arrow::float32(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::DoubleBuilder; + + static auto data_type() { return arrow::float64(); } + + static void add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowBuilderType { + using Type = arrow::StringBuilder; + + static auto data_type() { return arrow::utf8(); } + + static void add_to_builder(const auto& _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template +struct ArrowBuilderType> { + using Type = arrow::TimestampBuilder; + + static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } + + static void 
add_to_builder(const auto _val, Type* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template + requires internal::has_reflection_type_v +struct ArrowBuilderType { + using Type = typename ArrowBuilderType::Type; + + static auto data_type() { + return ArrowBuilderType::data_type(); + } + + static void add_to_builder(const auto _val, Type* _builder) { + ArrowBuilderType::add_to_builder( + _val.reflection(), _builder); + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void add_to_builder(const auto _val, Type* _builder) { + if (_val) { + ArrowBuilderType::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void add_to_builder(const auto _val, Type* _builder) { + if (_val) { + ArrowBuilderType::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void add_to_builder(const auto _val, Type* _builder) { + if (_val) { + ArrowBuilderType::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void 
add_to_builder(const auto _val, Type* _builder) { + ArrowBuilderType::add_to_builder(*_val, _builder); + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void add_to_builder(const auto _val, Type* _builder) { + ArrowBuilderType::add_to_builder(*_val, _builder); + } +}; + +template +struct ArrowBuilderType> { + using Type = typename ArrowBuilderType>::Type; + + static auto data_type() { return ArrowBuilderType::data_type(); } + + static void add_to_builder(const auto _val, Type* _builder) { + ArrowBuilderType::add_to_builder(_val.value(), _builder); + } +}; + +template +using arrow_builder_t = typename ArrowBuilderType< + std::remove_cvref_t>>::Type; + +template +struct ArrowBuildersType; + +template +struct ArrowBuildersType> { + using Type = Tuple...>; + + static auto data_types() { + return [&](std::integer_sequence) { + return std::array, + sizeof...(FieldTypes)>( + {ArrowBuilderType::data_type()...}); + }(std::make_integer_sequence()); + } + + static auto schema() { + const auto fields = + std::vector>({arrow::field( + typename FieldTypes::Name().str(), + ArrowBuilderType::data_type())...}); + return arrow::schema(fields); + } +}; + +template +using arrow_builders_t = + typename ArrowBuildersType>>::Type; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_arrow_data_types.hpp b/include/rfl/parsing/tabular/make_arrow_data_types.hpp new file mode 100644 index 00000000..a85ed26f --- /dev/null +++ b/include/rfl/parsing/tabular/make_arrow_data_types.hpp @@ -0,0 +1,18 @@ +#ifndef RFL_PARSING_TABULAR_MAKE_ARROW_DATA_TYPES_HPP_ +#define RFL_PARSING_TABULAR_MAKE_ARROW_DATA_TYPES_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "arrow_builders_t.hpp" + +namespace rfl::parsing::tabular { + +template +inline auto make_arrow_data_types() { + return ArrowBuildersType>>::data_types(); +} + 
+} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_arrow_schema.hpp b/include/rfl/parsing/tabular/make_arrow_schema.hpp new file mode 100644 index 00000000..ba2a3ac4 --- /dev/null +++ b/include/rfl/parsing/tabular/make_arrow_schema.hpp @@ -0,0 +1,18 @@ +#ifndef RFL_PARSING_TABULAR_MAKE_ARROW_SCHEMA_HPP_ +#define RFL_PARSING_TABULAR_MAKE_ARROW_SCHEMA_HPP_ + +#include + +#include "../../named_tuple_t.hpp" +#include "arrow_builders_t.hpp" + +namespace rfl::parsing::tabular { + +template +inline auto make_arrow_schema() { + return ArrowBuildersType>>::schema(); +} + +} // namespace rfl::parsing::tabular + +#endif diff --git a/src/reflectcpp_parquet.cpp b/src/reflectcpp_parquet.cpp new file mode 100644 index 00000000..975c55b8 --- /dev/null +++ b/src/reflectcpp_parquet.cpp @@ -0,0 +1,31 @@ +/* + +MIT License + +Copyright (c) 2023-2024 Code17 GmbH + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +// This file include all other source files, so that the user of the library +// don't need to add multiple source files into their build. +// Also, this speeds up compile time, compared to multiple separate .cpp files +// compilation. + diff --git a/src/rfl/parquet/Writer.cpp b/src/rfl/parquet/Writer.cpp new file mode 100644 index 00000000..f711a0f4 --- /dev/null +++ b/src/rfl/parquet/Writer.cpp @@ -0,0 +1,78 @@ +#include "rfl/msgpack/Writer.hpp" + +namespace rfl::msgpack { + +Writer::Writer(msgpack_packer* _pk) : pk_(_pk) {} + +Writer::~Writer() = default; + +Writer::OutputArrayType Writer::array_as_root( + const size_t _size) const noexcept { + return new_array(_size); +} + +Writer::OutputObjectType Writer::object_as_root( + const size_t _size) const noexcept { + return new_object(_size); +} + +Writer::OutputVarType Writer::null_as_root() const noexcept { + msgpack_pack_nil(pk_); + return OutputVarType{}; +} + +Writer::OutputArrayType Writer::add_array_to_array( + const size_t _size, OutputArrayType* _parent) const noexcept { + return new_array(_size); +} + +Writer::OutputArrayType Writer::add_array_to_object( + const std::string_view& _name, const size_t _size, + OutputObjectType* _parent) const noexcept { + msgpack_pack_str(pk_, _name.size()); + msgpack_pack_str_body(pk_, _name.data(), _name.size()); + return new_array(_size); +} + +Writer::OutputObjectType Writer::add_object_to_array( + const size_t _size, OutputArrayType* _parent) const noexcept { + return new_object(_size); +} + +Writer::OutputObjectType Writer::add_object_to_object( + const std::string_view& _name, const size_t _size, + OutputObjectType* _parent) const noexcept { + msgpack_pack_str(pk_, _name.size()); + msgpack_pack_str_body(pk_, _name.data(), _name.size()); + return new_object(_size); +} + +Writer::OutputVarType Writer::add_null_to_array( + OutputArrayType* _parent) const noexcept { + msgpack_pack_nil(pk_); + return OutputVarType{}; +} + +Writer::OutputVarType 
Writer::add_null_to_object( + const std::string_view& _name, OutputObjectType* _parent) const noexcept { + msgpack_pack_str(pk_, _name.size()); + msgpack_pack_str_body(pk_, _name.data(), _name.size()); + msgpack_pack_nil(pk_); + return OutputVarType{}; +} + +void Writer::end_array(OutputArrayType* _arr) const noexcept {} + +void Writer::end_object(OutputObjectType* _obj) const noexcept {} + +Writer::OutputArrayType Writer::new_array(const size_t _size) const noexcept { + msgpack_pack_array(pk_, _size); + return OutputArrayType{}; +} + +Writer::OutputObjectType Writer::new_object(const size_t _size) const noexcept { + msgpack_pack_map(pk_, _size); + return OutputObjectType{}; +} + +} // namespace rfl::msgpack diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ae98d010..9e559b73 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -36,6 +36,10 @@ if (REFLECTCPP_MSGPACK) add_subdirectory(msgpack) endif() +if (REFLECTCPP_PARQUET) + add_subdirectory(parquet) +endif() + if (REFLECTCPP_TOML) add_subdirectory(toml) endif() diff --git a/tests/parquet/CMakeLists.txt b/tests/parquet/CMakeLists.txt new file mode 100644 index 00000000..c42b7a5b --- /dev/null +++ b/tests/parquet/CMakeLists.txt @@ -0,0 +1,21 @@ +project(reflect-cpp-parquet-tests) + +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS "*.cpp") + +add_executable( + reflect-cpp-parquet-tests + ${SOURCES} +) +target_precompile_headers(reflect-cpp-parquet-tests PRIVATE [["rfl.hpp"]] ) + +target_include_directories(reflect-cpp-parquet-tests SYSTEM PRIVATE "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include") + +target_link_libraries( + reflect-cpp-parquet-tests + PRIVATE + "${REFLECT_CPP_GTEST_LIB}" +) + +find_package(GTest) +gtest_discover_tests(reflect-cpp-parquet-tests) + diff --git a/tests/parquet/test_readme_example.cpp b/tests/parquet/test_readme_example.cpp new file mode 100644 index 00000000..10d35cc6 --- /dev/null +++ b/tests/parquet/test_readme_example.cpp @@ -0,0 +1,41 @@ +#include 
+#include +#include +#include + +namespace test_readme_example { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + // rfl::Timestamp<"%Y-%m-%d"> birthday; + std::string birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_readme_example) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + rfl::parquet::write(people); +} +} // namespace test_readme_example diff --git a/tests/parquet/write_and_read.hpp b/tests/parquet/write_and_read.hpp new file mode 100644 index 00000000..254f88a2 --- /dev/null +++ b/tests/parquet/write_and_read.hpp @@ -0,0 +1,21 @@ +#ifndef WRITE_AND_READ_ +#define WRITE_AND_READ_ + +#include + +#include +#include +#include + +template +void write_and_read(const auto& _struct) { + using T = std::remove_cvref_t; + const auto serialized1 = rfl::parquet::write(_struct); + const auto res = rfl::parquet::read(serialized1); + EXPECT_TRUE(res && true) << "Test failed on read. Error: " + << res.error().what(); + const auto serialized2 = rfl::parquet::write(res.value()); + EXPECT_EQ(serialized1, serialized2); +} + +#endif From d4a09d94057b50e82eb2a141a3c8834ac22d75e7 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 17 Aug 2025 16:15:20 +0200 Subject: [PATCH 05/36] Made sure we can save files --- .gitignore | 1 + include/rfl/parquet.hpp | 2 +- include/rfl/parquet/write.hpp | 17 +++++++--- tests/parquet/test_save_load.cpp | 53 ++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 tests/parquet/test_save_load.cpp diff --git a/.gitignore b/.gitignore index 13abfb6a..1b782f9d 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ *.fb *.flexbuf *.msgpack +*.parquet *.toml *.ubjson *.xml diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp index 8570378a..b1bec877 100644 --- a/include/rfl/parquet.hpp +++ b/include/rfl/parquet.hpp @@ -7,7 +7,7 @@ // #include "parquet/Writer.hpp" // #include "parquet/load.hpp" // #include "parquet/read.hpp" -// #include "parquet/save.hpp" +#include "parquet/save.hpp" #include "parquet/write.hpp" #endif diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index 7f522b4e..ff21a9eb 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -9,18 +9,21 @@ #include #include #include +#include #include #include "../Processors.hpp" +#include "../Ref.hpp" #include "../parsing/tabular/ArrowWriter.hpp" namespace rfl::parquet { /// Returns parquet bytes. template -std::vector write(const auto& _arr) { +Ref to_buffer(const auto& _arr) { /// TODO: Support processors using T = std::remove_cvref_t; + const auto table = parsing::tabular::ArrowWriter(/*chunksize=*/2000).to_table(_arr); @@ -51,16 +54,22 @@ std::vector write(const auto& _arr) { throw std::runtime_error(output_buffer.status().message()); } - const auto view = std::string_view(*buffer.ValueOrDie()); + return Ref::make(buffer.ValueOrDie()).value(); +} +/// Returns parquet bytes. 
+template +std::vector write(const auto& _arr) { + const auto buffer = to_buffer(_arr); + const auto view = std::string_view(*buffer); return std::vector(view.begin(), view.end()); } /// Writes a PARQUET into an ostream. template std::ostream& write(const auto& _obj, std::ostream& _stream) noexcept { - auto buffer = write(_obj); - _stream.write(buffer.data(), buffer.size()); + auto buffer = to_buffer(_obj); + _stream << std::string_view(*buffer); return _stream; } diff --git a/tests/parquet/test_save_load.cpp b/tests/parquet/test_save_load.cpp new file mode 100644 index 00000000..573fd297 --- /dev/null +++ b/tests/parquet/test_save_load.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include +#include +#include +#include + +namespace test_save_load { + +using Age = rfl::Validator, rfl::Maximum<130>>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + // rfl::Timestamp<"%Y-%m-%d"> birthday; + std::string birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_save_load) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + rfl::parquet::save("people.parquet", people); + + /*const auto homer2 = rfl::toml::load("homer.toml").value(); + + const auto string1 = rfl::toml::write(homer1); + const auto string2 = rfl::toml::write(homer2); + + EXPECT_EQ(string1, string2);*/ +} +} // namespace test_save_load From 5a9c4d1ec6b04f33e84577a0205e146e5455f59a Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Wed, 20 Aug 2025 22:38:14 +0200 Subject: [PATCH 06/36] Started writing the ArrowReader and ChunkedArrayIterator --- include/rfl/parsing/tabular/ArrowReader.hpp | 38 +++ include/rfl/parsing/tabular/ArrowTypes.hpp | 321 ++++++++++++++++++ .../parsing/tabular/ChunkedArrayIterator.hpp | 98 ++++++ .../rfl/parsing/tabular/add_to_builder.hpp | 5 +- include/rfl/parsing/tabular/array_t.hpp | 13 + .../rfl/parsing/tabular/arrow_builders_t.hpp | 295 +--------------- 6 files changed, 479 insertions(+), 291 deletions(-) create mode 100644 include/rfl/parsing/tabular/ArrowReader.hpp create mode 100644 include/rfl/parsing/tabular/ArrowTypes.hpp create mode 100644 include/rfl/parsing/tabular/ChunkedArrayIterator.hpp create mode 100644 include/rfl/parsing/tabular/array_t.hpp diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp new file mode 100644 index 00000000..78618d51 --- /dev/null +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -0,0 +1,38 @@ +#ifndef RFL_PARSING_TABULAR_ARROWREADER_HPP_ +#define RFL_PARSING_TABULAR_ARROWREADER_HPP_ + +#include + +#include +#include +#include +#include + +#include "../../Result.hpp" +#include "../../Tuple.hpp" +#include "../../get.hpp" +#include "../../named_tuple_t.hpp" +#include "../../to_view.hpp" +#include "add_to_builder.hpp" +#include "arrow_builders_t.hpp" +#include "make_arrow_data_types.hpp" +#include "make_arrow_schema.hpp" + +namespace rfl::parsing::tabular { + +template +class ArrowReader { + public: + using ValueType = typename std::remove_cvref_t; + + ArrowReader(const size_t _chunksize) : chunksize_(_chunksize) {} + + ~ArrowReader() = default; + + Result from_table( + const std::shared_ptr& _table) const {} +}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp new file mode 100644 index 00000000..e399ef0a --- /dev/null +++ 
b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -0,0 +1,321 @@ +#ifndef RFL_PARSING_TABULAR_ARROWTYPES_HPP_ +#define RFL_PARSING_TABULAR_ARROWTYPES_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../Box.hpp" +#include "../../NamedTuple.hpp" +#include "../../Ref.hpp" +#include "../../Rename.hpp" +#include "../../Timestamp.hpp" +#include "../../Tuple.hpp" +#include "../../internal/StringLiteral.hpp" +#include "../../internal/has_reflection_type_v.hpp" +#include "../../named_tuple_t.hpp" + +namespace rfl::parsing::tabular { + +template +struct ArrowTypes; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt8Array; + using BuilderType = arrow::UInt8Builder; + + static auto data_type() { return arrow::uint8(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt16Array; + using BuilderType = arrow::UInt16Builder; + + static auto data_type() { return arrow::uint16(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt32Array; + using BuilderType = arrow::UInt32Builder; + + static auto data_type() { return arrow::uint32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::UInt64Array; + using BuilderType = arrow::UInt64Builder; + + static auto data_type() { return arrow::uint64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto 
status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int8Array; + using BuilderType = arrow::Int8Builder; + + static auto data_type() { return arrow::int8(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int16Array; + using BuilderType = arrow::Int16Builder; + + static auto data_type() { return arrow::int16(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int32Array; + using BuilderType = arrow::Int32Builder; + + static auto data_type() { return arrow::int32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::Int64Array; + using BuilderType = arrow::Int64Builder; + + static auto data_type() { return arrow::int64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::FloatArray; + using BuilderType = arrow::FloatBuilder; + + static auto data_type() { return arrow::float32(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct 
ArrowTypes { + using ArrayType = arrow::DoubleArray; + using BuilderType = arrow::DoubleBuilder; + + static auto data_type() { return arrow::float64(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template <> +struct ArrowTypes { + using ArrayType = arrow::StringArray; + using BuilderType = arrow::StringBuilder; + + static auto data_type() { return arrow::utf8(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template +struct ArrowTypes> { + using ArrayType = arrow::TimestampArray; + using BuilderType = arrow::TimestampBuilder; + + static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } +}; + +template + requires internal::has_reflection_type_v +struct ArrowTypes { + using ArrayType = typename ArrowTypes::ArrayType; + using BuilderType = + typename ArrowTypes::BuilderType; + + static auto data_type() { + return ArrowTypes::data_type(); + } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(_val.reflection(), + _builder); + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } 
+ } + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + if (_val) { + ArrowTypes::add_to_builder(*_val, _builder); + } else { + const auto status = _builder->AppendNull(); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(*_val, _builder); + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(*_val, _builder); + } +}; + +template +struct ArrowTypes> { + using ArrayType = typename ArrowTypes>::ArrayType; + using BuilderType = typename ArrowTypes>::BuilderType; + + static auto data_type() { return ArrowTypes::data_type(); } + + static void add_to_builder(const auto _val, BuilderType* _builder) { + ArrowTypes::add_to_builder(_val.value(), _builder); + } +}; + +} 
// namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp new file mode 100644 index 00000000..f8a1ab46 --- /dev/null +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -0,0 +1,98 @@ +#ifndef RFL_PARSING_TABULAR_CHUNKEDARRAYITERATOR_HPP_ +#define RFL_PARSING_TABULAR_CHUNKEDARRAYITERATOR_HPP_ + +#include + +#include +#include +#include +#include + +#include "../../Ref.hpp" +#include "../../Result.hpp" +#include "array_t.hpp" + +namespace rfl::parsing::tabular { + +template +class ChunkedArrayIterator { + public: + using difference_type = std::ptrdiff_t; + using value_type = Result; + + using ArrayType = array_t; + + struct End { + bool operator==(const ChunkedArrayIterator& _it) const noexcept { + return _it == *this; + } + + bool operator!=(const ChunkedArrayIterator& _it) const noexcept { + return _it != *this; + } + }; + + static Result make( + const std::shared_ptr& _arr) { + try { + return ChunkedArrayIterator(_arr); + } catch (const std::exception& e) { + return error(e.what()); + } + } + + ChunkedArrayIterator(const std::shared_ptr& _arr) + : arr_(Ref::make(_arr).value()), + chunk_ix_(0), + current_chunk_(get_chunk(arr_, 0)), + ix_(0) {} + + ~ChunkedArrayIterator() = default; + + Result operator*() const noexcept { + return current_chunk_.transform( + [&](const auto& _c) { return _c->Value(ix_); }); + } + + bool operator==(const End&) const noexcept { + return chunk_ix_ >= arr_->num_chunks(); + } + + bool operator!=(const End& _end) const noexcept { return !(*this == _end); } + + Iterator& operator++() noexcept { + if (!current_chunk_) { + return *this; + } + ++ix_; + if (ix_ >= (*current_chunk_)->length()) { + ++chunk_ix_; + current_chunk_ = get_chunk(arr_, chunk_ix_); + ix_ = 0; + } + return *this; + } + + void operator++(int) noexcept { ++*this; } + + private: + static Result> get_chunk(const Ref& _arr, + const int _chunk_ix) noexcept { + 
return Ref::make( + std::dynamic_pointer_cast>( + arr_->chunk(chunk_ix_))); + } + + private: + Ref arr_; + + int chunk_ix_; + + Result> current_chunk_; + + int64_t ix_; +}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/add_to_builder.hpp b/include/rfl/parsing/tabular/add_to_builder.hpp index 1279a2b5..1d42a22f 100644 --- a/include/rfl/parsing/tabular/add_to_builder.hpp +++ b/include/rfl/parsing/tabular/add_to_builder.hpp @@ -4,14 +4,13 @@ #include #include "../../named_tuple_t.hpp" -#include "arrow_builders_t.hpp" +#include "ArrowTypes.hpp" namespace rfl::parsing::tabular { template inline void add_to_builder(const ValueType& _val, BuilderType* _builder) { - ArrowBuilderType>::add_to_builder(_val, - _builder); + ArrowTypes>::add_to_builder(_val, _builder); } } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/array_t.hpp b/include/rfl/parsing/tabular/array_t.hpp new file mode 100644 index 00000000..6d083af8 --- /dev/null +++ b/include/rfl/parsing/tabular/array_t.hpp @@ -0,0 +1,13 @@ +#ifndef RFL_PARSING_TABULAR_ARRAYT_HPP_ +#define RFL_PARSING_TABULAR_ARRAYT_HPP_ + +#include "ArrowTypes.hpp" + +namespace rfl::parsing::tabular { + +template +using array_t = typename ArrowTypes>::ArrayType; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/arrow_builders_t.hpp b/include/rfl/parsing/tabular/arrow_builders_t.hpp index b5874fab..c87726ac 100644 --- a/include/rfl/parsing/tabular/arrow_builders_t.hpp +++ b/include/rfl/parsing/tabular/arrow_builders_t.hpp @@ -11,294 +11,14 @@ #include #include -#include "../../Box.hpp" -#include "../../NamedTuple.hpp" -#include "../../Ref.hpp" -#include "../../Rename.hpp" -#include "../../Timestamp.hpp" -#include "../../Tuple.hpp" -#include "../../internal/StringLiteral.hpp" -#include "../../internal/has_reflection_type_v.hpp" #include "../../named_tuple_t.hpp" +#include "ArrowTypes.hpp" namespace rfl::parsing::tabular { template 
-struct ArrowBuilderType; - -template <> -struct ArrowBuilderType { - using Type = arrow::UInt8Builder; - - static auto data_type() { return arrow::uint8(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::UInt16Builder; - - static auto data_type() { return arrow::uint16(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::UInt32Builder; - - static auto data_type() { return arrow::uint32(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::UInt64Builder; - - static auto data_type() { return arrow::uint64(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::Int8Builder; - - static auto data_type() { return arrow::int8(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::Int16Builder; - - static auto data_type() { return arrow::int16(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> 
-struct ArrowBuilderType { - using Type = arrow::Int32Builder; - - static auto data_type() { return arrow::int32(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::Int64Builder; - - static auto data_type() { return arrow::int64(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::FloatBuilder; - - static auto data_type() { return arrow::float32(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::DoubleBuilder; - - static auto data_type() { return arrow::float64(); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template <> -struct ArrowBuilderType { - using Type = arrow::StringBuilder; - - static auto data_type() { return arrow::utf8(); } - - static void add_to_builder(const auto& _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template -struct ArrowBuilderType> { - using Type = arrow::TimestampBuilder; - - static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } - - static void add_to_builder(const auto _val, Type* _builder) { - const auto status = _builder->Append(_val); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } -}; - -template - requires 
internal::has_reflection_type_v -struct ArrowBuilderType { - using Type = typename ArrowBuilderType::Type; - - static auto data_type() { - return ArrowBuilderType::data_type(); - } - - static void add_to_builder(const auto _val, Type* _builder) { - ArrowBuilderType::add_to_builder( - _val.reflection(), _builder); - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - - static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - if (_val) { - ArrowBuilderType::add_to_builder(*_val, _builder); - } else { - const auto status = _builder->AppendNull(); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - - static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - if (_val) { - ArrowBuilderType::add_to_builder(*_val, _builder); - } else { - const auto status = _builder->AppendNull(); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - - static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - if (_val) { - ArrowBuilderType::add_to_builder(*_val, _builder); - } else { - const auto status = _builder->AppendNull(); - if (!status.ok()) { - throw std::runtime_error(status.message()); - } - } - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - - static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - ArrowBuilderType::add_to_builder(*_val, _builder); - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - 
- static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - ArrowBuilderType::add_to_builder(*_val, _builder); - } -}; - -template -struct ArrowBuilderType> { - using Type = typename ArrowBuilderType>::Type; - - static auto data_type() { return ArrowBuilderType::data_type(); } - - static void add_to_builder(const auto _val, Type* _builder) { - ArrowBuilderType::add_to_builder(_val.value(), _builder); - } -}; - -template -using arrow_builder_t = typename ArrowBuilderType< - std::remove_cvref_t>>::Type; +using arrow_builder_t = typename ArrowTypes< + std::remove_cvref_t>>::BuilderType; template struct ArrowBuildersType; @@ -311,15 +31,14 @@ struct ArrowBuildersType> { return [&](std::integer_sequence) { return std::array, sizeof...(FieldTypes)>( - {ArrowBuilderType::data_type()...}); + {ArrowTypes::data_type()...}); }(std::make_integer_sequence()); } static auto schema() { - const auto fields = - std::vector>({arrow::field( - typename FieldTypes::Name().str(), - ArrowBuilderType::data_type())...}); + const auto fields = std::vector>( + {arrow::field(typename FieldTypes::Name().str(), + ArrowTypes::data_type())...}); return arrow::schema(fields); } }; From 9200247ef62e89dccdf6944d7d5109b9c6091beb Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sat, 23 Aug 2025 00:41:04 +0200 Subject: [PATCH 07/36] Continued developing the arrow reader --- include/rfl/parsing/tabular/ArrowReader.hpp | 38 ++++++++++--- .../parsing/tabular/ChunkedArrayIterator.hpp | 16 +----- .../rfl/parsing/tabular/ChunkedArrayRange.hpp | 38 +++++++++++++ .../tabular/make_chunked_array_ranges.hpp | 56 +++++++++++++++++++ 4 files changed, 127 insertions(+), 21 deletions(-) create mode 100644 include/rfl/parsing/tabular/ChunkedArrayRange.hpp create mode 100644 include/rfl/parsing/tabular/make_chunked_array_ranges.hpp diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 78618d51..c8d6049b 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -5,18 +5,17 @@ #include #include +#include #include #include #include "../../Result.hpp" #include "../../Tuple.hpp" +#include "../../apply.hpp" #include "../../get.hpp" #include "../../named_tuple_t.hpp" #include "../../to_view.hpp" -#include "add_to_builder.hpp" -#include "arrow_builders_t.hpp" -#include "make_arrow_data_types.hpp" -#include "make_arrow_schema.hpp" +#include "make_chunked_array_ranges.hpp" namespace rfl::parsing::tabular { @@ -25,12 +24,37 @@ class ArrowReader { public: using ValueType = typename std::remove_cvref_t; - ArrowReader(const size_t _chunksize) : chunksize_(_chunksize) {} + static Result make(const std::shared_ptr& _table) { + try { + return ArrowReader(_table); + } catch (const std::exception& e) { + return error("Could not create ArrowReader: " + e.what()); + } + } ~ArrowReader() = default; - Result from_table( - const std::shared_ptr& _table) const {} + static Result read(const ArrowReader& _r) { return _r.read(); } + + private: + ArrowReader(const std::shared_ptr& _table) + : table_(Ref::make(_table).value()) {} + + Result read() const { + const auto chunked_array_ranges = make_chunked_array_ranges(table_); + if (!chunked_array_ranges) { + 
return error(chunked_array_ranges.error().what()); + } + auto chunked_array_iterators = + apply([](const auto&... _rs) { return make_tuple(_rs.begin()...); }, + chunked_array_ranges.value()); + VecType result; + // TODO + return result; + } + + private: + Ref table_; }; } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index f8a1ab46..e491c247 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -32,20 +32,8 @@ class ChunkedArrayIterator { } }; - static Result make( - const std::shared_ptr& _arr) { - try { - return ChunkedArrayIterator(_arr); - } catch (const std::exception& e) { - return error(e.what()); - } - } - - ChunkedArrayIterator(const std::shared_ptr& _arr) - : arr_(Ref::make(_arr).value()), - chunk_ix_(0), - current_chunk_(get_chunk(arr_, 0)), - ix_(0) {} + ChunkedArrayIterator(const Ref& _arr) + : arr_(_arr), chunk_ix_(0), current_chunk_(get_chunk(arr_, 0)), ix_(0) {} ~ChunkedArrayIterator() = default; diff --git a/include/rfl/parsing/tabular/ChunkedArrayRange.hpp b/include/rfl/parsing/tabular/ChunkedArrayRange.hpp new file mode 100644 index 00000000..ad63aa89 --- /dev/null +++ b/include/rfl/parsing/tabular/ChunkedArrayRange.hpp @@ -0,0 +1,38 @@ +#ifndef RFL_PARSING_TABULAR_CHUNKEDARRAYRANGE_HPP_ +#define RFL_PARSING_TABULAR_CHUNKEDARRAYRANGE_HPP_ + +#include + +#include +#include +#include + +#include "../../Ref.hpp" +#include "../../Result.hpp" +#include "ChunkedArrayIterator.hpp" +#include "array_t.hpp" + +namespace rfl::parsing::tabular { + +template +class ChunkedArrayRange { + public: + static ChunkedArrayRange make(const Ref& _arr) { + return ChunkedArrayRange(_arr); + } + + ChunkedArrayRange(const Ref& _arr) : arr_(_arr) {} + + ~ChunkedArrayRange() = default; + + auto begin() const { return ChunkedArrayIterator(arr_); } + + auto end() const { return 
ChunkedArrayIterator::End{}; } + + private: + Ref arr_; +}; + +} // namespace rfl::parsing::tabular + +#endif diff --git a/include/rfl/parsing/tabular/make_chunked_array_ranges.hpp b/include/rfl/parsing/tabular/make_chunked_array_ranges.hpp new file mode 100644 index 00000000..81294507 --- /dev/null +++ b/include/rfl/parsing/tabular/make_chunked_array_ranges.hpp @@ -0,0 +1,56 @@ +#ifndef RFL_PARSING_TABULAR_MAKECHUNKEDARRAYRANGES_HPP_ +#define RFL_PARSING_TABULAR_MAKECHUNKEDARRAYRANGES_HPP_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../NamedTuple.hpp" +#include "../../Ref.hpp" +#include "../../Result.hpp" +#include "../../Tuple.hpp" +#include "ArrowTypes.hpp" +#include "ChunkedArrayRange.hpp" + +namespace rfl::parsing::tabular { + +template +struct MakeChunkedArrayRanges; + +template +struct MakeChunkedArrayRanges> { + using TupleType = Tuple>; + + Result operator(const Ref& _table) const { + const auto get_column = + [&](const std::string& _colname) -> Result> { + const auto col = _table.GetColumnByName(_colname); + if (!col) { + return error("Column named '" + _colname + "' not found."); + } + return Ref::make(col); + }; + + try { + return TupleType( + get_column(typename FieldTypes::Name().str()) + .transform(ChunkedArrayRange::make) + .value()...); + } catch (const std::exception& e) { + return error(e.what()); + } + } +}; + +template +const auto make_chunked_array_ranges = MakeChunkedArrayRanges{}; + +} // namespace rfl::parsing::tabular + +#endif From 69db63d97111cf2e701f4e808e03589a9fb3d62c Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 24 Aug 2025 12:46:00 +0200 Subject: [PATCH 08/36] Added support for reading parquet files --- include/rfl/parquet.hpp | 2 +- include/rfl/parquet/read.hpp | 58 +++++++++------- include/rfl/parsing/tabular/ArrowReader.hpp | 69 +++++++++++++++---- .../parsing/tabular/ChunkedArrayIterator.hpp | 38 +++++----- .../rfl/parsing/tabular/ChunkedArrayRange.hpp | 38 ---------- ...s.hpp => make_chunked_array_iterators.hpp} | 21 +++--- tests/parquet/test_readme_example.cpp | 4 +- tests/parquet/write_and_read.hpp | 6 +- 8 files changed, 128 insertions(+), 108 deletions(-) delete mode 100644 include/rfl/parsing/tabular/ChunkedArrayRange.hpp rename include/rfl/parsing/tabular/{make_chunked_array_ranges.hpp => make_chunked_array_iterators.hpp} (59%) diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp index b1bec877..c747d945 100644 --- a/include/rfl/parquet.hpp +++ b/include/rfl/parquet.hpp @@ -6,7 +6,7 @@ // #include "parquet/Reader.hpp" // #include "parquet/Writer.hpp" // #include "parquet/load.hpp" -// #include "parquet/read.hpp" +#include "parquet/read.hpp" #include "parquet/save.hpp" #include "parquet/write.hpp" diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp index 7a241081..ea037f9a 100644 --- a/include/rfl/parquet/read.hpp +++ b/include/rfl/parquet/read.hpp @@ -1,44 +1,56 @@ #ifndef RFL_PARQUET_READ_HPP_ #define RFL_PARQUET_READ_HPP_ -#include +#include +#include #include +#include #include #include "../Processors.hpp" +#include "../Result.hpp" +#include "../concepts.hpp" #include "../internal/wrap_in_rfl_array_t.hpp" -#include "Parser.hpp" -#include "Reader.hpp" +#include "../parsing/tabular/ArrowReader.hpp" namespace rfl::parquet { -using InputObjectType = typename Reader::InputObjectType; -using InputVarType = typename Reader::InputVarType; - -/// Parses an object from a PARQUET var. 
-template -auto read(const InputVarType& _obj) { - const auto r = Reader(); - return Parser>::read(r, _obj); -} - /// Parses an object from PARQUET using reflection. template -Result> read(const char* _bytes, - const size_t _size) { - parquet_zone mempool; - parquet_zone_init(&mempool, 2048); - parquet_object deserialized; - parquet_unpack(_bytes, _size, NULL, &mempool, &deserialized); - auto r = read(deserialized); - parquet_zone_destroy(&mempool); - return r; +Result> read( + const concepts::ByteLike auto* _bytes, const size_t _size) { + arrow::MemoryPool* pool = arrow::default_memory_pool(); + + const auto buffer = std::make_shared( + internal::ptr_cast(_bytes), _size); + + const auto input = std::make_shared(buffer); + + auto arrow_reader = ::parquet::arrow::OpenFile(input, pool); + + if (!arrow_reader.ok()) { + return error(std::string("Could not generate the arrow reader: ") + + arrow_reader.status().message()); + } + + std::shared_ptr table; + + const auto status = arrow_reader.ValueOrDie()->ReadTable(&table); + + if (!status.ok()) { + return error("Could not read table: " + status.message()); + } + + using ArrowReader = parsing::tabular::ArrowReader; + + return ArrowReader::make(table).and_then( + [](const auto& _r) { return _r.read(); }); } /// Parses an object from PARQUET using reflection. 
template -auto read(const std::vector& _bytes) { +auto read(const concepts::ContiguousByteContainer auto& _bytes) { return read(_bytes.data(), _bytes.size()); } diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index c8d6049b..66694edb 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -3,6 +3,7 @@ #include +#include #include #include #include @@ -15,7 +16,8 @@ #include "../../get.hpp" #include "../../named_tuple_t.hpp" #include "../../to_view.hpp" -#include "make_chunked_array_ranges.hpp" +#include "../call_destructors_where_necessary.hpp" +#include "make_chunked_array_iterators.hpp" namespace rfl::parsing::tabular { @@ -28,29 +30,70 @@ class ArrowReader { try { return ArrowReader(_table); } catch (const std::exception& e) { - return error("Could not create ArrowReader: " + e.what()); + return error(std::string("Could not create ArrowReader: ") + e.what()); } } ~ArrowReader() = default; - static Result read(const ArrowReader& _r) { return _r.read(); } + Result read() const noexcept { + try { + auto chunked_array_iterators = + make_chunked_array_iterators>(table_) + .value(); + VecType result; + while (!end(chunked_array_iterators)) { + auto value = new_value(&chunked_array_iterators); + if (!value) { + return error(value.error().what()); + } + result.emplace_back(std::move(*value)); + } + return result; + } catch (const std::exception& e) { + return error(e.what()); + } + } private: ArrowReader(const std::shared_ptr& _table) : table_(Ref::make(_table).value()) {} - Result read() const { - const auto chunked_array_ranges = make_chunked_array_ranges(table_); - if (!chunked_array_ranges) { - return error(chunked_array_ranges.error().what()); - } - auto chunked_array_iterators = - apply([](const auto&... 
_rs) { return make_tuple(_rs.begin()...); }, - chunked_array_ranges.value()); - VecType result; + bool end(const auto& _chunked_array_iterators) const { + return apply( + [](const auto&... _its) { return (false || ... || _its.end()); }, + _chunked_array_iterators); + } + + Result new_value(auto* _chunked_array_iterators) const noexcept { + alignas(ValueType) unsigned char buf[sizeof(ValueType)]{}; + auto ptr = internal::ptr_cast(&buf); // TODO - return result; + // auto view = ProcessorsType::template process(to_view(*ptr)); + auto view = to_view(*ptr); + using ViewType = std::remove_cvref_t; + auto set = std::array(); + set.fill(false); + try { + const auto set_one = [&](std::integral_constant) { + using FieldType = tuple_element_t<_i, typename ViewType::Fields>; + using T = std::remove_cvref_t< + std::remove_pointer_t>; + ::new (view.template get<_i>()) T( + std::move((*_chunked_array_iterators->template get<_i>()).value())); + std::get<_i>(set) = true; + ++_chunked_array_iterators->template get<_i>(); + }; + + [&](std::integer_sequence) { + (set_one(std::integral_constant{}), ...); + }(std::make_integer_sequence()); + + return std::move(*ptr); + } catch (const std::exception& e) { + call_destructors_where_necessary(set, &view); + return error(e.what()); + } } private: diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index e491c247..acbda921 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -22,15 +22,9 @@ class ChunkedArrayIterator { using ArrayType = array_t; - struct End { - bool operator==(const ChunkedArrayIterator& _it) const noexcept { - return _it == *this; - } - - bool operator!=(const ChunkedArrayIterator& _it) const noexcept { - return _it != *this; - } - }; + static ChunkedArrayIterator make(const Ref& _arr) { + return ChunkedArrayIterator(_arr); + } ChunkedArrayIterator(const Ref& _arr) : arr_(_arr), 
chunk_ix_(0), current_chunk_(get_chunk(arr_, 0)), ix_(0) {} @@ -38,17 +32,20 @@ class ChunkedArrayIterator { ~ChunkedArrayIterator() = default; Result operator*() const noexcept { - return current_chunk_.transform( - [&](const auto& _c) { return _c->Value(ix_); }); + if constexpr (std::is_same_v) { + return current_chunk_.transform( + [&](const auto& _c) { return T(std::string(_c->Value(ix_))); }); + } else { + return current_chunk_.transform( + [&](const auto& _c) { return T(_c->Value(ix_)); }); + } } - bool operator==(const End&) const noexcept { - return chunk_ix_ >= arr_->num_chunks(); + bool end() const noexcept { + return !current_chunk_ || (chunk_ix_ >= arr_->num_chunks()); } - bool operator!=(const End& _end) const noexcept { return !(*this == _end); } - - Iterator& operator++() noexcept { + ChunkedArrayIterator& operator++() noexcept { if (!current_chunk_) { return *this; } @@ -66,9 +63,12 @@ class ChunkedArrayIterator { private: static Result> get_chunk(const Ref& _arr, const int _chunk_ix) noexcept { - return Ref::make( - std::dynamic_pointer_cast>( - arr_->chunk(chunk_ix_))); + if (_chunk_ix < _arr->num_chunks()) { + return Ref::make( + std::static_pointer_cast(_arr->chunk(_chunk_ix))); + } else { + return error("chunk_ix out of bounds."); + } } private: diff --git a/include/rfl/parsing/tabular/ChunkedArrayRange.hpp b/include/rfl/parsing/tabular/ChunkedArrayRange.hpp deleted file mode 100644 index ad63aa89..00000000 --- a/include/rfl/parsing/tabular/ChunkedArrayRange.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef RFL_PARSING_TABULAR_CHUNKEDARRAYRANGE_HPP_ -#define RFL_PARSING_TABULAR_CHUNKEDARRAYRANGE_HPP_ - -#include - -#include -#include -#include - -#include "../../Ref.hpp" -#include "../../Result.hpp" -#include "ChunkedArrayIterator.hpp" -#include "array_t.hpp" - -namespace rfl::parsing::tabular { - -template -class ChunkedArrayRange { - public: - static ChunkedArrayRange make(const Ref& _arr) { - return ChunkedArrayRange(_arr); - } - - 
ChunkedArrayRange(const Ref& _arr) : arr_(_arr) {} - - ~ChunkedArrayRange() = default; - - auto begin() const { return ChunkedArrayIterator(arr_); } - - auto end() const { return ChunkedArrayIterator::End{}; } - - private: - Ref arr_; -}; - -} // namespace rfl::parsing::tabular - -#endif diff --git a/include/rfl/parsing/tabular/make_chunked_array_ranges.hpp b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp similarity index 59% rename from include/rfl/parsing/tabular/make_chunked_array_ranges.hpp rename to include/rfl/parsing/tabular/make_chunked_array_iterators.hpp index 81294507..697b8b87 100644 --- a/include/rfl/parsing/tabular/make_chunked_array_ranges.hpp +++ b/include/rfl/parsing/tabular/make_chunked_array_iterators.hpp @@ -1,5 +1,5 @@ -#ifndef RFL_PARSING_TABULAR_MAKECHUNKEDARRAYRANGES_HPP_ -#define RFL_PARSING_TABULAR_MAKECHUNKEDARRAYRANGES_HPP_ +#ifndef RFL_PARSING_TABULAR_MAKECHUNKEDARRAYITERATORS_HPP_ +#define RFL_PARSING_TABULAR_MAKECHUNKEDARRAYITERATORS_HPP_ #include @@ -16,21 +16,21 @@ #include "../../Result.hpp" #include "../../Tuple.hpp" #include "ArrowTypes.hpp" -#include "ChunkedArrayRange.hpp" +#include "ChunkedArrayIterator.hpp" namespace rfl::parsing::tabular { template -struct MakeChunkedArrayRanges; +struct MakeChunkedArrayIterators; template -struct MakeChunkedArrayRanges> { - using TupleType = Tuple>; +struct MakeChunkedArrayIterators> { + using TupleType = Tuple...>; - Result operator(const Ref& _table) const { + Result operator()(const Ref& _table) const { const auto get_column = [&](const std::string& _colname) -> Result> { - const auto col = _table.GetColumnByName(_colname); + const auto col = _table->GetColumnByName(_colname); if (!col) { return error("Column named '" + _colname + "' not found."); } @@ -40,7 +40,7 @@ struct MakeChunkedArrayRanges> { try { return TupleType( get_column(typename FieldTypes::Name().str()) - .transform(ChunkedArrayRange::make) + .transform(ChunkedArrayIterator::make) .value()...); } catch (const 
std::exception& e) { return error(e.what()); @@ -49,7 +49,8 @@ struct MakeChunkedArrayRanges> { }; template -const auto make_chunked_array_ranges = MakeChunkedArrayRanges{}; +const auto make_chunked_array_iterators = + MakeChunkedArrayIterators{}; } // namespace rfl::parsing::tabular diff --git a/tests/parquet/test_readme_example.cpp b/tests/parquet/test_readme_example.cpp index 10d35cc6..7e58c0d1 100644 --- a/tests/parquet/test_readme_example.cpp +++ b/tests/parquet/test_readme_example.cpp @@ -3,6 +3,8 @@ #include #include +#include "write_and_read.hpp" + namespace test_readme_example { using Age = rfl::Validator, rfl::Maximum<130>>; @@ -36,6 +38,6 @@ TEST(parquet, test_readme_example) { .age = 45, .email = "homer@simpson.com"}}); - rfl::parquet::write(people); + write_and_read(people); } } // namespace test_readme_example diff --git a/tests/parquet/write_and_read.hpp b/tests/parquet/write_and_read.hpp index 254f88a2..4bec476f 100644 --- a/tests/parquet/write_and_read.hpp +++ b/tests/parquet/write_and_read.hpp @@ -8,9 +8,9 @@ #include template -void write_and_read(const auto& _struct) { - using T = std::remove_cvref_t; - const auto serialized1 = rfl::parquet::write(_struct); +void write_and_read(const auto& _vec) { + using T = std::remove_cvref_t; + const auto serialized1 = rfl::parquet::write(_vec); const auto res = rfl::parquet::read(serialized1); EXPECT_TRUE(res && true) << "Test failed on read. Error: " << res.error().what(); From ffd4ed530ed33b5fdc466a22785b4602d4122a53 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 24 Aug 2025 13:06:24 +0200 Subject: [PATCH 09/36] Improved the ArrowReader --- include/rfl/parsing/tabular/ArrowReader.hpp | 55 ++++++++++++--------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 66694edb..628212e1 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -37,22 +37,18 @@ class ArrowReader { ~ArrowReader() = default; Result read() const noexcept { - try { - auto chunked_array_iterators = - make_chunked_array_iterators>(table_) - .value(); - VecType result; - while (!end(chunked_array_iterators)) { - auto value = new_value(&chunked_array_iterators); - if (!value) { - return error(value.error().what()); - } - result.emplace_back(std::move(*value)); - } - return result; - } catch (const std::exception& e) { - return error(e.what()); - } + return make_chunked_array_iterators>(table_) + .and_then([&](auto chunked_array_iterators) -> Result { + VecType result; + while (!end(chunked_array_iterators)) { + auto value = new_value(&chunked_array_iterators); + if (!value) { + return error(value.error().what()); + } + result.emplace_back(std::move(*value)); + } + return result; + }); } private: @@ -72,16 +68,19 @@ class ArrowReader { // auto view = ProcessorsType::template process(to_view(*ptr)); auto view = to_view(*ptr); using ViewType = std::remove_cvref_t; - auto set = std::array(); - set.fill(false); try { const auto set_one = [&](std::integral_constant) { using FieldType = tuple_element_t<_i, typename ViewType::Fields>; using T = std::remove_cvref_t< std::remove_pointer_t>; - ::new (view.template get<_i>()) T( - std::move((*_chunked_array_iterators->template get<_i>()).value())); - std::get<_i>(set) = true; + auto res = *_chunked_array_iterators->template get<_i>(); + if (!res) { + destroy_value<_i>(&view); + throw std::runtime_error( + std::string("Field '") + 
typename FieldType::Name().str() + + std::string("' could not be set: ") + res.error().what()); + } + ::new (view.template get<_i>()) T(std::move(*res)); ++_chunked_array_iterators->template get<_i>(); }; @@ -91,11 +90,23 @@ class ArrowReader { return std::move(*ptr); } catch (const std::exception& e) { - call_destructors_where_necessary(set, &view); return error(e.what()); } } + template + void destroy_value(ViewType* _view) const { + static_assert(_i < ViewType::size(), "_i out of bounds."); + auto set = std::array(); + for (size_t i = 0; i < _i; ++i) { + set[i] = true; + } + for (size_t i = _i; i < ViewType::size(); ++i) { + set[i] = false; + } + call_destructors_where_necessary(set, _view); + } + private: Ref table_; }; From bedac406425346aff54a7bf460bd2df770e7d3bd Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 24 Aug 2025 13:31:01 +0200 Subject: [PATCH 10/36] Added the load function --- include/rfl/parquet.hpp | 2 +- include/rfl/parquet/load.hpp | 2 +- tests/parquet/test_save_load.cpp | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp index c747d945..cc07bad7 100644 --- a/include/rfl/parquet.hpp +++ b/include/rfl/parquet.hpp @@ -5,7 +5,7 @@ // #include "parquet/Parser.hpp" // #include "parquet/Reader.hpp" // #include "parquet/Writer.hpp" -// #include "parquet/load.hpp" +#include "parquet/load.hpp" #include "parquet/read.hpp" #include "parquet/save.hpp" #include "parquet/write.hpp" diff --git a/include/rfl/parquet/load.hpp b/include/rfl/parquet/load.hpp index 4d4490d8..e8802ab5 100644 --- a/include/rfl/parquet/load.hpp +++ b/include/rfl/parquet/load.hpp @@ -5,7 +5,7 @@ #include "../io/load_bytes.hpp" #include "read.hpp" -namespace rfl ::parquet { +namespace rfl::parquet { template Result load(const std::string& _fname) { diff --git a/tests/parquet/test_save_load.cpp b/tests/parquet/test_save_load.cpp index 573fd297..f0856b88 100644 --- 
a/tests/parquet/test_save_load.cpp +++ b/tests/parquet/test_save_load.cpp @@ -23,7 +23,7 @@ struct Person { }; TEST(parquet, test_save_load) { - const auto people = + const auto people1 = std::vector({Person{.first_name = "Bart", .birthday = "1987-04-19", .age = 10, @@ -41,13 +41,14 @@ TEST(parquet, test_save_load) { .age = 45, .email = "homer@simpson.com"}}); - rfl::parquet::save("people.parquet", people); + rfl::parquet::save("people.parquet", people1); - /*const auto homer2 = rfl::toml::load("homer.toml").value(); + const auto people2 = + rfl::parquet::load>("people.parquet").value(); - const auto string1 = rfl::toml::write(homer1); - const auto string2 = rfl::toml::write(homer2); + const auto bytes1 = rfl::parquet::write(people1); + const auto bytes2 = rfl::parquet::write(people2); - EXPECT_EQ(string1, string2);*/ + EXPECT_EQ(bytes1, bytes2); } } // namespace test_save_load From 255d42129dc23dfd7413ea38e5a6fbdc9d73d397 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Mon, 25 Aug 2025 23:20:24 +0200 Subject: [PATCH 11/36] Added support for timestamps --- include/rfl/Timestamp.hpp | 11 +++++ include/rfl/parsing/tabular/ArrowTypes.hpp | 44 ++++++++++++++++++- include/rfl/parsing/tabular/ArrowWriter.hpp | 8 ++-- .../parsing/tabular/ChunkedArrayIterator.hpp | 4 ++ ...builders_t.hpp => make_arrow_builders.hpp} | 13 ++++-- .../parsing/tabular/make_arrow_data_types.hpp | 2 +- .../rfl/parsing/tabular/make_arrow_schema.hpp | 2 +- tests/parquet/test_readme_example.cpp | 3 +- tests/parquet/test_save_load.cpp | 3 +- 9 files changed, 75 insertions(+), 15 deletions(-) rename include/rfl/parsing/tabular/{arrow_builders_t.hpp => make_arrow_builders.hpp} (80%) diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index d5302683..4ba33958 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -37,6 +37,11 @@ class Timestamp { Timestamp(const std::tm& _tm) : tm_(_tm) {} + Timestamp(const time_t _t) : tm_(std::tm{}) { + auto t = _t; + tm_ = 
*std::gmtime(&t); + } + ~Timestamp() = default; /// Returns a result containing the timestamp when successful or an Error @@ -71,6 +76,12 @@ class Timestamp { /// Trivial (const) accessor to the underlying time stamp. const std::tm& tm() const { return tm_; } + /// Returns time_t by calling std::mktime under-the-hood. + time_t to_time_t() const { + auto tm = tm_; + return std::mktime(&tm) - timezone; + } + private: #if defined(_MSC_VER) || defined(__MINGW32__) // This workaround is necessary, because strptime is not available on Windows. diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index e399ef0a..d0827dd5 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -39,6 +39,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -54,6 +56,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -69,6 +73,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -84,6 +90,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -99,6 +107,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -114,6 +124,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -129,6 +141,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -144,6 +158,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto 
make_builder() { return BuilderType(); } }; template <> @@ -159,6 +175,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -174,6 +192,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template <> @@ -189,6 +209,8 @@ struct ArrowTypes { throw std::runtime_error(status.message()); } } + + static auto make_builder() { return BuilderType(); } }; template @@ -199,11 +221,15 @@ struct ArrowTypes> { static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } static void add_to_builder(const auto _val, BuilderType* _builder) { - const auto status = _builder->Append(_val); + const auto status = _builder->Append(_val.to_time_t()); if (!status.ok()) { throw std::runtime_error(status.message()); } } + + static auto make_builder() { + return BuilderType(data_type(), arrow::default_memory_pool()); + } }; template @@ -221,6 +247,10 @@ struct ArrowTypes { ArrowTypes::add_to_builder(_val.reflection(), _builder); } + + static auto make_builder() { + return ArrowTypes::make_builder(); + } }; template @@ -240,6 +270,8 @@ struct ArrowTypes> { } } } + + static auto make_builder() { return ArrowTypes::make_builder(); } }; template @@ -259,6 +291,8 @@ struct ArrowTypes> { } } } + + static auto make_builder() { return ArrowTypes::make_builder(); } }; template @@ -278,6 +312,8 @@ struct ArrowTypes> { } } } + + static auto make_builder() { return ArrowTypes::make_builder(); } }; template @@ -290,6 +326,8 @@ struct ArrowTypes> { static void add_to_builder(const auto _val, BuilderType* _builder) { ArrowTypes::add_to_builder(*_val, _builder); } + + static auto make_builder() { return ArrowTypes::make_builder(); } }; template @@ -302,6 +340,8 @@ struct ArrowTypes> { static void add_to_builder(const auto _val, BuilderType* _builder) { ArrowTypes::add_to_builder(*_val, _builder); } + + static auto 
make_builder() { return ArrowTypes::make_builder(); } }; template @@ -314,6 +354,8 @@ struct ArrowTypes> { static void add_to_builder(const auto _val, BuilderType* _builder) { ArrowTypes::add_to_builder(_val.value(), _builder); } + + static auto make_builder() { return ArrowTypes::make_builder(); } }; } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp index bfa3656e..c0a082e5 100644 --- a/include/rfl/parsing/tabular/ArrowWriter.hpp +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -13,7 +13,7 @@ #include "../../named_tuple_t.hpp" #include "../../to_view.hpp" #include "add_to_builder.hpp" -#include "arrow_builders_t.hpp" +#include "make_arrow_builders.hpp" #include "make_arrow_data_types.hpp" #include "make_arrow_schema.hpp" @@ -44,10 +44,10 @@ class ArrowWriter { template std::vector> ArrowWriter::to_chunked_arrays(const VecType& _data) const { - using BuildersType = arrow_builders_t; - BuildersType builders; + auto builders = + make_arrow_builders>(); - constexpr size_t size = tuple_size_v; + constexpr size_t size = tuple_size_v; std::vector>> array_chunks(size); diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index acbda921..ac7f9db1 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -35,6 +35,10 @@ class ChunkedArrayIterator { if constexpr (std::is_same_v) { return current_chunk_.transform( [&](const auto& _c) { return T(std::string(_c->Value(ix_))); }); + + } else if constexpr (std::is_same_v) { + return current_chunk_.transform( + [&](const auto& _c) { return T(_c->Value(ix_) / 1000); }); } else { return current_chunk_.transform( [&](const auto& _c) { return T(_c->Value(ix_)); }); diff --git a/include/rfl/parsing/tabular/arrow_builders_t.hpp b/include/rfl/parsing/tabular/make_arrow_builders.hpp similarity index 80% rename 
from include/rfl/parsing/tabular/arrow_builders_t.hpp rename to include/rfl/parsing/tabular/make_arrow_builders.hpp index c87726ac..501694ee 100644 --- a/include/rfl/parsing/tabular/arrow_builders_t.hpp +++ b/include/rfl/parsing/tabular/make_arrow_builders.hpp @@ -1,5 +1,5 @@ -#ifndef RFL_PARSING_TABULAR_ARROWBUILDERST_HPP_ -#define RFL_PARSING_TABULAR_ARROWBUILDERST_HPP_ +#ifndef RFL_PARSING_TABULAR_MAKEARROWBUILDERS_HPP_ +#define RFL_PARSING_TABULAR_MAKEARROWBUILDERS_HPP_ #include @@ -35,6 +35,10 @@ struct ArrowBuildersType> { }(std::make_integer_sequence()); } + static Type make_builders() { + return Type(ArrowTypes::make_builder()...); + } + static auto schema() { const auto fields = std::vector>( {arrow::field(typename FieldTypes::Name().str(), @@ -44,8 +48,9 @@ struct ArrowBuildersType> { }; template -using arrow_builders_t = - typename ArrowBuildersType>>::Type; +auto make_arrow_builders() { + return ArrowBuildersType>::make_builders(); +} } // namespace rfl::parsing::tabular diff --git a/include/rfl/parsing/tabular/make_arrow_data_types.hpp b/include/rfl/parsing/tabular/make_arrow_data_types.hpp index a85ed26f..0fb237bd 100644 --- a/include/rfl/parsing/tabular/make_arrow_data_types.hpp +++ b/include/rfl/parsing/tabular/make_arrow_data_types.hpp @@ -4,7 +4,7 @@ #include #include "../../named_tuple_t.hpp" -#include "arrow_builders_t.hpp" +#include "make_arrow_builders.hpp" namespace rfl::parsing::tabular { diff --git a/include/rfl/parsing/tabular/make_arrow_schema.hpp b/include/rfl/parsing/tabular/make_arrow_schema.hpp index ba2a3ac4..b9c6268a 100644 --- a/include/rfl/parsing/tabular/make_arrow_schema.hpp +++ b/include/rfl/parsing/tabular/make_arrow_schema.hpp @@ -4,7 +4,7 @@ #include #include "../../named_tuple_t.hpp" -#include "arrow_builders_t.hpp" +#include "make_arrow_builders.hpp" namespace rfl::parsing::tabular { diff --git a/tests/parquet/test_readme_example.cpp b/tests/parquet/test_readme_example.cpp index 7e58c0d1..48fc082c 100644 --- 
a/tests/parquet/test_readme_example.cpp +++ b/tests/parquet/test_readme_example.cpp @@ -13,8 +13,7 @@ struct Person { rfl::Rename<"firstName", std::string> first_name; rfl::Rename<"lastName", std::string> last_name = "Simpson"; std::string town = "Springfield"; - // rfl::Timestamp<"%Y-%m-%d"> birthday; - std::string birthday; + rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; rfl::Email email; }; diff --git a/tests/parquet/test_save_load.cpp b/tests/parquet/test_save_load.cpp index f0856b88..b0e19619 100644 --- a/tests/parquet/test_save_load.cpp +++ b/tests/parquet/test_save_load.cpp @@ -16,8 +16,7 @@ struct Person { rfl::Rename<"firstName", std::string> first_name; rfl::Rename<"lastName", std::string> last_name = "Simpson"; std::string town = "Springfield"; - // rfl::Timestamp<"%Y-%m-%d"> birthday; - std::string birthday; + rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; rfl::Email email; }; From f3928a127d12f4bb1560bf7b81e0c4a0bb9a448b Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Tue, 26 Aug 2025 21:20:59 +0200 Subject: [PATCH 12/36] Added support for processors --- include/rfl/parquet/read.hpp | 2 +- include/rfl/parquet/write.hpp | 10 ++--- include/rfl/parsing/tabular/ArrowReader.hpp | 6 +-- include/rfl/parsing/tabular/ArrowWriter.hpp | 12 ++++-- tests/parquet/test_camel_case.cpp | 42 +++++++++++++++++++++ 5 files changed, 58 insertions(+), 14 deletions(-) create mode 100644 tests/parquet/test_camel_case.cpp diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp index ea037f9a..8091ee4e 100644 --- a/include/rfl/parquet/read.hpp +++ b/include/rfl/parquet/read.hpp @@ -42,7 +42,7 @@ Result> read( return error("Could not read table: " + status.message()); } - using ArrowReader = parsing::tabular::ArrowReader; + using ArrowReader = parsing::tabular::ArrowReader>; return ArrowReader::make(table).and_then( [](const auto& _r) { return _r.read(); }); diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index 
ff21a9eb..ca0e3ddf 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -21,11 +21,11 @@ namespace rfl::parquet { /// Returns parquet bytes. template Ref to_buffer(const auto& _arr) { - /// TODO: Support processors using T = std::remove_cvref_t; const auto table = - parsing::tabular::ArrowWriter(/*chunksize=*/2000).to_table(_arr); + parsing::tabular::ArrowWriter>(/*chunksize=*/2000) + .to_table(_arr); const auto props = ::parquet::WriterProperties::Builder() .compression(arrow::Compression::SNAPPY) @@ -60,15 +60,15 @@ Ref to_buffer(const auto& _arr) { /// Returns parquet bytes. template std::vector write(const auto& _arr) { - const auto buffer = to_buffer(_arr); + const auto buffer = to_buffer(_arr); const auto view = std::string_view(*buffer); return std::vector(view.begin(), view.end()); } /// Writes a PARQUET into an ostream. template -std::ostream& write(const auto& _obj, std::ostream& _stream) noexcept { - auto buffer = to_buffer(_obj); +std::ostream& write(const auto& _arr, std::ostream& _stream) noexcept { + auto buffer = to_buffer(_arr); _stream << std::string_view(*buffer); return _stream; } diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 628212e1..fe6962f7 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -21,7 +21,7 @@ namespace rfl::parsing::tabular { -template +template class ArrowReader { public: using ValueType = typename std::remove_cvref_t; @@ -64,9 +64,7 @@ class ArrowReader { Result new_value(auto* _chunked_array_iterators) const noexcept { alignas(ValueType) unsigned char buf[sizeof(ValueType)]{}; auto ptr = internal::ptr_cast(&buf); - // TODO - // auto view = ProcessorsType::template process(to_view(*ptr)); - auto view = to_view(*ptr); + auto view = ProcessorsType::template process(to_view(*ptr)); using ViewType = std::remove_cvref_t; try { const auto set_one = [&](std::integral_constant) { diff 
--git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp index c0a082e5..162df7bb 100644 --- a/include/rfl/parsing/tabular/ArrowWriter.hpp +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -19,7 +19,7 @@ namespace rfl::parsing::tabular { -template +template class ArrowWriter { public: using ValueType = typename std::remove_cvref_t; @@ -41,9 +41,12 @@ class ArrowWriter { size_t chunksize_; }; -template +template std::vector> -ArrowWriter::to_chunked_arrays(const VecType& _data) const { +ArrowWriter::to_chunked_arrays( + const VecType& _data) const { + using ValueType = typename VecType::value_type; + auto builders = make_arrow_builders>(); @@ -58,7 +61,8 @@ ArrowWriter::to_chunked_arrays(const VecType& _data) const { for (; it != _data.end() && (i < chunksize_ || chunksize_ == 0); ++i, ++it) { - const auto view = to_view(*it); + const auto view = + ProcessorsType::template process(to_view(*it)); [&](const auto& _v, auto* _b, std::integer_sequence) { diff --git a/tests/parquet/test_camel_case.cpp b/tests/parquet/test_camel_case.cpp new file mode 100644 index 00000000..8b84b520 --- /dev/null +++ b/tests/parquet/test_camel_case.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_camel_case { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_camel_case) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + 
Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_camel_case From d49933174c9bd2298f0993fe1449d41e1a5a3b2f Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Tue, 26 Aug 2025 21:21:08 +0200 Subject: [PATCH 13/36] Added support for optionals --- .../parsing/tabular/ChunkedArrayIterator.hpp | 15 +++++++ tests/parquet/test_optionals.cpp | 40 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tests/parquet/test_optionals.cpp diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index ac7f9db1..20343ed3 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -10,6 +10,7 @@ #include "../../Ref.hpp" #include "../../Result.hpp" +#include "../is_required.hpp" #include "array_t.hpp" namespace rfl::parsing::tabular { @@ -32,6 +33,19 @@ class ChunkedArrayIterator { ~ChunkedArrayIterator() = default; Result operator*() const noexcept { + const bool is_null = + current_chunk_ + .transform([&](const auto& _c) { return _c->IsNull(ix_); }) + .value_or(false); + + if (is_null) { + if constexpr (is_required()) { + return error("Value cannot be null."); + } else { + return T(); + } + } + if constexpr (std::is_same_v) { return current_chunk_.transform( [&](const auto& _c) { return T(std::string(_c->Value(ix_))); }); @@ -39,6 +53,7 @@ class ChunkedArrayIterator { } else if constexpr (std::is_same_v) { return current_chunk_.transform( [&](const auto& _c) { return T(_c->Value(ix_) / 1000); }); + } else { return current_chunk_.transform( [&](const auto& _c) { return T(_c->Value(ix_)); }); diff --git a/tests/parquet/test_optionals.cpp b/tests/parquet/test_optionals.cpp new file mode 100644 index 00000000..b5d3df80 --- /dev/null +++ b/tests/parquet/test_optionals.cpp @@ -0,0 +1,40 @@ +#include +#include +#include 
+#include +#include + +#include "write_and_read.hpp" + +namespace test_optionals { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::optional email; +}; + +TEST(parquet, test_optionals) { + const auto people = std::vector( + {Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_optionals From 2b832c07a7e043318bd093d234ae6124fed62a42 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Tue, 26 Aug 2025 21:45:54 +0200 Subject: [PATCH 14/36] Added a test for deques --- tests/parquet/test_deque.cpp | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/parquet/test_deque.cpp diff --git a/tests/parquet/test_deque.cpp b/tests/parquet/test_deque.cpp new file mode 100644 index 00000000..41b40ab8 --- /dev/null +++ b/tests/parquet/test_deque.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_deque { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_deque) { + const auto people = + std::deque({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + 
Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_deque From c040b3a36bc17b6b8aa19113f2f5c2e0be0fd6c3 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Tue, 26 Aug 2025 22:50:15 +0200 Subject: [PATCH 15/36] Made sure that the processors actually do what they are supposed to --- include/rfl.hpp | 1 + include/rfl/internal/ptr_named_tuple_t.hpp | 2 +- include/rfl/named_tuple_t.hpp | 9 +++-- include/rfl/parquet/read.hpp | 2 +- include/rfl/parquet/write.hpp | 5 +-- include/rfl/parsing/tabular/ArrowReader.hpp | 7 ++-- include/rfl/parsing/tabular/ArrowWriter.hpp | 15 ++++---- include/rfl/view_t.hpp | 20 ++++++++++ tests/parquet/test_camel_case.cpp | 4 +- tests/parquet/test_literal.cpp | 42 +++++++++++++++++++++ tests/parquet/test_save_load.cpp | 4 +- 11 files changed, 87 insertions(+), 24 deletions(-) create mode 100644 include/rfl/view_t.hpp create mode 100644 tests/parquet/test_literal.cpp diff --git a/include/rfl.hpp b/include/rfl.hpp index ac5e7eac..4753749c 100644 --- a/include/rfl.hpp +++ b/include/rfl.hpp @@ -76,6 +76,7 @@ #include "rfl/to_view.hpp" #include "rfl/tuple_cat.hpp" #include "rfl/type_name_t.hpp" +#include "rfl/view_t.hpp" #include "rfl/visit.hpp" #ifdef _MSC_VER diff --git a/include/rfl/internal/ptr_named_tuple_t.hpp b/include/rfl/internal/ptr_named_tuple_t.hpp index c354e9be..7b7544b5 100644 --- a/include/rfl/internal/ptr_named_tuple_t.hpp +++ b/include/rfl/internal/ptr_named_tuple_t.hpp @@ -13,7 +13,7 @@ namespace internal { template using ptr_named_tuple_t = - typename std::invoke_result), T>::type; + std::invoke_result_t), T>; } // namespace internal } // namespace rfl diff --git a/include/rfl/named_tuple_t.hpp 
b/include/rfl/named_tuple_t.hpp index 4cb7786a..8e31100b 100644 --- a/include/rfl/named_tuple_t.hpp +++ b/include/rfl/named_tuple_t.hpp @@ -6,18 +6,19 @@ #include #include "NamedTuple.hpp" -#include "internal/ptr_named_tuple_t.hpp" +#include "Processors.hpp" #include "internal/remove_ptrs_nt.hpp" #include "to_named_tuple.hpp" +#include "view_t.hpp" namespace rfl { /// Generates the named tuple that is equivalent to the struct T. /// This is the result you would expect from calling to_named_tuple(my_struct). /// All fields of the struct must be an rfl::Field. -template -using named_tuple_t = typename internal::remove_ptrs_nt< - internal::ptr_named_tuple_t>::NamedTupleType; +template +using named_tuple_t = + typename internal::remove_ptrs_nt>::NamedTupleType; } // namespace rfl diff --git a/include/rfl/parquet/read.hpp b/include/rfl/parquet/read.hpp index 8091ee4e..68ff2e76 100644 --- a/include/rfl/parquet/read.hpp +++ b/include/rfl/parquet/read.hpp @@ -42,7 +42,7 @@ Result> read( return error("Could not read table: " + status.message()); } - using ArrowReader = parsing::tabular::ArrowReader>; + using ArrowReader = parsing::tabular::ArrowReader; return ArrowReader::make(table).and_then( [](const auto& _r) { return _r.read(); }); diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index ca0e3ddf..611f2896 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -23,9 +23,8 @@ template Ref to_buffer(const auto& _arr) { using T = std::remove_cvref_t; - const auto table = - parsing::tabular::ArrowWriter>(/*chunksize=*/2000) - .to_table(_arr); + const auto table = parsing::tabular::ArrowWriter(/*chunksize=*/2000) + .to_table(_arr); const auto props = ::parquet::WriterProperties::Builder() .compression(arrow::Compression::SNAPPY) diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index fe6962f7..0063b0e6 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ 
b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -16,12 +16,13 @@ #include "../../get.hpp" #include "../../named_tuple_t.hpp" #include "../../to_view.hpp" +#include "../../view_t.hpp" #include "../call_destructors_where_necessary.hpp" #include "make_chunked_array_iterators.hpp" namespace rfl::parsing::tabular { -template +template class ArrowReader { public: using ValueType = typename std::remove_cvref_t; @@ -37,7 +38,7 @@ class ArrowReader { ~ArrowReader() = default; Result read() const noexcept { - return make_chunked_array_iterators>(table_) + return make_chunked_array_iterators>(table_) .and_then([&](auto chunked_array_iterators) -> Result { VecType result; while (!end(chunked_array_iterators)) { @@ -64,7 +65,7 @@ class ArrowReader { Result new_value(auto* _chunked_array_iterators) const noexcept { alignas(ValueType) unsigned char buf[sizeof(ValueType)]{}; auto ptr = internal::ptr_cast(&buf); - auto view = ProcessorsType::template process(to_view(*ptr)); + auto view = to_view(*ptr); using ViewType = std::remove_cvref_t; try { const auto set_one = [&](std::integral_constant) { diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp index 162df7bb..0709faa6 100644 --- a/include/rfl/parsing/tabular/ArrowWriter.hpp +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -19,7 +19,7 @@ namespace rfl::parsing::tabular { -template +template class ArrowWriter { public: using ValueType = typename std::remove_cvref_t; @@ -29,8 +29,9 @@ class ArrowWriter { ~ArrowWriter() = default; std::shared_ptr to_table(const VecType& _data) const { - return arrow::Table::Make(make_arrow_schema(), - to_chunked_arrays(_data)); + return arrow::Table::Make( + make_arrow_schema>(), + to_chunked_arrays(_data)); } private: @@ -41,10 +42,9 @@ class ArrowWriter { size_t chunksize_; }; -template +template std::vector> -ArrowWriter::to_chunked_arrays( - const VecType& _data) const { +ArrowWriter::to_chunked_arrays(const VecType& _data) const { using 
ValueType = typename VecType::value_type; auto builders = @@ -61,8 +61,7 @@ ArrowWriter::to_chunked_arrays( for (; it != _data.end() && (i < chunksize_ || chunksize_ == 0); ++i, ++it) { - const auto view = - ProcessorsType::template process(to_view(*it)); + const auto view = to_view(*it); [&](const auto& _v, auto* _b, std::integer_sequence) { diff --git a/include/rfl/view_t.hpp b/include/rfl/view_t.hpp new file mode 100644 index 00000000..c1c7569d --- /dev/null +++ b/include/rfl/view_t.hpp @@ -0,0 +1,20 @@ +#ifndef RFL_VIEW_T_HPP_ +#define RFL_VIEW_T_HPP_ + +#include + +#include "Processors.hpp" +#include "internal/ptr_named_tuple_t.hpp" + +namespace rfl { + +/// Generates the named tuple that would be the result of to_view +template +using view_t = + std::invoke_result_t::template process< + T, internal::ptr_named_tuple_t>), + internal::ptr_named_tuple_t>; + +} // namespace rfl + +#endif diff --git a/tests/parquet/test_camel_case.cpp b/tests/parquet/test_camel_case.cpp index 8b84b520..7eeffa30 100644 --- a/tests/parquet/test_camel_case.cpp +++ b/tests/parquet/test_camel_case.cpp @@ -10,8 +10,8 @@ namespace test_camel_case { using Age = rfl::Validator, rfl::Maximum<130>>; struct Person { - rfl::Rename<"firstName", std::string> first_name; - rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string first_name; + std::string last_name = "Simpson"; std::string town = "Springfield"; rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; diff --git a/tests/parquet/test_literal.cpp b/tests/parquet/test_literal.cpp new file mode 100644 index 00000000..1e36b6ca --- /dev/null +++ b/tests/parquet/test_literal.cpp @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_literal { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + std::string first_name; + rfl::Literal<"Simpson"> last_name; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; 
+ +TEST(parquet, test_literal) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_literal diff --git a/tests/parquet/test_save_load.cpp b/tests/parquet/test_save_load.cpp index b0e19619..fe760dc6 100644 --- a/tests/parquet/test_save_load.cpp +++ b/tests/parquet/test_save_load.cpp @@ -13,8 +13,8 @@ using Age = rfl::Validator, rfl::Maximum<130>>>; struct Person { - rfl::Rename<"firstName", std::string> first_name; - rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string first_name; + std::string last_name = "Simpson"; std::string town = "Springfield"; rfl::Timestamp<"%Y-%m-%d"> birthday; Age age; From 234084d9c499c1432137cec49bcde1fc704abf39 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Tue, 26 Aug 2025 23:34:02 +0200 Subject: [PATCH 16/36] Add static_asserts for the processors that make no sense in a tabular context --- include/rfl/parsing/tabular/ArrowReader.hpp | 13 +++++++++++++ include/rfl/parsing/tabular/ArrowWriter.hpp | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/rfl/parsing/tabular/ArrowReader.hpp b/include/rfl/parsing/tabular/ArrowReader.hpp index 0063b0e6..153778b7 100644 --- a/include/rfl/parsing/tabular/ArrowReader.hpp +++ b/include/rfl/parsing/tabular/ArrowReader.hpp @@ -10,6 +10,7 @@ #include #include +#include "../../Processors.hpp" #include "../../Result.hpp" #include "../../Tuple.hpp" #include "../../apply.hpp" @@ -24,6 +25,18 @@ namespace rfl::parsing::tabular { template class ArrowReader { + static_assert(!Processors::add_tags_to_variants_, + "rfl::AddTagsToVariants cannot be used for tabular data."); + static_assert(!Processors::all_required_, + "rfl::NoOptionals cannot be used for tabular data."); + static_assert(!Processors::default_if_missing_, + "rfl::DefaultIfMissing cannot be used for tabular data."); + static_assert(!Processors::no_extra_fields_, + "rfl::NoExtraFields cannot be used for tabular data (neither " + "can rfl::ExtraFields)."); + static_assert(!Processors::no_field_names_, + "rfl::NoFieldNames cannot be used for tabular data."); + public: using ValueType = typename std::remove_cvref_t; diff --git a/include/rfl/parsing/tabular/ArrowWriter.hpp b/include/rfl/parsing/tabular/ArrowWriter.hpp index 0709faa6..5deae33d 100644 --- a/include/rfl/parsing/tabular/ArrowWriter.hpp +++ b/include/rfl/parsing/tabular/ArrowWriter.hpp @@ -8,6 +8,7 @@ #include #include +#include "../../Processors.hpp" #include "../../Tuple.hpp" #include "../../get.hpp" #include "../../named_tuple_t.hpp" @@ -21,6 +22,18 @@ namespace rfl::parsing::tabular { template class ArrowWriter { + static_assert(!Processors::add_tags_to_variants_, + "rfl::AddTagsToVariants cannot be used for tabular data."); 
+ static_assert(!Processors::all_required_, + "rfl::NoOptionals cannot be used for tabular data."); + static_assert(!Processors::default_if_missing_, + "rfl::DefaultIfMissing cannot be used for tabular data."); + static_assert(!Processors::no_extra_fields_, + "rfl::NoExtraFields cannot be used for tabular data (neither " + "can rfl::ExtraFields)."); + static_assert(!Processors::no_field_names_, + "rfl::NoFieldNames cannot be used for tabular data."); + public: using ValueType = typename std::remove_cvref_t; From 02715ade637ffffb7d72b54dc46f138cb5968fb7 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Wed, 27 Aug 2025 00:01:32 +0200 Subject: [PATCH 17/36] Added support for bytestrings --- include/rfl/parsing/tabular/ArrowTypes.hpp | 21 ++++++++++++++++ .../parsing/tabular/ChunkedArrayIterator.hpp | 10 +++++++- tests/parquet/test_bytestring.cpp | 24 +++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 tests/parquet/test_bytestring.cpp diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index d0827dd5..b5bfa6a7 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -17,8 +17,10 @@ #include "../../Rename.hpp" #include "../../Timestamp.hpp" #include "../../Tuple.hpp" +#include "../../concepts.hpp" #include "../../internal/StringLiteral.hpp" #include "../../internal/has_reflection_type_v.hpp" +#include "../../internal/ptr_cast.hpp" #include "../../named_tuple_t.hpp" namespace rfl::parsing::tabular { @@ -213,6 +215,25 @@ struct ArrowTypes { static auto make_builder() { return BuilderType(); } }; +template + requires concepts::ContiguousByteContainer +struct ArrowTypes { + using ArrayType = arrow::BinaryArray; + using BuilderType = arrow::BinaryBuilder; + + static auto data_type() { return arrow::binary(); } + + static void add_to_builder(const auto& _val, BuilderType* _builder) { + const auto status = _builder->Append( + 
internal::ptr_cast(_val.data()), _val.size()); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static auto make_builder() { return BuilderType(); } +}; + template struct ArrowTypes> { using ArrayType = arrow::TimestampArray; diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index 20343ed3..21fb8f53 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -10,6 +10,7 @@ #include "../../Ref.hpp" #include "../../Result.hpp" +#include "../../internal/ptr_cast.hpp" #include "../is_required.hpp" #include "array_t.hpp" @@ -46,7 +47,14 @@ class ChunkedArrayIterator { } } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return current_chunk_.transform([&](const auto& _c) { + const auto begin = internal::ptr_cast( + _c->Value(ix_).data()); + return T(begin, begin + _c->Value(ix_).size()); + }); + + } else if constexpr (std::is_same_v) { return current_chunk_.transform( [&](const auto& _c) { return T(std::string(_c->Value(ix_))); }); diff --git a/tests/parquet/test_bytestring.cpp b/tests/parquet/test_bytestring.cpp new file mode 100644 index 00000000..55e9f4e8 --- /dev/null +++ b/tests/parquet/test_bytestring.cpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_bytestring { + +struct TestStruct { + rfl::Bytestring bytestring; +}; + +TEST(parquet, test_bytestring) { + const auto test_struct = + TestStruct{.bytestring = rfl::Bytestring({std::byte{13}, std::byte{14}, + std::byte{15}, std::byte{16}})}; + + const auto test_structs = std::vector( + {test_struct, test_struct, test_struct, test_struct}); + + write_and_read(test_structs); +} +} // namespace test_bytestring From 71bbd9b7e2026ee515cd774359eff2895283afda Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sat, 30 Aug 2025 22:39:32 +0200 Subject: [PATCH 18/36] Added support for boolean columns --- include/rfl/parquet.hpp | 3 - include/rfl/parquet/Parser.hpp | 15 --- include/rfl/parquet/Reader.hpp | 146 --------------------- include/rfl/parquet/Writer.hpp | 117 ----------------- include/rfl/parsing/tabular/ArrowTypes.hpp | 17 +++ tests/parquet/test_boolean.cpp | 47 +++++++ 6 files changed, 64 insertions(+), 281 deletions(-) delete mode 100644 include/rfl/parquet/Parser.hpp delete mode 100644 include/rfl/parquet/Reader.hpp delete mode 100644 include/rfl/parquet/Writer.hpp create mode 100644 tests/parquet/test_boolean.cpp diff --git a/include/rfl/parquet.hpp b/include/rfl/parquet.hpp index cc07bad7..6d7b847a 100644 --- a/include/rfl/parquet.hpp +++ b/include/rfl/parquet.hpp @@ -2,9 +2,6 @@ #define RFL_PARQUET_HPP_ #include "../rfl.hpp" -// #include "parquet/Parser.hpp" -// #include "parquet/Reader.hpp" -// #include "parquet/Writer.hpp" #include "parquet/load.hpp" #include "parquet/read.hpp" #include "parquet/save.hpp" diff --git a/include/rfl/parquet/Parser.hpp b/include/rfl/parquet/Parser.hpp deleted file mode 100644 index c4149b97..00000000 --- a/include/rfl/parquet/Parser.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef RFL_PARQUET_PARSER_HPP_ -#define RFL_PARQUET_PARSER_HPP_ - -#include "../parsing/Parser.hpp" -#include "Reader.hpp" -#include "Writer.hpp" - -namespace rfl::parquet { - -template -using Parser = parsing::Parser; - -} // namespace rfl::parquet - -#endif diff --git a/include/rfl/parquet/Reader.hpp b/include/rfl/parquet/Reader.hpp deleted file mode 100644 index dfce8595..00000000 --- a/include/rfl/parquet/Reader.hpp +++ /dev/null @@ -1,146 +0,0 @@ -#ifndef RFL_PARQUET_READER_HPP_ -#define RFL_PARQUET_READER_HPP_ - -#include - -#include -#include -#include -#include -#include - -#include "../Bytestring.hpp" -#include "../Result.hpp" -#include "../always_false.hpp" -#include "../internal/ptr_cast.hpp" - -namespace rfl::parquet { - -struct 
Reader { - using InputArrayType = parquet_object_array; - using InputObjectType = parquet_object_map; - using InputVarType = parquet_object; - - template - static constexpr bool has_custom_constructor = - (requires(InputVarType var) { T::from_parquet_obj(var); }); - - rfl::Result get_field_from_array( - const size_t _idx, const InputArrayType _arr) const noexcept { - if (_idx >= _arr.size) { - return error("Index " + std::to_string(_idx) + " of of bounds."); - } - return _arr.ptr[_idx]; - } - - rfl::Result get_field_from_object( - const std::string& _name, const InputObjectType& _obj) const noexcept { - for (uint32_t i = 0; i < _obj.size; ++i) { - const auto& key = _obj.ptr[i].key; - if (key.type != PARQUET_OBJECT_STR) { - return error("Key in element " + std::to_string(i) + - " was not a string."); - } - const auto current_name = - std::string_view(key.via.str.ptr, key.via.str.size); - if (_name == current_name) { - return _obj.ptr[i].val; - } - } - return error("No field named '" + _name + "' was found."); - } - - bool is_empty(const InputVarType& _var) const noexcept { - return _var.type == PARQUET_OBJECT_NIL; - } - - template - rfl::Result to_basic_type(const InputVarType& _var) const noexcept { - const auto type = _var.type; - if constexpr (std::is_same, std::string>()) { - if (type != PARQUET_OBJECT_STR) { - return error("Could not cast to string."); - } - const auto str = _var.via.str; - return std::string(str.ptr, str.size); - - } else if constexpr (std::is_same, - rfl::Bytestring>()) { - if (type != PARQUET_OBJECT_BIN) { - return error("Could not cast to a bytestring."); - } - const auto bin = _var.via.bin; - const auto data = internal::ptr_cast(bin.ptr); - return rfl::Bytestring(data, data + bin.size); - - } else if constexpr (std::is_same, bool>()) { - if (type != PARQUET_OBJECT_BOOLEAN) { - return error("Could not cast to boolean."); - } - return _var.via.boolean; - - } else if constexpr (std::is_floating_point>()) { - if (type == PARQUET_OBJECT_FLOAT32 
|| type == PARQUET_OBJECT_FLOAT64 || - type == PARQUET_OBJECT_FLOAT) { - return static_cast(_var.via.f64); - } - return error( - "Could not cast to numeric value. The type must be float " - "or double."); - - } else if constexpr (std::is_integral>()) { - if (type == PARQUET_OBJECT_POSITIVE_INTEGER) { - return static_cast(_var.via.u64); - } else if (type == PARQUET_OBJECT_NEGATIVE_INTEGER) { - return static_cast(_var.via.i64); - } - return error( - "Could not cast to numeric value. The type must be integral, float " - "or double."); - } else { - static_assert(rfl::always_false_v, "Unsupported type."); - } - } - - template - std::optional read_array(const ArrayReader& _array_reader, - const InputArrayType& _arr) const noexcept { - for (uint32_t i = 0; i < _arr.size; ++i) { - const auto err = _array_reader.read(_arr.ptr[i]); - if (err) { - return err; - } - } - return std::nullopt; - } - - template - std::optional read_object(const ObjectReader& _object_reader, - const InputObjectType& _obj) const noexcept { - for (uint32_t i = 0; i < _obj.size; ++i) { - const auto& key = _obj.ptr[i].key; - const auto& val = _obj.ptr[i].val; - if (key.type != PARQUET_OBJECT_STR) { - return rfl::Error("Key in element " + std::to_string(i) + - " was not a string."); - } - const auto name = std::string_view(key.via.str.ptr, key.via.str.size); - _object_reader.read(name, val); - } - return std::nullopt; - } - - template - rfl::Result use_custom_constructor( - const InputVarType& _var) const noexcept { - try { - return T::from_parquet_obj(_var); - } catch (std::exception& e) { - return error(e.what()); - } - } -}; - -} // namespace rfl::parquet - -#endif diff --git a/include/rfl/parquet/Writer.hpp b/include/rfl/parquet/Writer.hpp deleted file mode 100644 index cb5bc118..00000000 --- a/include/rfl/parquet/Writer.hpp +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef RFL_MSGPACK_WRITER_HPP_ -#define RFL_MSGPACK_WRITER_HPP_ - -#include - -#include -#include -#include -#include -#include -#include 
-#include -#include -#include - -#include "../Box.hpp" -#include "../Bytestring.hpp" -#include "../Ref.hpp" -#include "../Result.hpp" -#include "../Vectorstring.hpp" -#include "../always_false.hpp" - -namespace rfl::parquet { - -class Writer { - public: - struct ParquetOutputArray {}; - - struct ParquetOutputObject {}; - - struct ParquetOutputVar {}; - - using OutputArrayType = ParquetOutputArray; - using OutputObjectType = ParquetOutputObject; - using OutputVarType = ParquetOutputVar; - - Writer(parquet_packer* _pk); - - ~Writer(); - - OutputArrayType add_array_to_array(const size_t _size, - OutputArrayType* _parent) const noexcept; - - OutputArrayType add_array_to_object(const std::string_view& _name, - const size_t _size, - OutputObjectType* _parent) const noexcept; - - OutputObjectType add_object_to_array(const size_t _size, - OutputArrayType* _parent) const noexcept; - - OutputObjectType add_object_to_object( - const std::string_view& _name, const size_t _size, - OutputObjectType* _parent) const noexcept; - - template - OutputVarType add_value_to_array(const T& _var, - OutputArrayType* _parent) const noexcept { - return new_value(_var); - } - - template - OutputVarType add_value_to_object(const std::string_view& _name, - const T& _var, - OutputObjectType* _parent) const noexcept { - parquet_pack_str(pk_, _name.size()); - parquet_pack_str_body(pk_, _name.data(), _name.size()); - return new_value(_var); - } - - OutputVarType add_null_to_array(OutputArrayType* _parent) const noexcept; - - OutputVarType add_null_to_object(const std::string_view& _name, - OutputObjectType* _parent) const noexcept; - - void end_array(OutputArrayType* _arr) const noexcept; - - void end_object(OutputObjectType* _obj) const noexcept; - - private: - OutputArrayType new_array(const size_t _size) const noexcept; - - OutputObjectType new_object(const size_t _size) const noexcept; - - template - OutputVarType new_value(const T& _var) const noexcept { - using Type = std::remove_cvref_t; - if 
constexpr (std::is_same()) { - parquet_pack_str(pk_, _var.size()); - parquet_pack_str_body(pk_, _var.c_str(), _var.size()); - } else if constexpr (std::is_same() || - std::is_same()) { - parquet_pack_bin(pk_, _var.size()); - parquet_pack_bin_body(pk_, _var.data(), _var.size()); - } else if constexpr (std::is_same()) { - if (_var) { - parquet_pack_true(pk_); - } else { - parquet_pack_false(pk_); - } - } else if constexpr (std::is_floating_point()) { - parquet_pack_double(pk_, static_cast(_var)); - } else if constexpr (std::is_integral()) { - parquet_pack_int64(pk_, static_cast(_var)); - } else { - static_assert(rfl::always_false_v, "Unsupported type."); - } - return OutputVarType{}; - } - - private: - /// The underlying packer. - parquet_packer* pk_; -}; - -} // namespace rfl::parquet - -#endif diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index b5bfa6a7..2f5e4a32 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -28,6 +28,23 @@ namespace rfl::parsing::tabular { template struct ArrowTypes; +template <> +struct ArrowTypes { + using ArrayType = arrow::BooleanArray; + using BuilderType = arrow::BooleanBuilder; + + static auto data_type() { return arrow::boolean(); } + + static void add_to_builder(const bool _val, BuilderType* _builder) { + const auto status = _builder->Append(_val); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static auto make_builder() { return BuilderType(); } +}; + template <> struct ArrowTypes { using ArrayType = arrow::UInt8Array; diff --git a/tests/parquet/test_boolean.cpp b/tests/parquet/test_boolean.cpp new file mode 100644 index 00000000..e01f1ad3 --- /dev/null +++ b/tests/parquet/test_boolean.cpp @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_boolean { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + 
rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + bool is_child; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_boolean) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .is_child = true, + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .is_child = true, + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .is_child = true, + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .is_child = false, + .age = 45, + .email = "homer@simpson.com"}}); + + write_and_read(people); +} +} // namespace test_boolean From 5d43c558cfadf765de0758b07502f1cdfe10f778 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sat, 30 Aug 2025 23:36:17 +0200 Subject: [PATCH 19/36] Added support for enums --- include/rfl/parsing/tabular/ArrowTypes.hpp | 136 ++++++++++++++++++ .../parsing/tabular/ChunkedArrayIterator.hpp | 21 +-- tests/parquet/test_enums.cpp | 44 ++++++ 3 files changed, 182 insertions(+), 19 deletions(-) create mode 100644 tests/parquet/test_enums.cpp diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index 2f5e4a32..5cd828ed 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -18,6 +18,7 @@ #include "../../Timestamp.hpp" #include "../../Tuple.hpp" #include "../../concepts.hpp" +#include "../../enums.hpp" #include "../../internal/StringLiteral.hpp" #include "../../internal/has_reflection_type_v.hpp" #include "../../internal/ptr_cast.hpp" @@ -42,6 +43,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { 
return BuilderType(); } }; @@ -59,6 +65,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -76,6 +87,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -93,6 +109,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -110,6 +131,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -127,6 +153,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -144,6 +175,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -161,6 +197,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -178,6 +219,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -195,6 +241,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); + } + static auto make_builder() { return BuilderType(); } }; @@ -212,6 +263,11 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return _chunk->Value(_ix); 
+ } + static auto make_builder() { return BuilderType(); } }; @@ -229,6 +285,33 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, + const int64_t _ix) { + return std::string(_chunk->Value(_ix)); + } + + static auto make_builder() { return BuilderType(); } +}; + +template + requires enchantum::Enum +struct ArrowTypes { + using ArrayType = arrow::StringArray; + using BuilderType = arrow::StringBuilder; + + static auto data_type() { return arrow::utf8(); } + + static void add_to_builder(const T& _val, BuilderType* _builder) { + const auto status = _builder->Append(enum_to_string(_val)); + if (!status.ok()) { + throw std::runtime_error(status.message()); + } + } + + static Result get_value(const Ref& _chunk, const int64_t _ix) { + return string_to_enum(std::string(_chunk->Value(_ix))); + } + static auto make_builder() { return BuilderType(); } }; @@ -248,6 +331,12 @@ struct ArrowTypes { } } + static Result get_value(const Ref& _chunk, const int64_t _ix) { + const auto begin = internal::ptr_cast( + _chunk->Value(_ix).data()); + return T(begin, begin + _chunk->Value(_ix).size()); + } + static auto make_builder() { return BuilderType(); } }; @@ -265,6 +354,11 @@ struct ArrowTypes> { } } + static Result> get_value(const Ref& _chunk, + const int64_t _ix) { + return Timestamp<_format>(_chunk->Value(_ix) / 1000); + } + static auto make_builder() { return BuilderType(data_type(), arrow::default_memory_pool()); } @@ -286,6 +380,18 @@ struct ArrowTypes { _builder); } + static Result get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>:: + get_value(_chunk, _ix) + .and_then([](const auto& _v) -> Result { + try { + return T(_v); + } catch (const std::exception& e) { + return error(e.what()); + } + }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } @@ -309,6 +415,11 @@ struct ArrowTypes> { } } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + 
.transform([](const auto& _v) { return std::make_optional(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; @@ -330,6 +441,11 @@ struct ArrowTypes> { } } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return std::make_shared(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; @@ -351,6 +467,11 @@ struct ArrowTypes> { } } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return std::make_unique(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; @@ -365,6 +486,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(*_val, _builder); } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Box::make(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; @@ -379,6 +505,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(*_val, _builder); } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Ref::make(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; @@ -393,6 +524,11 @@ struct ArrowTypes> { ArrowTypes::add_to_builder(_val.value(), _builder); } + static auto get_value(const Ref& _chunk, const int64_t _ix) { + return ArrowTypes>::get_value(_chunk, _ix) + .transform([](const auto& _v) { return Rename<_name, T>(_v); }); + } + static auto make_builder() { return ArrowTypes::make_builder(); } }; diff --git a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp index 21fb8f53..68b229bf 100644 --- a/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp +++ 
b/include/rfl/parsing/tabular/ChunkedArrayIterator.hpp @@ -47,25 +47,8 @@ class ChunkedArrayIterator { } } - if constexpr (std::is_same_v) { - return current_chunk_.transform([&](const auto& _c) { - const auto begin = internal::ptr_cast( - _c->Value(ix_).data()); - return T(begin, begin + _c->Value(ix_).size()); - }); - - } else if constexpr (std::is_same_v) { - return current_chunk_.transform( - [&](const auto& _c) { return T(std::string(_c->Value(ix_))); }); - - } else if constexpr (std::is_same_v) { - return current_chunk_.transform( - [&](const auto& _c) { return T(_c->Value(ix_) / 1000); }); - - } else { - return current_chunk_.transform( - [&](const auto& _c) { return T(_c->Value(ix_)); }); - } + return current_chunk_.and_then( + [&](const auto& _c) { return ArrowTypes::get_value(_c, ix_); }); } bool end() const noexcept { diff --git a/tests/parquet/test_enums.cpp b/tests/parquet/test_enums.cpp new file mode 100644 index 00000000..ff3b2091 --- /dev/null +++ b/tests/parquet/test_enums.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_enums { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_enums) { + const auto people = + std::vector({Person{.first_name = FirstName::Bart, + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = FirstName::Lisa, + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = FirstName::Lisa, + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = FirstName::Homer, + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + 
write_and_read(people); +} +} // namespace test_enums From c351b28d4fe42c4e3c6bab5fe134225193a94ef8 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 11:14:14 +0200 Subject: [PATCH 20/36] Added settings --- include/rfl/parquet/Compression.hpp | 12 ++++++++ include/rfl/parquet/Settings.hpp | 28 ++++++++++++++++++ include/rfl/parquet/write.hpp | 22 ++++++++------ tests/parquet/test_gzip.cpp | 45 +++++++++++++++++++++++++++++ tests/parquet/write_and_read.hpp | 7 +++-- 5 files changed, 102 insertions(+), 12 deletions(-) create mode 100644 include/rfl/parquet/Compression.hpp create mode 100644 include/rfl/parquet/Settings.hpp create mode 100644 tests/parquet/test_gzip.cpp diff --git a/include/rfl/parquet/Compression.hpp b/include/rfl/parquet/Compression.hpp new file mode 100644 index 00000000..59dc229c --- /dev/null +++ b/include/rfl/parquet/Compression.hpp @@ -0,0 +1,12 @@ +#ifndef RFL_PARQUET_COMPRESSION_HPP_ +#define RFL_PARQUET_COMPRESSION_HPP_ + +#include + +namespace rfl::parquet { + +using Compression = arrow::Compression::type; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/Settings.hpp b/include/rfl/parquet/Settings.hpp new file mode 100644 index 00000000..60bd6906 --- /dev/null +++ b/include/rfl/parquet/Settings.hpp @@ -0,0 +1,28 @@ +#ifndef RFL_PARQUET_SETTINGS_HPP_ +#define RFL_PARQUET_SETTINGS_HPP_ + +#include "../Field.hpp" +#include "../replace.hpp" +#include "Compression.hpp" + +namespace rfl::parquet { + +struct Settings { + /// The size of the chunks of the parquet file. + size_t chunksize = 2000; + + /// The compression algorithm used to compress the parquet file. 
+ Compression compression = Compression::SNAPPY; + + Settings with_chunksize(const size_t _chunksize) const noexcept { + return replace(*this, make_field<"chunksize">(_chunksize)); + } + + Settings with_compression(const Compression _compression) const noexcept { + return replace(*this, make_field<"compression">(_compression)); + } +}; + +} // namespace rfl::parquet + +#endif diff --git a/include/rfl/parquet/write.hpp b/include/rfl/parquet/write.hpp index 611f2896..0dd397af 100644 --- a/include/rfl/parquet/write.hpp +++ b/include/rfl/parquet/write.hpp @@ -15,19 +15,21 @@ #include "../Processors.hpp" #include "../Ref.hpp" #include "../parsing/tabular/ArrowWriter.hpp" +#include "Settings.hpp" namespace rfl::parquet { /// Returns parquet bytes. template -Ref to_buffer(const auto& _arr) { +Ref to_buffer(const auto& _arr, const Settings& _settings) { using T = std::remove_cvref_t; - const auto table = parsing::tabular::ArrowWriter(/*chunksize=*/2000) - .to_table(_arr); + const auto table = + parsing::tabular::ArrowWriter(_settings.chunksize) + .to_table(_arr); const auto props = ::parquet::WriterProperties::Builder() - .compression(arrow::Compression::SNAPPY) + .compression(_settings.compression) ->build(); const auto arrow_props = @@ -41,7 +43,7 @@ Ref to_buffer(const auto& _arr) { const auto status = ::parquet::arrow::WriteTable( *table.get(), arrow::default_memory_pool(), output_buffer.ValueOrDie(), - /*chunk_size=*/2000, props, arrow_props); + _settings.chunksize, props, arrow_props); if (!status.ok()) { throw std::runtime_error(status.message()); @@ -58,16 +60,18 @@ Ref to_buffer(const auto& _arr) { /// Returns parquet bytes. 
template -std::vector write(const auto& _arr) { - const auto buffer = to_buffer(_arr); +std::vector write(const auto& _arr, + const Settings& _settings = Settings{}) { + const auto buffer = to_buffer(_arr, _settings); const auto view = std::string_view(*buffer); return std::vector(view.begin(), view.end()); } /// Writes a PARQUET into an ostream. template -std::ostream& write(const auto& _arr, std::ostream& _stream) noexcept { - auto buffer = to_buffer(_arr); +std::ostream& write(const auto& _arr, std::ostream& _stream, + const Settings& _settings = Settings{}) noexcept { + auto buffer = to_buffer(_arr, _settings); _stream << std::string_view(*buffer); return _stream; } diff --git a/tests/parquet/test_gzip.cpp b/tests/parquet/test_gzip.cpp new file mode 100644 index 00000000..ee254395 --- /dev/null +++ b/tests/parquet/test_gzip.cpp @@ -0,0 +1,45 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_gzip { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; +}; + +TEST(parquet, test_gzip) { + const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + + const auto settings = rfl::parquet::Settings{}.with_compression( + rfl::parquet::Compression::GZIP); + + write_and_read(people, settings); +} +} // namespace test_gzip diff --git a/tests/parquet/write_and_read.hpp b/tests/parquet/write_and_read.hpp index 
4bec476f..53ee3b9c 100644 --- a/tests/parquet/write_and_read.hpp +++ b/tests/parquet/write_and_read.hpp @@ -8,13 +8,14 @@ #include template -void write_and_read(const auto& _vec) { +void write_and_read(const auto& _vec, const rfl::parquet::Settings& _settings = + rfl::parquet::Settings{}) { using T = std::remove_cvref_t; - const auto serialized1 = rfl::parquet::write(_vec); + const auto serialized1 = rfl::parquet::write(_vec, _settings); const auto res = rfl::parquet::read(serialized1); EXPECT_TRUE(res && true) << "Test failed on read. Error: " << res.error().what(); - const auto serialized2 = rfl::parquet::write(res.value()); + const auto serialized2 = rfl::parquet::write(res.value(), _settings); EXPECT_EQ(serialized1, serialized2); } From 46a05be3c201e117b49ee6bb6fb8820b4fddc10d Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 11:55:39 +0200 Subject: [PATCH 21/36] Moved compression into settings --- include/rfl/parquet/Compression.hpp | 12 ------------ include/rfl/parquet/Settings.hpp | 6 +++++- 2 files changed, 5 insertions(+), 13 deletions(-) delete mode 100644 include/rfl/parquet/Compression.hpp diff --git a/include/rfl/parquet/Compression.hpp b/include/rfl/parquet/Compression.hpp deleted file mode 100644 index 59dc229c..00000000 --- a/include/rfl/parquet/Compression.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef RFL_PARQUET_COMPRESSION_HPP_ -#define RFL_PARQUET_COMPRESSION_HPP_ - -#include - -namespace rfl::parquet { - -using Compression = arrow::Compression::type; - -} // namespace rfl::parquet - -#endif diff --git a/include/rfl/parquet/Settings.hpp b/include/rfl/parquet/Settings.hpp index 60bd6906..2ba65def 100644 --- a/include/rfl/parquet/Settings.hpp +++ b/include/rfl/parquet/Settings.hpp @@ -1,12 +1,16 @@ #ifndef RFL_PARQUET_SETTINGS_HPP_ #define RFL_PARQUET_SETTINGS_HPP_ +#include +#include + #include "../Field.hpp" #include "../replace.hpp" -#include "Compression.hpp" namespace rfl::parquet { +using Compression = 
arrow::Compression::type; + struct Settings { /// The size of the chunks of the parquet file. size_t chunksize = 2000; From 0441ea5c442060614b8a0ab338565079660fc7e8 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 12:53:14 +0200 Subject: [PATCH 22/36] Pass by reference, where appropriate --- include/rfl/parsing/tabular/ArrowTypes.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/rfl/parsing/tabular/ArrowTypes.hpp b/include/rfl/parsing/tabular/ArrowTypes.hpp index 5cd828ed..2c7bee83 100644 --- a/include/rfl/parsing/tabular/ArrowTypes.hpp +++ b/include/rfl/parsing/tabular/ArrowTypes.hpp @@ -301,7 +301,7 @@ struct ArrowTypes { static auto data_type() { return arrow::utf8(); } - static void add_to_builder(const T& _val, BuilderType* _builder) { + static void add_to_builder(const T _val, BuilderType* _builder) { const auto status = _builder->Append(enum_to_string(_val)); if (!status.ok()) { throw std::runtime_error(status.message()); @@ -347,7 +347,7 @@ struct ArrowTypes> { static auto data_type() { return arrow::timestamp(arrow::TimeUnit::SECOND); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { const auto status = _builder->Append(_val.to_time_t()); if (!status.ok()) { throw std::runtime_error(status.message()); @@ -375,7 +375,7 @@ struct ArrowTypes { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { ArrowTypes::add_to_builder(_val.reflection(), _builder); } @@ -404,7 +404,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { ArrowTypes::add_to_builder(*_val, _builder); } else { @@ -430,7 
+430,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { ArrowTypes::add_to_builder(*_val, _builder); } else { @@ -456,7 +456,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { if (_val) { ArrowTypes::add_to_builder(*_val, _builder); } else { @@ -482,7 +482,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { ArrowTypes::add_to_builder(*_val, _builder); } @@ -501,7 +501,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { ArrowTypes::add_to_builder(*_val, _builder); } @@ -520,7 +520,7 @@ struct ArrowTypes> { static auto data_type() { return ArrowTypes::data_type(); } - static void add_to_builder(const auto _val, BuilderType* _builder) { + static void add_to_builder(const auto& _val, BuilderType* _builder) { ArrowTypes::add_to_builder(_val.value(), _builder); } From 50b6c25a1082f41aeeab9ccf2f08a2abb7452538 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 31 Aug 2025 12:53:19 +0200 Subject: [PATCH 23/36] Added more tests --- tests/parquet/test_box.cpp | 44 +++++++++++++++++++++++++++++++ tests/parquet/test_ref.cpp | 44 +++++++++++++++++++++++++++++++ tests/parquet/test_shared_ptr.cpp | 44 +++++++++++++++++++++++++++++++ tests/parquet/test_unique_ptr.cpp | 44 +++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+) create mode 100644 tests/parquet/test_box.cpp create mode 100644 tests/parquet/test_ref.cpp create mode 100644 tests/parquet/test_shared_ptr.cpp create mode 100644 tests/parquet/test_unique_ptr.cpp diff --git a/tests/parquet/test_box.cpp b/tests/parquet/test_box.cpp new file mode 100644 index 00000000..f03fa082 --- /dev/null +++ b/tests/parquet/test_box.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_box { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Box email; +}; + +TEST(parquet, test_box) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_box("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_box("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_box("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_box diff --git a/tests/parquet/test_ref.cpp b/tests/parquet/test_ref.cpp new file mode 100644 index 00000000..47c2b3b5 --- /dev/null +++ b/tests/parquet/test_ref.cpp @@ -0,0 +1,44 @@ 
+#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_ref { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Ref email; +}; + +TEST(parquet, test_ref) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = rfl::make_ref("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = rfl::make_ref("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = rfl::make_ref("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_ref diff --git a/tests/parquet/test_shared_ptr.cpp b/tests/parquet/test_shared_ptr.cpp new file mode 100644 index 00000000..09f9c6e7 --- /dev/null +++ b/tests/parquet/test_shared_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_shared_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::shared_ptr email; +}; + +TEST(parquet, test_shared_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = std::make_shared("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = 
std::make_shared("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_shared("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_shared_ptr diff --git a/tests/parquet/test_unique_ptr.cpp b/tests/parquet/test_unique_ptr.cpp new file mode 100644 index 00000000..1df85ffa --- /dev/null +++ b/tests/parquet/test_unique_ptr.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_unique_ptr { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + std::unique_ptr email; +}; + +TEST(parquet, test_unique_ptr) { + auto people = std::vector(); + people.emplace_back( + Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = std::make_unique("bart@simpson.com")}); + people.emplace_back( + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = std::make_unique("lisa@simpson.com")}); + people.emplace_back( + Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + people.emplace_back( + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = std::make_unique("homer@simpson.com")}); + + write_and_read(people); +} +} // namespace test_unique_ptr From d74df0184b839d1826a201bcaa5efe1aabbc681a Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Sun, 31 Aug 2025 16:12:05 +0200 Subject: [PATCH 24/36] Added documentation for the parquet format --- docs/supported_formats/parquet.md | 246 ++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 docs/supported_formats/parquet.md diff --git a/docs/supported_formats/parquet.md b/docs/supported_formats/parquet.md new file mode 100644 index 00000000..88f3f5a5 --- /dev/null +++ b/docs/supported_formats/parquet.md @@ -0,0 +1,246 @@ +# Parquet + +For Parquet support, you must also include the header `<rfl/parquet.hpp>` and link to the [Apache Arrow](https://arrow.apache.org/) and [Apache Parquet](https://parquet.apache.org/) libraries. +Furthermore, when compiling reflect-cpp, you need to pass `-DREFLECTCPP_PARQUET=ON` to cmake. + +Parquet is a columnar storage format optimized for analytical workloads. Unlike most other formats supported by reflect-cpp, Parquet is designed for tabular data and has specific limitations regarding nested structures. + +## Reading and writing + +Suppose you have a struct like this: + +```cpp +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + unsigned int age; + rfl::Email email; +}; +``` + +**Important**: Parquet is a tabular format that requires collections of records. You cannot serialize individual structs - you must use containers like `std::vector`, `std::deque`, etc. 
+ +A collection of `Person` structs can be serialized to a bytes vector like this: + +```cpp +const auto people = std::vector{ + Person{.first_name = "Bart", .birthday = "1987-04-19", .age = 10, .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", .birthday = "1987-04-19", .age = 8, .email = "lisa@simpson.com"} +}; +const std::vector bytes = rfl::parquet::write(people); +``` + +You can parse bytes like this: + +```cpp +const rfl::Result> result = rfl::parquet::read>(bytes); +``` + +## Settings and compression + +Parquet supports various compression algorithms and chunk sizes. You can configure these using the `Settings` struct: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP) + .with_chunksize(1000); + +const std::vector bytes = rfl::parquet::write(people, settings); +``` + +Available compression options include: +- `SNAPPY` (default) - Fast compression/decompression +- `GZIP` - Good compression ratio +- `LZ4` - Very fast compression +- `ZSTD` - Excellent compression ratio +- `BROTLI` - Good compression for text data + +## Loading and saving + +You can also load and save to disc using a very similar syntax: + +```cpp +const rfl::Result> result = rfl::parquet::load>("/path/to/file.parquet"); + +const auto people = std::vector{...}; +rfl::parquet::save("/path/to/file.parquet", people); +``` + +With custom settings: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); +rfl::parquet::save("/path/to/file.parquet", people, settings); +``` + +## Reading from and writing into streams + +You can also read from and write into any `std::istream` and `std::ostream` respectively. 
+ +```cpp +const rfl::Result<std::vector<Person>> result = rfl::parquet::read<std::vector<Person>>(my_istream); + +const auto people = std::vector<Person>{...}; +rfl::parquet::write(people, my_ostream); +``` + +With custom settings: + +```cpp +const auto settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); +rfl::parquet::write(people, my_ostream, settings); +``` + +## Field name transformations + +Like other formats, Parquet supports field name transformations. You can use processors like `SnakeCaseToCamelCase`: + +```cpp +const auto people = std::vector<Person>{...}; +const auto result = rfl::parquet::read<std::vector<Person>, rfl::SnakeCaseToCamelCase>(bytes); +``` + +This will automatically convert field names from snake_case to camelCase during serialization and deserialization. + +## Supported processors + +The following processors are **NOT supported** and will cause compilation errors: + +- `rfl::AddTagsToVariants` - Cannot be used for tabular data +- `rfl::NoOptionals` - Cannot be used for tabular data +- `rfl::DefaultIfMissing` - Cannot be used for tabular data +- `rfl::NoExtraFields` - Cannot be used for tabular data +- `rfl::NoFieldNames` - Cannot be used for tabular data + +```cpp +// ✅ This works +const auto result = rfl::parquet::read<std::vector<Person>, rfl::SnakeCaseToCamelCase>(bytes); + +// ❌ This will cause compilation errors +const auto result = rfl::parquet::read<std::vector<Person>, rfl::AddTagsToVariants>(bytes); +const auto result = rfl::parquet::read<std::vector<Person>, rfl::NoOptionals>(bytes); +const auto result = rfl::parquet::read<std::vector<Person>, rfl::DefaultIfMissing>(bytes); +``` + +## Enums and validation + +Parquet supports enums and validated types. 
Enums are stored as strings: + +```cpp +enum class FirstName { Bart, Lisa, Maggie, Homer }; + +struct Person { + rfl::Rename<"firstName", FirstName> first_name; + rfl::Rename<"lastName", std::string> last_name; + rfl::Timestamp<"%Y-%m-%d"> birthday; + rfl::Validator<unsigned int, rfl::Minimum<0>, rfl::Maximum<130>> age; + rfl::Email email; +}; +``` + +## No variant types + +Parquet does not support variant types like `std::variant`, `rfl::Variant`, or `rfl::TaggedUnion`. These types cannot be serialized to Parquet format. + +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::variant status; // Variant - not supported + rfl::Variant type; // rfl::Variant - not supported + rfl::TaggedUnion<"type", std::string, int> category; // TaggedUnion - not supported +}; +``` + +## Limitations of tabular formats + +Parquet, like other tabular formats, has specific limitations that differ from hierarchical formats like JSON or XML: + +### No nested objects +Unlike JSON or XML, Parquet cannot directly represent nested objects within a single row. Each field must be a primitive type, enum, or a simple container of primitives. + +```cpp +// This works fine +struct Person { + std::string first_name; + std::string last_name; + unsigned int age; +}; + +// This would NOT work as expected - nested objects are not automatically flattened +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + Address address; // ❌ This will cause compilation errors +}; +``` + +### Using rfl::Flatten for nested objects + +If you need to include nested objects, you can use `rfl::Flatten` to explicitly flatten them: + +```cpp +struct Address { + std::string street; + std::string city; +}; + +struct Person { + std::string first_name; + std::string last_name; + rfl::Flatten<Address>
address; // ✅ This will flatten the Address fields +}; + +// The resulting Parquet file will have columns: first_name, last_name, street, city +``` + +### Collections requirement + +You must serialize collections, not individual objects: +```cpp +std::vector<Person> people = {...}; // ✅ Correct +Person person = {...}; // ❌ Wrong - must be in a container +``` + +### No arrays (except bytestrings) +Parquet does not support arrays of any type except for binary data (bytestrings). This includes arrays of primitive types, strings, and objects. + +```cpp +// ❌ This will NOT work +struct Person { + std::string first_name; + std::vector<std::string> hobbies; // Array of strings - not supported + std::vector<int> scores; // Array of integers - not supported + std::vector<Address>
addresses; // Array of objects - not supported +}; + +// ✅ This works +struct Person { + std::string first_name; + std::string last_name; + std::vector binary_data; // Binary data - supported as bytestring +}; +``` +### Use cases +Parquet is ideal for: +- Data warehousing and analytics +- Large datasets with repeated values +- Integration with big data tools (Spark, Hadoop, etc.) +- Simple, flat data structures with consistent types + +Parquet is less suitable for: +- Complex nested data structures +- Data with arrays or variant types +- Frequent schema changes +- Row-oriented access patterns +- Small datasets where the overhead isn't justified +- Data with complex object hierarchies + From 87f9d1f6c3eb36bdd2090c4fb9e908e44d0e3329 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 16:26:35 +0200 Subject: [PATCH 25/36] Fixed typo --- tests/parquet/test_box.cpp | 5 ++++- tests/parquet/test_ref.cpp | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/parquet/test_box.cpp b/tests/parquet/test_box.cpp index f03fa082..8fd5098a 100644 --- a/tests/parquet/test_box.cpp +++ b/tests/parquet/test_box.cpp @@ -32,7 +32,10 @@ TEST(parquet, test_box) { .age = 8, .email = rfl::make_box("lisa@simpson.com")}); people.emplace_back( - Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = rfl::make_box("maggie@simpson.com")}); people.emplace_back( Person{.first_name = "Homer", .birthday = "1987-04-19", diff --git a/tests/parquet/test_ref.cpp b/tests/parquet/test_ref.cpp index 47c2b3b5..03417a85 100644 --- a/tests/parquet/test_ref.cpp +++ b/tests/parquet/test_ref.cpp @@ -32,7 +32,10 @@ TEST(parquet, test_ref) { .age = 8, .email = rfl::make_ref("lisa@simpson.com")}); people.emplace_back( - Person{.first_name = "Maggie", .birthday = "1987-04-19", .age = 0}); + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = 
rfl::make_ref("maggie@simpson.com")}); people.emplace_back( Person{.first_name = "Homer", .birthday = "1987-04-19", From 854dab7694e2239bc15a08ec37993c9893ca6971 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 16:26:46 +0200 Subject: [PATCH 26/36] Added test for rfl::Flatten --- tests/parquet/test_flatten.cpp | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 tests/parquet/test_flatten.cpp diff --git a/tests/parquet/test_flatten.cpp b/tests/parquet/test_flatten.cpp new file mode 100644 index 00000000..4c7c12e1 --- /dev/null +++ b/tests/parquet/test_flatten.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include + +#include "write_and_read.hpp" + +namespace test_flatten { + +using Age = rfl::Validator, rfl::Maximum<130>>; + +struct Address { + std::string street; + std::string city; +}; + +struct Person { + rfl::Rename<"firstName", std::string> first_name; + rfl::Rename<"lastName", std::string> last_name = "Simpson"; + std::string town = "Springfield"; + rfl::Timestamp<"%Y-%m-%d"> birthday; + Age age; + rfl::Email email; + rfl::Flatten
address; +}; + +TEST(parquet, test_flatten) { + const auto address = + Address{.street = "Evergreen Terrace", .city = "Springfield"}; + + const auto people = std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com", + .address = address}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com", + .address = address}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com", + .address = address}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com", + .address = address}}); + + write_and_read(people); +} +} // namespace test_flatten From b875ccc84ef285140daa2b5a8cb1388484bdd9ec Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 16:39:53 +0200 Subject: [PATCH 27/36] Document all of the compression formats --- docs/supported_formats/parquet.md | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/docs/supported_formats/parquet.md b/docs/supported_formats/parquet.md index 88f3f5a5..05ae8e66 100644 --- a/docs/supported_formats/parquet.md +++ b/docs/supported_formats/parquet.md @@ -50,11 +50,32 @@ const std::vector bytes = rfl::parquet::write(people, settings); ``` Available compression options include: -- `SNAPPY` (default) - Fast compression/decompression -- `GZIP` - Good compression ratio -- `LZ4` - Very fast compression -- `ZSTD` - Excellent compression ratio -- `BROTLI` - Good compression for text data + +- `UNCOMPRESSED` - No compression, fastest read/write but largest file size +- `SNAPPY` (default) - Fast compression/decompression, good balance of speed and size +- `GZIP` - Good compression ratio, slower than Snappy but better compression +- `BROTLI` - Good compression for text data, optimized for web content +- `ZSTD` - Excellent compression ratio, modern algorithm with good speed +- `LZ4` - Very 
fast compression/decompression, lower compression ratio +- `LZ4_FRAME` - LZ4 with frame format, better compatibility +- `LZO` - Fast compression, older algorithm +- `BZ2` - High compression ratio, slower compression/decompression +- `LZ4_HADOOP` - LZ4 optimized for Hadoop ecosystem + +```cpp +// Examples of different compression settings +const auto snappy_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::SNAPPY); + +const auto gzip_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::GZIP); + +const auto zstd_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::ZSTD); + +const auto uncompressed_settings = rfl::parquet::Settings{} + .with_compression(rfl::parquet::Compression::UNCOMPRESSED); +``` ## Loading and saving From c4c216c34ecaa703ee0feee1e5c5d4e1335a5ce8 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 22:40:32 +0200 Subject: [PATCH 28/36] Updated parquet to 21.0.0 --- vcpkg.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcpkg.json b/vcpkg.json index 12c707d3..635d4e09 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -115,7 +115,7 @@ "dependencies": [ { "name": "arrow", - "version>=": "19.0.1", + "version>=": "21.0.0", "features": ["parquet"] } ] From ae5e60f7e9adab7af2a9c471f87b16c08ab2b0db Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Sun, 31 Aug 2025 22:40:39 +0200 Subject: [PATCH 29/36] Updated the README --- README.md | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 68b09aeb..c32de0b7 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,6 @@ reflect-cpp and sqlgen fill important gaps in C++ development. They reduce boile
- ## Table of Contents ### On this page @@ -38,6 +37,7 @@ reflect-cpp and sqlgen fill important gaps in C++ development. They reduce boile - [Feature Overview](#feature-overview) - [Simple Example](#simple-example) - [More Comprehensive Example](#more-comprehensive-example) + - [Tabular data](#tabular-data) - [Error messages](#error-messages) - [JSON schema](#json-schema) - [Enums](#enums) @@ -70,6 +70,7 @@ The following table lists the serialization formats currently supported by refle | CBOR | [jsoncons](https://github.com/danielaparker/jsoncons)| >= 0.176.0 | BSL 1.0 | JSON-like binary format | | flexbuffers | [flatbuffers](https://github.com/google/flatbuffers) | >= 23.5.26 | Apache 2.0 | Schema-less version of flatbuffers, binary format | | msgpack | [msgpack-c](https://github.com/msgpack/msgpack-c) | >= 6.0.0 | BSL 1.0 | JSON-like binary format | +| parquet | [Apache Arrow](https://arrow.apache.org/) | >= 21.0.0 | Apache 2.0 | Tabular binary format | | TOML | [toml++](https://github.com/marzer/tomlplusplus) | >= 3.4.0 | MIT | Textual format with an emphasis on readability | | UBJSON | [jsoncons](https://github.com/danielaparker/jsoncons)| >= 0.176.0 | BSL 1.0 | JSON-like binary format | | XML | [pugixml](https://github.com/zeux/pugixml) | >= 1.14 | MIT | Textual format used in many legacy projects | @@ -145,7 +146,7 @@ age: 45 ``` This will work for just about any example in the entire documentation -and any supported format, except where explicitly noted otherwise: +and any of the following formats, except where explicitly noted otherwise: ```cpp rfl::avro::write(homer); @@ -242,6 +243,34 @@ std::cout << "Hello, my name is " << homer2.first_name() << " " << homer2.last_name() << "." 
<< std::endl; ``` +### Tabular data + +reflect-cpp also supports tabular data formats, like Parquet: + +```cpp +#include + +const auto people = + std::vector({Person{.first_name = "Bart", + .birthday = "1987-04-19", + .age = 10, + .email = "bart@simpson.com"}, + Person{.first_name = "Lisa", + .birthday = "1987-04-19", + .age = 8, + .email = "lisa@simpson.com"}, + Person{.first_name = "Maggie", + .birthday = "1987-04-19", + .age = 0, + .email = "maggie@simpson.com"}, + Person{.first_name = "Homer", + .birthday = "1987-04-19", + .age = 45, + .email = "homer@simpson.com"}}); + +const auto bytestring = rfl::parquet::write(people); +``` + ### Error messages reflect-cpp returns clear and comprehensive error messages: From 6f73ac558826c5c35c6461d095e53eb79ec78bfc Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Mon, 1 Sep 2025 22:12:16 +0200 Subject: [PATCH 30/36] Use timegm for the Timestamp --- include/rfl/Timestamp.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index 4ba33958..fde8ee6f 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -76,10 +76,10 @@ class Timestamp { /// Trivial (const) accessor to the underlying time stamp. const std::tm& tm() const { return tm_; } - /// Returns time_t by calling std::mktime under-the-hood. + /// Returns time_t by calling timegm under-the-hood. time_t to_time_t() const { auto tm = tm_; - return std::mktime(&tm) - timezone; + return static_cast(timegm(&tm) - tm_.tm_gmtoff); } private: From e082549d494a363c54cbbeed506863971f335771 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Mon, 1 Sep 2025 22:21:41 +0200 Subject: [PATCH 31/36] Run the parquet tests && install bison --- .github/workflows/linux.yaml | 2 +- .github/workflows/macos.yaml | 4 ++-- .github/workflows/windows.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml index 83df6c4a..0125cca6 100644 --- a/.github/workflows/linux.yaml +++ b/.github/workflows/linux.yaml @@ -10,7 +10,7 @@ jobs: strategy: fail-fast: false matrix: - format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] compiler: [llvm, gcc] compiler-version: [11, 12, 13, 14, 16, 17, 18] cxx: [20, 23] diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 60f3f0aa..46b5cb80 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -11,7 +11,7 @@ jobs: fail-fast: false matrix: os: ["macos-latest", "macos-13"] - format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] name: "${{ matrix.os }} (${{ matrix.format }})" runs-on: ${{ matrix.os }} steps: @@ -29,7 +29,7 @@ jobs: - name: Run vcpkg uses: lukka/run-vcpkg@v11 - name: Install ninja - run: brew install ninja + run: brew install bison ninja if: matrix.os == 'macos-latest' - name: Compile env: diff --git a/.github/workflows/windows.yaml b/.github/workflows/windows.yaml index 1727de4e..2445f3ce 100644 --- a/.github/workflows/windows.yaml +++ b/.github/workflows/windows.yaml @@ -10,7 +10,7 @@ jobs: strategy: fail-fast: false matrix: - format: ["JSON", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "XML", "TOML", "UBJSON", "YAML", "benchmarks"] + 
format: ["JSON", "AVRO", "CAPNPROTO", "CBOR", "FLEXBUFFERS", "MSGPACK", "PARQUET", "TOML", "UBJSON", "XML", "YAML", "benchmarks"] name: "windows-msvc (${{ matrix.format }})" runs-on: windows-latest steps: From ff53c75462226c2ffec92ce3c9065a233f3808f5 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Mon, 1 Sep 2025 22:58:28 +0200 Subject: [PATCH 32/36] Install bison on all macOS version --- .github/workflows/macos.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml index 46b5cb80..2b34b06c 100644 --- a/.github/workflows/macos.yaml +++ b/.github/workflows/macos.yaml @@ -28,8 +28,10 @@ jobs: core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - name: Run vcpkg uses: lukka/run-vcpkg@v11 + - name: Install bison + run: brew install bison - name: Install ninja - run: brew install bison ninja + run: brew install ninja if: matrix.os == 'macos-latest' - name: Compile env: From de4e63c1c069081166e900cb7b6a25bf42c7cb7e Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Mon, 1 Sep 2025 23:36:08 +0200 Subject: [PATCH 33/36] Use _mkgmtime on Windows --- include/rfl/Timestamp.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index fde8ee6f..f9749578 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -76,10 +76,14 @@ class Timestamp { /// Trivial (const) accessor to the underlying time stamp. const std::tm& tm() const { return tm_; } - /// Returns time_t by calling timegm under-the-hood. + /// Returns a UTC time represented by a time_t type. time_t to_time_t() const { auto tm = tm_; +#if defined(_MSC_VER) || defined(__MINGW32__) + return _mkgmtime(&tm); +#else return static_cast(timegm(&tm) - tm_.tm_gmtoff); +#endif } private: From 488ed78ab4cfd806a7a684fc676c10e26a9b3938 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Tue, 2 Sep 2025 17:42:36 +0200 Subject: [PATCH 34/36] Removed unneeded source files --- CMakeLists.txt | 3 -- src/reflectcpp_parquet.cpp | 31 --------------- src/rfl/parquet/Writer.cpp | 78 -------------------------------------- 3 files changed, 112 deletions(-) delete mode 100644 src/reflectcpp_parquet.cpp delete mode 100644 src/rfl/parquet/Writer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ff3a99b5..e811cb42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,9 +267,6 @@ if (REFLECTCPP_MSGPACK) endif () if (REFLECTCPP_PARQUET) - list(APPEND REFLECT_CPP_SOURCES - src/reflectcpp_parquet.cpp - ) if (NOT TARGET Arrow) find_package(Arrow CONFIG REQUIRED) endif() diff --git a/src/reflectcpp_parquet.cpp b/src/reflectcpp_parquet.cpp deleted file mode 100644 index 975c55b8..00000000 --- a/src/reflectcpp_parquet.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - -MIT License - -Copyright (c) 2023-2024 Code17 GmbH - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -*/ - -// This file include all other source files, so that the user of the library -// don't need to add multiple source files into their build. -// Also, this speeds up compile time, compared to multiple separate .cpp files -// compilation. - diff --git a/src/rfl/parquet/Writer.cpp b/src/rfl/parquet/Writer.cpp deleted file mode 100644 index f711a0f4..00000000 --- a/src/rfl/parquet/Writer.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "rfl/msgpack/Writer.hpp" - -namespace rfl::msgpack { - -Writer::Writer(msgpack_packer* _pk) : pk_(_pk) {} - -Writer::~Writer() = default; - -Writer::OutputArrayType Writer::array_as_root( - const size_t _size) const noexcept { - return new_array(_size); -} - -Writer::OutputObjectType Writer::object_as_root( - const size_t _size) const noexcept { - return new_object(_size); -} - -Writer::OutputVarType Writer::null_as_root() const noexcept { - msgpack_pack_nil(pk_); - return OutputVarType{}; -} - -Writer::OutputArrayType Writer::add_array_to_array( - const size_t _size, OutputArrayType* _parent) const noexcept { - return new_array(_size); -} - -Writer::OutputArrayType Writer::add_array_to_object( - const std::string_view& _name, const size_t _size, - OutputObjectType* _parent) const noexcept { - msgpack_pack_str(pk_, _name.size()); - msgpack_pack_str_body(pk_, _name.data(), _name.size()); - return new_array(_size); -} - -Writer::OutputObjectType Writer::add_object_to_array( - const size_t _size, OutputArrayType* _parent) const noexcept { - return new_object(_size); -} - -Writer::OutputObjectType Writer::add_object_to_object( - const std::string_view& _name, const size_t _size, - OutputObjectType* _parent) const noexcept { - msgpack_pack_str(pk_, _name.size()); - msgpack_pack_str_body(pk_, _name.data(), _name.size()); - return new_object(_size); -} - -Writer::OutputVarType Writer::add_null_to_array( - OutputArrayType* _parent) const noexcept { - msgpack_pack_nil(pk_); - return OutputVarType{}; -} - -Writer::OutputVarType 
Writer::add_null_to_object( - const std::string_view& _name, OutputObjectType* _parent) const noexcept { - msgpack_pack_str(pk_, _name.size()); - msgpack_pack_str_body(pk_, _name.data(), _name.size()); - msgpack_pack_nil(pk_); - return OutputVarType{}; -} - -void Writer::end_array(OutputArrayType* _arr) const noexcept {} - -void Writer::end_object(OutputObjectType* _obj) const noexcept {} - -Writer::OutputArrayType Writer::new_array(const size_t _size) const noexcept { - msgpack_pack_array(pk_, _size); - return OutputArrayType{}; -} - -Writer::OutputObjectType Writer::new_object(const size_t _size) const noexcept { - msgpack_pack_map(pk_, _size); - return OutputObjectType{}; -} - -} // namespace rfl::msgpack From 5c0068a65d1b42256c5a89146b7699c1be2afa17 Mon Sep 17 00:00:00 2001 From: "Dr. Patrick Urbanke" Date: Tue, 2 Sep 2025 17:42:46 +0200 Subject: [PATCH 35/36] Use thread-safe versions --- include/rfl/Timestamp.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/rfl/Timestamp.hpp b/include/rfl/Timestamp.hpp index f9749578..9909c0f0 100644 --- a/include/rfl/Timestamp.hpp +++ b/include/rfl/Timestamp.hpp @@ -39,7 +39,11 @@ class Timestamp { Timestamp(const time_t _t) : tm_(std::tm{}) { auto t = _t; - tm_ = *std::gmtime(&t); +#if defined(_MSC_VER) || defined(__MINGW32__) + gmtime_s(&tm_, &t); +#else + gmtime_r(&t, &tm_); +#endif } ~Timestamp() = default; From 44b1cb4039f1e4c02993770a58a92196e3c51ab0 Mon Sep 17 00:00:00 2001 From: "Dr. 
Patrick Urbanke" Date: Tue, 2 Sep 2025 17:42:51 +0200 Subject: [PATCH 36/36] Fixed typo --- docs/supported_formats/parquet.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/supported_formats/parquet.md b/docs/supported_formats/parquet.md index 05ae8e66..f259b3ce 100644 --- a/docs/supported_formats/parquet.md +++ b/docs/supported_formats/parquet.md @@ -1,5 +1,3 @@ -# Parquet - For Parquet support, you must also include the header `<rfl/parquet.hpp>` and link to the [Apache Arrow](https://arrow.apache.org/) and [Apache Parquet](https://parquet.apache.org/) libraries. Furthermore, when compiling reflect-cpp, you need to pass `-DREFLECTCPP_PARQUET=ON` to cmake. @@ -79,7 +77,7 @@ const auto uncompressed_settings = rfl::parquet::Settings{} ## Loading and saving -You can also load and save to disk using a very similar syntax: +You can also load and save to disk using a very similar syntax: ```cpp const rfl::Result<std::vector<Person>> result = rfl::parquet::load<std::vector<Person>>("/path/to/file.parquet");