From 91cea4970647db6138e7ed5c2ab271743c89d909 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Wed, 30 Jul 2025 16:53:27 +0200 Subject: [PATCH 1/6] Add linter analysis workflow --- .github/workflows/analysis.yml | 66 ++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 .github/workflows/analysis.yml diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml new file mode 100644 index 0000000..784813a --- /dev/null +++ b/.github/workflows/analysis.yml @@ -0,0 +1,66 @@ +name: cpp-linter-analysis + +on: [push, pull_request] + +defaults: + run: + shell: bash -e -l {0} + +jobs: + build: + runs-on: ubuntu-24.04 + + steps: + # Set up Clang and LLVM + - name: Install LLVM and Clang + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 20 + sudo apt-get install -y clang-tools-20 + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-20 200 + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-20 200 + sudo update-alternatives --install /usr/bin/clang-scan-deps clang-scan-deps /usr/bin/clang-scan-deps-20 200 + sudo update-alternatives --set clang /usr/bin/clang-20 + sudo update-alternatives --set clang++ /usr/bin/clang++-20 + sudo update-alternatives --set clang-scan-deps /usr/bin/clang-scan-deps-20 + + - name: Checkout repository + uses: actions/checkout@v4 + + # Set conda environment using setup-micromamba + - name: Set conda environment + uses: mamba-org/setup-micromamba@main + with: + environment-name: myenv + environment-file: environment-dev.yml + init-shell: bash + cache-downloads: true + + # Run CMake configuration + - name: Configure using CMake + run: | + export CC=clang; export CXX=clang++ + cmake -G Ninja \ + -Bbuild \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX \ + -DBUILD_TESTS=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + #-DFETCH_DEPENDENCIES_WITH_CMAKE=MISSING + + # Run Clang-Tidy and Clang-Format Analysis + - name: Run C++ analysis + uses: cpp-linter/cpp-linter-action@v2 + id: linter + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + version: 20 + files-changed-only: false # check all files + database: 'build' + style: 'file' # Use .clang-format config file + tidy-checks: '' # Use .clang-tidy config file + step-summary: true + ignore: 'build' + extra-args: '-std=c++20' From 7a30661750a81c734db02be758a7dfc6b536dfcb Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Wed, 30 Jul 2025 17:33:18 +0200 Subject: [PATCH 2/6] Enable failure --- .github/workflows/analysis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml index 784813a..8ccbc37 100644 --- a/.github/workflows/analysis.yml +++ b/.github/workflows/analysis.yml @@ -61,6 +61,9 @@ jobs: database: 'build' style: 'file' # Use .clang-format config file tidy-checks: '' # Use .clang-tidy config file - step-summary: true + step-summary: false # Disable summary output for more detailed logs ignore: 'build' extra-args: '-std=c++20' + - name: Fail fast?! + if: steps.linter.outputs.checks-failed > 0 + run: exit 1 From 75fb73eaec065dfabc7d156f4130dfdf66ee9316 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 31 Jul 2025 10:46:20 +0200 Subject: [PATCH 3/6] Add thread-comments --- .github/workflows/analysis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml index 8ccbc37..13335cb 100644 --- a/.github/workflows/analysis.yml +++ b/.github/workflows/analysis.yml @@ -61,9 +61,10 @@ jobs: database: 'build' style: 'file' # Use .clang-format config file tidy-checks: '' # Use .clang-tidy config file - step-summary: false # Disable summary output for more detailed logs + step-summary: true ignore: 'build' extra-args: '-std=c++20' - - name: Fail fast?! + thread-comments: true + - name: Fail fast if: steps.linter.outputs.checks-failed > 0 run: exit 1 From 4a56f1e5860333f5126b4737619129244c89266b Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 31 Jul 2025 14:40:22 +0200 Subject: [PATCH 4/6] Fix some linter warnings --- .github/workflows/analysis.yml | 1 - src/serialize.cpp | 20 ++++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/workflows/analysis.yml b/.github/workflows/analysis.yml index 13335cb..7e78ff9 100644 --- a/.github/workflows/analysis.yml +++ b/.github/workflows/analysis.yml @@ -64,7 +64,6 @@ jobs: step-summary: true ignore: 'build' extra-args: '-std=c++20' - thread-comments: true - name: Fail fast if: steps.linter.outputs.checks-failed > 0 run: exit 1 diff --git a/src/serialize.cpp b/src/serialize.cpp index be5e84a..48b3772 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -12,10 +12,12 @@ namespace { + constexpr int64_t arrow_alignment = 8; + // Aligns a value to the next multiple of 8, as required by the Arrow IPC format for message bodies. int64_t align_to_8(int64_t n) { - return (n + 7) & -8; + return (n + arrow_alignment - 1) & -arrow_alignment; } // TODO Complete this with all possible formats? @@ -137,7 +139,7 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // Copy the metadata into the buffer, after the 4-byte length prefix memcpy(final_buffer.data() + sizeof(uint32_t), schema_builder.GetBufferPointer(), schema_len); // Write the 4-byte metadata length at the beginning of the message - *(reinterpret_cast(final_buffer.data())) = schema_len; + memcpy(final_buffer.data(), &schema_len, sizeof(schema_len)); } // II - Serialize the RecordBatch message @@ -148,11 +150,11 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // arrow_arr.buffers[0] is the validity bitmap // arrow_arr.buffers[1] is the data buffer - const uint8_t* validity_bitmap = reinterpret_cast(arrow_arr.buffers[0]); - const uint8_t* data_buffer = reinterpret_cast(arrow_arr.buffers[1]); + const uint8_t* validity_bitmap = static_cast(arrow_arr.buffers[0]); + const uint8_t* data_buffer = static_cast(arrow_arr.buffers[1]); // Calculate the size of the validity and data buffers - int64_t validity_size = (arrow_arr.length + 7) / 8; + int64_t validity_size = (arrow_arr.length + arrow_alignment - 1) / arrow_alignment; int64_t data_size = arrow_arr.length * sizeof(T); int64_t body_len = validity_size + data_size; // The total size of the message body @@ -190,7 +192,7 @@ std::vector serialize_primitive_array(const sparrow::primitive_array uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will start // Write the 4-byte metadata length for the RecordBatch message - *(reinterpret_cast(dst)) = batch_meta_len; + memcpy(dst, &batch_meta_len, sizeof(batch_meta_len)); dst += sizeof(uint32_t); // Copy the RecordBatch metadata into the buffer memcpy(dst, batch_builder.GetBufferPointer(), batch_meta_len); @@ -228,7 +230,8 @@ sparrow::primitive_array deserialize_primitive_array(const std::vector(buf_ptr + current_offset)); + uint32_t schema_meta_len; + memcpy(&schema_meta_len, buf_ptr + current_offset, sizeof(schema_meta_len)); current_offset += sizeof(uint32_t); auto schema_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); if (schema_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::Schema) @@ -245,7 +248,8 @@ sparrow::primitive_array deserialize_primitive_array(const std::vector(buf_ptr + current_offset)); + uint32_t batch_meta_len; + memcpy(&batch_meta_len, buf_ptr + current_offset, sizeof(batch_meta_len)); current_offset += sizeof(uint32_t); auto batch_message = org::apache::arrow::flatbuf::GetMessage(buf_ptr + current_offset); if (batch_message->header_type() != org::apache::arrow::flatbuf::MessageHeader::RecordBatch) From 97a7beec0c7a6b234d8e642bb0baaf42e96744ef Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Thu, 31 Jul 2025 17:18:53 +0200 Subject: [PATCH 5/6] Add more improvements --- src/serialize.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/serialize.cpp b/src/serialize.cpp index 48b3772..c26ecf5 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -150,8 +150,8 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // arrow_arr.buffers[0] is the validity bitmap // arrow_arr.buffers[1] is the data buffer - const uint8_t* validity_bitmap = static_cast(arrow_arr.buffers[0]); - const uint8_t* data_buffer = static_cast(arrow_arr.buffers[1]); + const auto validity_bitmap = static_cast(arrow_arr.buffers[0]); + const auto data_buffer = static_cast(arrow_arr.buffers[1]); // Calculate the size of the validity and data buffers int64_t validity_size = (arrow_arr.length + arrow_alignment - 1) / arrow_alignment; @@ -183,10 +183,10 @@ std::vector serialize_primitive_array(const sparrow::primitive_array batch_builder.Finish(batch_message_offset); // III - Append the RecordBatch message to the final buffer - uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata - int64_t aligned_batch_meta_len = align_to_8(batch_meta_len); // Calculate the padded length + const uint32_t batch_meta_len = batch_builder.GetSize(); // Get the size of the batch metadata + const int64_t aligned_batch_meta_len = align_to_8(batch_meta_len); // Calculate the padded length - size_t current_size = final_buffer.size(); // Get the current size (which is the end of the Schema message) + const size_t current_size = final_buffer.size(); // Get the current size (which is the end of the Schema message) // Resize the buffer to append the new message final_buffer.resize(current_size + sizeof(uint32_t) + aligned_batch_meta_len + body_len); uint8_t* dst = final_buffer.data() + current_size; // Get a pointer to where the new message will start @@ -197,7 +197,15 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // Copy the RecordBatch metadata into the buffer memcpy(dst, batch_builder.GetBufferPointer(), batch_meta_len); // Add padding to align the body to an 8-byte boundary - memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); + if (aligned_batch_meta_len >= batch_meta_len) + { + memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); + } + else + { + throw std::runtime_error("aligned_batch_meta_len should be greater than batch_meta_len"); + } + dst += aligned_batch_meta_len; // Copy the actual data buffers (the message body) into the buffer if (validity_bitmap) @@ -207,7 +215,8 @@ std::vector serialize_primitive_array(const sparrow::primitive_array else { // If validity_bitmap is null, it means there are no nulls - memset(dst, 0xFF, validity_size); + constexpr uint8_t no_nulls_bitmap = 0xFF; + memset(dst, no_nulls_bitmap, validity_size); } dst += validity_size; if (data_buffer) @@ -230,7 +239,7 @@ sparrow::primitive_array deserialize_primitive_array(const std::vector deserialize_primitive_array(const std::vector deserialize_primitive_array(const std::vectorGet(0)->length(); int64_t data_len = buffers_meta->Get(1)->length(); - uint8_t* validity_buffer_copy = new uint8_t[validity_len]; + auto validity_buffer_copy = new uint8_t[validity_len]; memcpy(validity_buffer_copy, body_ptr + buffers_meta->Get(0)->offset(), validity_len); - uint8_t* data_buffer_copy = new uint8_t[data_len]; + auto data_buffer_copy = new uint8_t[data_len]; memcpy(data_buffer_copy, body_ptr + buffers_meta->Get(1)->offset(), data_len); // Get name From e9d43d36e0c1b35eabc1e76becff26d36fc4f374 Mon Sep 17 00:00:00 2001 From: Hind Montassif Date: Fri, 1 Aug 2025 16:11:01 +0200 Subject: [PATCH 6/6] Add static cast --- src/serialize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/serialize.cpp b/src/serialize.cpp index c26ecf5..1da7ac3 100644 --- a/src/serialize.cpp +++ b/src/serialize.cpp @@ -197,7 +197,7 @@ std::vector serialize_primitive_array(const sparrow::primitive_array // Copy the RecordBatch metadata into the buffer memcpy(dst, batch_builder.GetBufferPointer(), batch_meta_len); // Add padding to align the body to an 8-byte boundary - if (aligned_batch_meta_len >= batch_meta_len) + if (static_cast(aligned_batch_meta_len) >= batch_meta_len) { memset(dst + batch_meta_len, 0, aligned_batch_meta_len - batch_meta_len); }