Skip to content

[C++][Acero] materializeColumn for boolean type overflow #48072

@waruto210

Description

@waruto210

Describe the bug, including details regarding any error messages, version, and platform.

branch: main
commit sha: fcfb33d0b6f7d368bcd00bb661d35c71885c5140

/// file: cpp/src/arrow/acero/unmaterialized_table_internal.h:175

  template <class Type, class Builder = typename TypeTraits<Type>::BuilderType>
  enable_if_boolean<Type, Status> static BuilderAppend(
      Builder& builder, const std::shared_ptr<ArrayData>& source, uint64_t row) {
    if (source->IsNull(row)) {
      builder.UnsafeAppendNull();
      return Status::OK();
    }
    builder.UnsafeAppend(bit_util::GetBit(source->template GetValues<uint8_t>(1), row));
    return Status::OK();
  }

/// cpp/src/arrow/array/data.h:271
  template <typename T>
  inline const T* GetValues(int i) const {
    return GetValues<T>(i, offset);
  }

Here, uint8_t type is used for GetValues, and the underlying buffer applies a logical offset. However, since the boolean type is actually stored bitwise, this may lead to a segmentation fault or incorrect sorting results.
I added the following unit test, which can verify that the result of sortd_merge is incorrect.

TEST(SortedMergeNode, TestSortedMergeTwoInputsWithBool) {
  const int64_t row_count = (16 << 10);  // 16k rows per input

  // Create schema with int column A and bool column B
  auto test_schema = arrow::schema(
      {arrow::field("col_a", arrow::int32()), arrow::field("col_b", arrow::boolean())});

  // Helper lambda to create table with specific pattern
  auto create_test_source = [&](int64_t cnt, int offset) -> arrow::Result<Declaration> {
    // Create column A (int) - values from offset to offset+cnt-1
    arrow::Int32Builder col_a_builder;
    std::vector<int32_t> col_a_values;
    col_a_values.reserve(cnt);
    for (int64_t i = 0; i < cnt; ++i) {
      col_a_values.push_back(static_cast<int32_t>(offset + i));
    }
    ARROW_RETURN_NOT_OK(col_a_builder.AppendValues(col_a_values));
    std::shared_ptr<arrow::Array> col_a_arr;
    ARROW_RETURN_NOT_OK(col_a_builder.Finish(&col_a_arr));

    // Create column B (bool) - pattern: true if col_a % 5 == 0, false otherwise
    arrow::BooleanBuilder col_b_builder;
    for (int64_t i = 0; i < cnt; ++i) {
      int32_t a_value = offset + i;
      bool b_value = (a_value % 5 == 0);
      ARROW_RETURN_NOT_OK(col_b_builder.Append(b_value));
    }
    std::shared_ptr<arrow::Array> col_b_arr;
    ARROW_RETURN_NOT_OK(col_b_builder.Finish(&col_b_arr));

    auto table = arrow::Table::Make(test_schema, {col_a_arr, col_b_arr});
    auto table_source =
        Declaration("table_source", TableSourceNodeOptions(table, row_count / 16));
    return table_source;
  };

  ASSERT_OK_AND_ASSIGN(auto source1, create_test_source(row_count, 0));
  ASSERT_OK_AND_ASSIGN(auto source2, create_test_source(row_count, 8192));

  // Create sorted merge by column A
  auto ops = OrderByNodeOptions(compute::Ordering({compute::SortKey("col_a")}));
  Declaration sorted_merge{"sorted_merge", {source1, source2}, ops};

  // Execute plan and collect result
  ASSERT_OK_AND_ASSIGN(auto result_table,
                       arrow::acero::DeclarationToTable(sorted_merge, false));

  ASSERT_TRUE(result_table != nullptr);

  // Verify results
  auto col_a = result_table->GetColumnByName("col_a");
  auto col_b = result_table->GetColumnByName("col_b");
  ASSERT_TRUE(col_a != nullptr);
  ASSERT_TRUE(col_b != nullptr);

  // Verify sorting and bool values
  int32_t last_a_value = std::numeric_limits<int32_t>::min();
  int64_t total_rows_checked = 0;

  for (int i = 0; i < col_a->num_chunks(); i++) {
    auto a_chunk = std::static_pointer_cast<arrow::Int32Array>(col_a->chunk(i));
    auto b_chunk = std::static_pointer_cast<arrow::BooleanArray>(col_b->chunk(i));

    ASSERT_EQ(a_chunk->length(), b_chunk->length())
        << "Column A and B must have same length in chunk " << i;

    for (int64_t j = 0; j < a_chunk->length(); j++) {
      ASSERT_FALSE(a_chunk->IsNull(j)) << "Column A should not have null values";
      ASSERT_FALSE(b_chunk->IsNull(j)) << "Column B should not have null values";

      int32_t a_value = a_chunk->Value(j);
      bool b_value = b_chunk->Value(j);

      // Verify sorting by column A
      ASSERT_GE(a_value, last_a_value)
          << "Values not sorted at chunk " << i << ", row " << j
          << ": current=" << a_value << ", last=" << last_a_value;
      last_a_value = a_value;

      // Verify bool value correctness: should be true if a_value % 3 == 0
      bool expected_b_value = (a_value % 5 == 0);
      ASSERT_EQ(b_value, expected_b_value)
          << "Bool value incorrect at chunk " << i << ", row " << j
          << ": col_a=" << a_value << ", col_b=" << b_value
          << ", expected=" << expected_b_value;
      total_rows_checked++;
    }
  }

  ASSERT_EQ(last_a_value, 24575);

  ASSERT_EQ(total_rows_checked, row_count * 2)
      << "Expected " << row_count << " unique rows after merge";
}

Component(s)

C++

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions