-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Open
Labels
Description
Describe the bug, including details regarding any error messages, version, and platform.
branch: main
commit sha: fcfb33d0b6f7d368bcd00bb661d35c71885c5140
/// file: cpp/src/arrow/acero/unmaterialized_table_internal.h:175
template <class Type, class Builder = typename TypeTraits<Type>::BuilderType>
enable_if_boolean<Type, Status> static BuilderAppend(
Builder& builder, const std::shared_ptr<ArrayData>& source, uint64_t row) {
if (source->IsNull(row)) {
builder.UnsafeAppendNull();
return Status::OK();
}
builder.UnsafeAppend(bit_util::GetBit(source->template GetValues<uint8_t>(1), row));
return Status::OK();
}
/// cpp/src/arrow/array/data.h:271
template <typename T>
inline const T* GetValues(int i) const {
return GetValues<T>(i, offset);
}
Here, uint8_t type is used for GetValues, and the underlying buffer applies a logical offset. However, since the boolean type is actually stored bitwise, this may lead to a segmentation fault or incorrect sorting results.
I added the following unit test, which can verify that the result of sortd_merge is incorrect.
TEST(SortedMergeNode, TestSortedMergeTwoInputsWithBool) {
const int64_t row_count = (16 << 10); // 16k rows per input
// Create schema with int column A and bool column B
auto test_schema = arrow::schema(
{arrow::field("col_a", arrow::int32()), arrow::field("col_b", arrow::boolean())});
// Helper lambda to create table with specific pattern
auto create_test_source = [&](int64_t cnt, int offset) -> arrow::Result<Declaration> {
// Create column A (int) - values from offset to offset+cnt-1
arrow::Int32Builder col_a_builder;
std::vector<int32_t> col_a_values;
col_a_values.reserve(cnt);
for (int64_t i = 0; i < cnt; ++i) {
col_a_values.push_back(static_cast<int32_t>(offset + i));
}
ARROW_RETURN_NOT_OK(col_a_builder.AppendValues(col_a_values));
std::shared_ptr<arrow::Array> col_a_arr;
ARROW_RETURN_NOT_OK(col_a_builder.Finish(&col_a_arr));
// Create column B (bool) - pattern: true if col_a % 5 == 0, false otherwise
arrow::BooleanBuilder col_b_builder;
for (int64_t i = 0; i < cnt; ++i) {
int32_t a_value = offset + i;
bool b_value = (a_value % 5 == 0);
ARROW_RETURN_NOT_OK(col_b_builder.Append(b_value));
}
std::shared_ptr<arrow::Array> col_b_arr;
ARROW_RETURN_NOT_OK(col_b_builder.Finish(&col_b_arr));
auto table = arrow::Table::Make(test_schema, {col_a_arr, col_b_arr});
auto table_source =
Declaration("table_source", TableSourceNodeOptions(table, row_count / 16));
return table_source;
};
ASSERT_OK_AND_ASSIGN(auto source1, create_test_source(row_count, 0));
ASSERT_OK_AND_ASSIGN(auto source2, create_test_source(row_count, 8192));
// Create sorted merge by column A
auto ops = OrderByNodeOptions(compute::Ordering({compute::SortKey("col_a")}));
Declaration sorted_merge{"sorted_merge", {source1, source2}, ops};
// Execute plan and collect result
ASSERT_OK_AND_ASSIGN(auto result_table,
arrow::acero::DeclarationToTable(sorted_merge, false));
ASSERT_TRUE(result_table != nullptr);
// Verify results
auto col_a = result_table->GetColumnByName("col_a");
auto col_b = result_table->GetColumnByName("col_b");
ASSERT_TRUE(col_a != nullptr);
ASSERT_TRUE(col_b != nullptr);
// Verify sorting and bool values
int32_t last_a_value = std::numeric_limits<int32_t>::min();
int64_t total_rows_checked = 0;
for (int i = 0; i < col_a->num_chunks(); i++) {
auto a_chunk = std::static_pointer_cast<arrow::Int32Array>(col_a->chunk(i));
auto b_chunk = std::static_pointer_cast<arrow::BooleanArray>(col_b->chunk(i));
ASSERT_EQ(a_chunk->length(), b_chunk->length())
<< "Column A and B must have same length in chunk " << i;
for (int64_t j = 0; j < a_chunk->length(); j++) {
ASSERT_FALSE(a_chunk->IsNull(j)) << "Column A should not have null values";
ASSERT_FALSE(b_chunk->IsNull(j)) << "Column B should not have null values";
int32_t a_value = a_chunk->Value(j);
bool b_value = b_chunk->Value(j);
// Verify sorting by column A
ASSERT_GE(a_value, last_a_value)
<< "Values not sorted at chunk " << i << ", row " << j
<< ": current=" << a_value << ", last=" << last_a_value;
last_a_value = a_value;
// Verify bool value correctness: should be true if a_value % 3 == 0
bool expected_b_value = (a_value % 5 == 0);
ASSERT_EQ(b_value, expected_b_value)
<< "Bool value incorrect at chunk " << i << ", row " << j
<< ": col_a=" << a_value << ", col_b=" << b_value
<< ", expected=" << expected_b_value;
total_rows_checked++;
}
}
ASSERT_EQ(last_a_value, 24575);
ASSERT_EQ(total_rows_checked, row_count * 2)
<< "Expected " << row_count << " unique rows after merge";
}Component(s)
C++