diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index e8011b5fc80..9d0aa1a1d88 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -70,7 +70,7 @@ void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) { num_bytes_ = num_bytes; PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_)); - memcpy(data_->mutable_data(), bitset, num_bytes_); + std::memcpy(data_->mutable_data(), bitset, num_bytes_); this->hasher_ = std::make_unique(); } @@ -157,10 +157,8 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize( auto buffer = AllocateBuffer(properties.memory_pool(), bloom_filter_size); const auto bloom_filter_bytes_in_header = header_buf->size() - header_size; - if (bloom_filter_bytes_in_header > 0) { - std::memcpy(buffer->mutable_data(), header_buf->data() + header_size, - static_cast(bloom_filter_bytes_in_header)); - } + SafeMemcpy(buffer->mutable_data(), header_buf->data() + header_size, + static_cast(bloom_filter_bytes_in_header)); const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header; PARQUET_ASSIGN_OR_THROW( diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 79b837f755c..857ed89694e 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -559,11 +559,9 @@ std::shared_ptr SerializedPageReader::DecompressIfNeeded( PARQUET_THROW_NOT_OK( decompression_buffer_->Resize(uncompressed_len, /*shrink_to_fit=*/false)); - if (levels_byte_len > 0) { - // First copy the levels as-is - uint8_t* decompressed = decompression_buffer_->mutable_data(); - memcpy(decompressed, page_buffer->data(), levels_byte_len); - } + // First copy the levels as-is + uint8_t* decompressed = decompression_buffer_->mutable_data(); + SafeMemcpy(decompressed, page_buffer->data(), levels_byte_len); // GH-31992: DataPageV2 may store only levels and no values when all // values are null. In this case, Parquet java is known to produce a diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 94b67dfa807..609f8454617 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -919,13 +919,13 @@ class ColumnWriterImpl { void ConcatenateBuffers(int64_t definition_levels_rle_size, int64_t repetition_levels_rle_size, const std::shared_ptr& values, uint8_t* combined) { - memcpy(combined, repetition_levels_rle_->data(), - static_cast(repetition_levels_rle_size)); + SafeMemcpy(combined, repetition_levels_rle_->data(), + static_cast(repetition_levels_rle_size)); combined += repetition_levels_rle_size; - memcpy(combined, definition_levels_rle_->data(), - static_cast(definition_levels_rle_size)); + SafeMemcpy(combined, definition_levels_rle_->data(), + static_cast(definition_levels_rle_size)); combined += definition_levels_rle_size; - memcpy(combined, values->data(), static_cast(values->size())); + SafeMemcpy(combined, values->data(), static_cast(values->size())); } }; diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index dedf25abcab..50197d25111 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -536,7 +536,7 @@ void TestPrimitiveWriter::ReadColumnFully(Compression::type compr uint8_t* data_ptr = data.data(); for (int64_t i = 0; i < values_read_recently; i++) { const ByteArray& value = this->values_out_ptr_[i + values_read_]; - memcpy(data_ptr, value.ptr, value.len); + SafeMemcpy(data_ptr, value.ptr, value.len); this->values_out_[i + values_read_].ptr = data_ptr; data_ptr += value.len; } diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 3ce2323d29a..9c140b4df53 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -458,10 +458,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values, if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) { ParquetException::EofException(); } - // If bytes_to_decode == 0, data could be null - if (bytes_to_decode > 0) { - memcpy(out, data, static_cast(bytes_to_decode)); - } + SafeMemcpy(out, data, static_cast(bytes_to_decode)); return static_cast(bytes_to_decode); } @@ -675,7 +672,7 @@ inline int PlainDecoder::DecodeArrow( // 1. Copy directly into the FixedSizeBinary data buffer, packed to the right. uint8_t* decode_out = builder->GetMutableValue(builder->length() + null_count); - memcpy(decode_out, data_, values_decoded * byte_width); + SafeMemcpy(decode_out, data_, values_decoded * byte_width); // 2. Expand the values into their final positions. if (null_count == 0) { @@ -1058,7 +1055,7 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictio uint8_t* bytes_data = byte_array_data_->mutable_data(); int32_t* bytes_offsets = byte_array_offsets_->mutable_data_as(); for (int i = 0; i < dictionary_length_; ++i) { - memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); + SafeMemcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len); bytes_offsets[i] = offset; dict_values[i].ptr = bytes_data + offset; offset += dict_values[i].len; @@ -1079,7 +1076,7 @@ inline void DictDecoderImpl::SetDict(TypedDecoder* dictionar /*shrink_to_fit=*/false)); uint8_t* bytes_data = byte_array_data_->mutable_data(); for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) { - memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len); + std::memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len); dict_values[i].ptr = bytes_data + offset; } } @@ -2017,9 +2014,9 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl { // Both prefix and suffix are non-empty, so we need to decode the string // into `data_ptr`. // 1. Copy the prefix - memcpy(*data_ptr, prefix->data(), prefix_len_ptr[i]); + SafeMemcpy(*data_ptr, prefix->data(), prefix_len_ptr[i]); // 2. Copy the suffix. - memcpy(*data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); + SafeMemcpy(*data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len); // 3. Point buffer[i] to the decoded string. buffer[i].ptr = *data_ptr; buffer[i].len += prefix_len_ptr[i]; diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 0e8c0ba32b6..54dd2e5feb3 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -658,9 +658,9 @@ template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) const { memo_table_.VisitValues(0, [&buffer](::std::string_view v) { uint32_t len = static_cast(v.length()); - memcpy(buffer, &len, sizeof(len)); + std::memcpy(buffer, &len, sizeof(len)); buffer += sizeof(len); - memcpy(buffer, v.data(), len); + SafeMemcpy(buffer, v.data(), len); buffer += len; }); } @@ -669,7 +669,7 @@ template <> void DictEncoderImpl::WriteDict(uint8_t* buffer) const { memo_table_.VisitValues(0, [&](::std::string_view v) { DCHECK_EQ(v.length(), static_cast(type_length_)); - memcpy(buffer, v.data(), type_length_); + std::memcpy(buffer, v.data(), type_length_); buffer += type_length_; }); } @@ -1216,8 +1216,8 @@ std::shared_ptr DeltaBitPackEncoder::FlushValues() { // and data was written immediately after. We now write the header data immediately // before the end of reserved space. const size_t offset_bytes = kMaxPageHeaderWriterSize - header_writer.bytes_written(); - std::memcpy(buffer->mutable_data() + offset_bytes, header_buffer_, - header_writer.bytes_written()); + SafeMemcpy(buffer->mutable_data() + offset_bytes, header_buffer_, + header_writer.bytes_written()); // Reset counter of cached values total_value_count_ = 0; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 2e5f6fe37c4..7b91f4b5b32 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -890,7 +890,7 @@ inline void TypedStatisticsImpl::Copy(const ByteArray& src, ByteA ResizableBuffer* buffer) { if (dst->ptr == src.ptr) return; PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false)); - std::memcpy(buffer->mutable_data(), src.ptr, src.len); + SafeMemcpy(buffer->mutable_data(), src.ptr, src.len); *dst = ByteArray(src.len, buffer->data()); } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..b509a8be34a 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -523,7 +523,7 @@ std::vector TestStatistics::GetDeepCopy( for (const ByteArray& ba : values) { uint8_t* ptr; PARQUET_THROW_NOT_OK(pool->Allocate(ba.len, &ptr)); - memcpy(ptr, ba.ptr, ba.len); + SafeMemcpy(ptr, ba.ptr, ba.len); copy.emplace_back(ba.len, ptr); } return copy; diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 7e8a18fc94d..c189c2db10b 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -704,6 +704,16 @@ static inline std::string ByteArrayToString(const ByteArray& a) { return std::string(reinterpret_cast(a.ptr), a.len); } +/// \brief Safe memcpy that avoids undefined behavior when src is nullptr. +/// +/// According to the C++ standard, calling std::memcpy with a nullptr is undefined +/// behavior even when size is 0. This function guards against that case. +static inline void SafeMemcpy(void* dest, const void* src, size_t size) { + if (size > 0) { + std::memcpy(dest, src, size); + } +} + static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) { std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds)); }