Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ set(PARQUET_SRCS
encryption/internal_file_encryptor.cc
exception.cc
file_reader.cc
file_rewriter.cc
file_writer.cc
geospatial/statistics.cc
geospatial/util_internal.cc
Expand Down Expand Up @@ -412,6 +413,8 @@ add_parquet_test(arrow-reader-writer-test

add_parquet_test(arrow-index-test SOURCES arrow/index_test.cc)

# Tests for the Parquet file rewriter, built from arrow/arrow_rewriter_test.cc.
add_parquet_test(arrow-rewriter-test SOURCES arrow/arrow_rewriter_test.cc)

add_parquet_test(arrow-internals-test SOURCES arrow/path_internal_test.cc
arrow/reconstruct_internal_test.cc)

Expand Down
117 changes: 117 additions & 0 deletions cpp/src/parquet/arrow/arrow_rewriter_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/io/memory.h"
#include "arrow/testing/gtest_util.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"
#include "parquet/file_rewriter.h"
#ifdef _MSC_VER
# pragma warning(push)
// Disable forcing value to bool warnings
# pragma warning(disable : 4800)
#endif

#include <memory>

#include "gtest/gtest.h"

#include "parquet/arrow/test_util.h"
#include "parquet/platform.h"
#include "parquet/properties.h"

using arrow::Table;
using arrow::io::BufferReader;

namespace parquet::arrow {

// Writes a small two-column table to an in-memory Parquet file (with page index
// writing enabled), runs it through ParquetFileRewriter as the only input, and
// checks that the rewritten file reads back as the same table.
TEST(ParquetRewriterTest, SimpleRoundTrip) {
  auto rewriter_properties =
      RewriterProperties::Builder()
          .writer_properties(
              WriterProperties::Builder().enable_write_page_index()->build())
          ->build();

  auto schema = ::arrow::schema(
      {::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});

  // The table written as input is also the expected result of the round trip.
  auto expected_table = ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"});

  std::shared_ptr<Buffer> buffer;

  // WriteFile() uses fatal ASSERT_* macros internally. A fatal failure inside a
  // helper only returns from the helper, so propagate it here instead of going
  // on with a null buffer.
  ASSERT_NO_FATAL_FAILURE(
      WriteFile(rewriter_properties->writer_properties(), expected_table, buffer));

  auto sink = CreateOutputStream();
  auto rewriter =
      ParquetFileRewriter::Open({std::make_shared<BufferReader>(buffer)}, sink, {NULLPTR},
                                NULLPTR, rewriter_properties);
  // The results of Rewrite()/Close() are not inspected here; guard against
  // exceptions so that a failure is attributed to the exact step.
  ASSERT_NO_THROW(rewriter->Rewrite());
  ASSERT_NO_THROW(rewriter->Close());

  ASSERT_OK_AND_ASSIGN(auto out_buffer, sink->Finish());
  auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
  ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
                                                     std::move(file_reader)));

  std::shared_ptr<Table> table;
  ASSERT_OK(reader->ReadTable(&table));
  ASSERT_OK(table->ValidateFull());

  AssertTablesEqual(*expected_table, *table);
}

// Writes two in-memory Parquet files sharing one schema, passes both to
// ParquetFileRewriter as inputs (in order), and checks that the rewritten file
// reads back as the rows of the first file followed by the rows of the second.
TEST(ParquetRewriterTest, ConcatRoundTrip) {
  auto rewriter_properties =
      RewriterProperties::Builder()
          .writer_properties(
              WriterProperties::Builder().enable_write_page_index()->build())
          ->build();

  auto schema = ::arrow::schema(
      {::arrow::field("a", ::arrow::int32()), ::arrow::field("b", ::arrow::utf8())});

  std::shared_ptr<Buffer> buffer_up;
  std::shared_ptr<Buffer> buffer_down;

  // WriteFile() uses fatal ASSERT_* macros internally. A fatal failure inside a
  // helper only returns from the helper, so propagate it here instead of going
  // on with a null buffer.
  auto table_up = ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"]])"});
  ASSERT_NO_FATAL_FAILURE(
      WriteFile(rewriter_properties->writer_properties(), table_up, buffer_up));
  auto table_down = ::arrow::TableFromJSON(schema, {R"([[3, "c"]])"});
  ASSERT_NO_FATAL_FAILURE(
      WriteFile(rewriter_properties->writer_properties(), table_down, buffer_down));

  auto sink = CreateOutputStream();
  auto rewriter =
      ParquetFileRewriter::Open({std::make_shared<BufferReader>(buffer_up),
                                 std::make_shared<BufferReader>(buffer_down)},
                                sink, {NULLPTR, NULLPTR}, NULLPTR, rewriter_properties);
  // The results of Rewrite()/Close() are not inspected here; guard against
  // exceptions so that a failure is attributed to the exact step.
  ASSERT_NO_THROW(rewriter->Rewrite());
  ASSERT_NO_THROW(rewriter->Close());

  ASSERT_OK_AND_ASSIGN(auto out_buffer, sink->Finish());
  auto file_reader = ParquetFileReader::Open(std::make_shared<BufferReader>(out_buffer));
  ASSERT_OK_AND_ASSIGN(auto reader, FileReader::Make(::arrow::default_memory_pool(),
                                                     std::move(file_reader)));

  std::shared_ptr<Table> table;
  ASSERT_OK(reader->ReadTable(&table));
  ASSERT_OK(table->ValidateFull());

  auto expected_table =
      ::arrow::TableFromJSON(schema, {R"([[1, "a"], [2, "b"], [3, "c"]])"});
  AssertTablesEqual(*expected_table, *table);
}

} // namespace parquet::arrow
28 changes: 28 additions & 0 deletions cpp/src/parquet/arrow/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,23 @@
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_primitive.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
#include "arrow/util/decimal.h"
#include "arrow/util/float16.h"
#include "parquet/arrow/schema.h"
#include "parquet/arrow/writer.h"
#include "parquet/column_reader.h"
#include "parquet/file_writer.h"
#include "parquet/test_util.h"

namespace parquet {

using internal::RecordReader;
using schema::GroupNode;

namespace arrow {

Expand Down Expand Up @@ -482,6 +487,29 @@ void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
EXPECT_TRUE(result->Equals(*expected_array));
}

/// \brief Serialize an Arrow table to an in-memory Parquet file.
///
/// The Parquet schema is derived from the table's Arrow schema using the given
/// writer properties and the default Arrow writer properties.
///
/// This helper uses fatal GoogleTest assertions. On a fatal failure it returns
/// early and `buffer` is left unmodified, so callers should wrap the call in
/// ASSERT_NO_FATAL_FAILURE().
///
/// Declared `inline` because it is defined in a header: without it, including
/// this header from more than one translation unit of the same binary would
/// produce multiple definitions.
///
/// \param[in] writer_properties Parquet writer properties used for both schema
///   conversion and file writing
/// \param[in] table the table to write
/// \param[out] buffer receives the finished file contents on success
inline void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties,
                      const std::shared_ptr<::arrow::Table>& table,
                      std::shared_ptr<Buffer>& buffer) {
  // Get schema from table.
  auto schema = table->schema();
  std::shared_ptr<SchemaDescriptor> parquet_schema;
  auto arrow_writer_properties = default_arrow_writer_properties();
  ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
                                     *arrow_writer_properties, &parquet_schema));
  auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());

  // Write table to buffer.
  auto sink = CreateOutputStream();
  auto pool = ::arrow::default_memory_pool();
  auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
  std::unique_ptr<FileWriter> arrow_writer;
  ASSERT_OK(FileWriter::Make(pool, std::move(writer), schema, arrow_writer_properties,
                             &arrow_writer));
  ASSERT_OK_NO_THROW(arrow_writer->WriteTable(*table));
  ASSERT_OK_NO_THROW(arrow_writer->Close());
  // Only assign the output once the file has been fully written and closed.
  ASSERT_OK_AND_ASSIGN(buffer, sink->Finish());
}

} // namespace arrow

} // namespace parquet
Loading
Loading