Skip to content
This repository was archived by the owner on May 6, 2024. It is now read-only.

Commit 3ad360c

Browse files
author
harborn
authored
[POAE7-2932] support convert Dictionary encoding vector to Flat encoding vector (#430)
* [POAE7-2932] support convert Dictionary encoding vector to Flat encoding vector * format fix * fix a bug * fix code style * fix compile error * fix compile error * fix bug for UT * fix
1 parent 7e70079 commit 3ad360c

13 files changed

+93
-25
lines changed

cpp/src/cider-velox/src/CiderPipelineOperator.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,14 @@
2323
#include "Allocator.h"
2424
#include "CiderCrossJoinBuild.h"
2525
#include "CiderHashJoinBuild.h"
26+
#include "ciderTransformer/CiderPlanTransformerOptions.h"
2627
#include "exec/plan/substrait/SubstraitPlan.h"
2728
#include "velox/exec/Task.h"
2829
#ifndef CIDER_BATCH_PROCESSOR_CONTEXT_H
2930
#include "velox/vector/arrow/Abi.h"
3031
#endif
32+
#include "velox/vector/DecodedVector.h"
33+
#include "velox/vector/DictionaryVector.h"
3134
#include "velox/vector/arrow/Bridge.h"
3235

3336
namespace facebook::velox::plugin {
@@ -36,7 +39,38 @@ bool CiderPipelineOperator::needsInput() const {
3639
return !finished_;
3740
}
3841

42+
// Builds a new vector with the same type and row count as `vectorPtr`,
// allocated from this operator's memory pool, and copies every row of
// `vectorPtr` into it. Callers in this file use it to replace a
// dictionary-encoded child with a standalone copy.
//
// @param vectorPtr  Source vector; it is only read.
// @return           The newly created vector holding the copied rows.
VectorPtr CiderPipelineOperator::copyVector(const VectorPtr& vectorPtr) {
  const auto rowCount = vectorPtr->size();
  auto result =
      BaseVector::create(vectorPtr->type(), rowCount, operatorCtx_->pool());

  // Select every row of the source. The trailing nullptr is the optional
  // row-mapping argument of copy(); presumably it means "row i copies from
  // source row i" -- confirm against BaseVector::copy.
  const SelectivityVector everyRow(rowCount);
  result->copy(vectorPtr.get(), everyRow, nullptr);
  return result;
}
49+
50+
// Returns a RowVector equivalent to `input` in which every top-level child
// whose encoding is DICTIONARY has been replaced by a copy produced by
// copyVector(). Children with any other encoding are shared with `input`
// (no copy is made). Only direct children are inspected; nested vectors are
// left untouched.
//
// The result reuses `input`'s own row type and nulls buffer, so column names
// and row-level nulls are carried over instead of being rebuilt or dropped.
//
// @param input  Row vector to convert; it is not modified.
// @return       A new RowVector allocated from this operator's pool.
RowVectorPtr CiderPipelineOperator::convertDictionaryToFlat(RowVectorPtr& input) {
  std::vector<VectorPtr> children;
  children.reserve(input->childrenSize());

  for (const auto& child : input->children()) {
    if (child->encoding() == VectorEncoding::Simple::DICTIONARY) {
      // Materialize the dictionary-encoded column into a standalone vector.
      children.emplace_back(copyVector(child));
    } else {
      children.emplace_back(child);
    }
  }

  return std::make_shared<RowVector>(operatorCtx_->pool(),
                                     input->type(),
                                     input->nulls(),
                                     input->size(),
                                     std::move(children));
}
69+
3970
void CiderPipelineOperator::addInput(RowVectorPtr input) {
71+
if (FLAGS_enable_flatten_dictionary_encoding) {
72+
input = convertDictionaryToFlat(input);
73+
}
4074
for (size_t i = 0; i < input->childrenSize(); i++) {
4175
input->childAt(i)->mutableRawNulls();
4276
}

cpp/src/cider-velox/src/CiderPipelineOperator.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ class CiderPipelineOperator : public exec::Operator {
4545

4646
void noMoreInput() override;
4747

48+
private:
49+
VectorPtr copyVector(const VectorPtr& vectorPtr);
50+
51+
RowVectorPtr convertDictionaryToFlat(RowVectorPtr& input);
52+
4853
private:
4954
cider::exec::processor::BatchProcessorPtr batchProcessor_;
5055

cpp/src/cider-velox/src/ciderTransformer/CiderPlanTransformerOptions.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121

2222
#include "CiderPlanTransformerOptions.h"
2323

24+
DEFINE_bool(enable_flatten_dictionary_encoding,
25+
false,
26+
"Enable flatten dictionary encoding to flat encoding");
2427
DEFINE_bool(left_deep_join_pattern, false, "Enable LeftDeepJoinPattern ");
2528
DEFINE_bool(compound_pattern, false, "Enable CompoundPattern ");
2629
DEFINE_bool(filter_pattern, true, "Enable FilterPattern ");

cpp/src/cider-velox/src/ciderTransformer/CiderPlanTransformerOptions.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include <gflags/gflags.h>
2424

25+
DECLARE_bool(enable_flatten_dictionary_encoding);
2526
DECLARE_bool(left_deep_join_pattern);
2627
DECLARE_bool(compound_pattern);
2728
DECLARE_bool(filter_pattern);

cpp/src/cider-velox/test/BatchDataGenerator.h

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,27 +51,31 @@ class BatchDataGenerator {
5151
auto generate(RowTypePtr& rowType,
5252
int rowVectorSize,
5353
vector_size_t vectorSize,
54+
bool withZero,
5455
bool withNull) {
56+
std::mt19937 gen{std::mt19937::default_seed};
5557
std::vector<RowVectorPtr> batches;
5658
for (int i = 0; i < rowVectorSize; ++i) {
57-
auto batch =
58-
createRowVector(rowType, vectorSize, withNull ? randomNulls(7) : nullptr);
59+
auto batch = createRowVector(
60+
rowType, vectorSize, gen, withZero, withNull ? randomNulls(7) : nullptr);
5961
batches.push_back(batch);
6062
}
6163
return batches;
6264
}
6365

6466
RowVectorPtr createRowVector(RowTypePtr& rowType,
6567
vector_size_t vectorSize,
68+
std::mt19937& gen,
69+
bool withZero,
6670
std::function<bool(vector_size_t)> isNullAt = nullptr) {
67-
std::mt19937 gen{std::mt19937::default_seed};
6871
std::vector<VectorPtr> children;
6972
for (uint32_t i = 0; i < rowType->size(); ++i) {
7073
auto vectorPtr = VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH(createScalar,
7174
rowType->childAt(i)->kind(),
7275
rowType->childAt(i),
7376
vectorSize,
7477
gen,
78+
withZero,
7579
isNullAt);
7680
children.emplace_back(vectorPtr);
7781
}
@@ -88,10 +92,30 @@ class BatchDataGenerator {
8892
template <typename T>
8993
T gen_value(std::mt19937& gen);
9094

95+
// True exactly when T is one of the fixed-width signed integer types
// int8_t, int16_t, int32_t or int64_t; false for every other type
// (including bool, unsigned integers and floating-point types).
template <typename T>
static constexpr bool is_integral_type =
    std::disjunction_v<std::is_same<int8_t, T>,
                       std::is_same<int16_t, T>,
                       std::is_same<int32_t, T>,
                       std::is_same<int64_t, T>>;
99+
100+
// Overload selected for the signed integer types covered by is_integral_type.
// Draws values from the single-argument gen_value<T>(gen) and returns the
// first acceptable one: any value when `withZero` is true, otherwise the
// first non-zero value (zero draws are discarded and redrawn).
//
// @param gen       Random engine passed through to gen_value<T>(gen).
// @param withZero  Whether 0 is allowed as a result.
template <typename T, typename std::enable_if_t<is_integral_type<T>, bool> = true>
T gen_value(std::mt19937& gen, bool withZero) {
  for (;;) {
    const T candidate = gen_value<T>(gen);
    if (withZero || candidate != 0) {
      return candidate;
    }
  }
}
108+
109+
// Overload selected for every type not covered by is_integral_type.
// The zero-filtering flag is accepted only so that callers can use one call
// shape for all types; it is ignored and the result of the single-argument
// gen_value<T>(gen) is returned unchanged.
//
// @param gen  Random engine passed through to gen_value<T>(gen).
template <typename T, typename std::enable_if_t<!is_integral_type<T>, bool> = true>
T gen_value(std::mt19937& gen, bool /*withZero*/) {
  T generated = gen_value<T>(gen);
  return generated;
}
113+
91114
template <TypeKind KIND>
92115
VectorPtr createScalar(TypePtr type,
93116
vector_size_t size,
94117
std::mt19937& gen,
118+
bool withZero,
95119
std::function<bool(vector_size_t)> isNullAt = nullptr) {
96120
using T = facebook::velox::TypeTraits<KIND>::NativeType;
97121
auto flatVector =
@@ -100,7 +124,7 @@ class BatchDataGenerator {
100124
if (isNullAt && isNullAt(i)) {
101125
flatVector->setNull(i, true);
102126
} else {
103-
flatVector->set(i, gen_value<T>(gen));
127+
flatVector->set(i, gen_value<T>(gen, withZero));
104128
}
105129
}
106130
return flatVector;

cpp/src/cider-velox/test/CiderOperatorHashJoinTest.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,12 @@ class CiderOperatorHashJoinTest : public CiderOperatorTestBase {
7070
int32_t rightSize,
7171
const std::string& referenceQuery,
7272
const std::string& filter = "") {
73+
std::mt19937 gen{std::mt19937::default_seed};
7374
auto leftType = makeRowType(keyTypes, "t_");
7475
auto rightType = makeRowType(keyTypes, "u_");
7576

76-
auto leftBatch = generator_.createRowVector(leftType, leftSize);
77-
auto rightBatch = generator_.createRowVector(rightType, rightSize);
77+
auto leftBatch = generator_.createRowVector(leftType, leftSize, gen);
78+
auto rightBatch = generator_.createRowVector(rightType, rightSize, gen);
7879

7980
auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
8081

cpp/src/cider-velox/test/CiderOperatorTest.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ using namespace facebook::velox::plugin::plantransformer::test;
4545
class CiderOperatorTest : public OperatorTestBase {
4646
void SetUp() override {
4747
// FLAGS_partial_agg_pattern = true;
48-
vectors = generator_.generate(rowType_, 10, 100, false);
48+
vectors = generator_.generate(rowType_, 10, 100, true, false);
4949
createDuckDbTable(vectors);
5050
CiderVeloxPluginCtx::init();
5151
}

cpp/src/cider-velox/test/CiderOperatorTestBase.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,15 @@ class CiderOperatorTestBase : public facebook::velox::exec::test::OperatorTestBa
4141

4242
protected:
4343
auto generateTestBatch(facebook::velox::RowTypePtr& rowType, bool withNull) {
44-
auto batches = generator_.generate(rowType, 10, 100, withNull);
44+
auto batches = generator_.generate(rowType, 10, 100, true, withNull);
45+
createDuckDbTable(batches);
46+
return batches;
47+
}
48+
49+
// Generates test input for `rowType` via generator_.generate(), registers the
// generated batches with createDuckDbTable(), and returns them.
//
// @param rowType   Row type describing the columns to generate.
// @param withZero  Forwarded to the generator's `withZero` argument.
// @param withNull  Forwarded to the generator's `withNull` argument.
// @return          The batches produced by the generator.
auto generateTestBatch(facebook::velox::RowTypePtr& rowType,
                       bool withZero,
                       bool withNull) {
  // Fixed test-data shape: 10 row vectors of 100 rows each.
  constexpr int kBatchCount = 10;
  constexpr int kRowsPerBatch = 100;

  auto testData =
      generator_.generate(rowType, kBatchCount, kRowsPerBatch, withZero, withNull);
  createDuckDbTable(testData);
  return testData;
}

cpp/src/cider-velox/test/CiderPlanNodeTest.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class CiderPlanNodeTest : public OperatorTestBase {
4343
};
4444

4545
TEST_F(CiderPlanNodeTest, filter) {
46-
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, false);
46+
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, true, false);
4747
createDuckDbTable(vectors);
4848

4949
const std::string filter =
@@ -62,7 +62,7 @@ TEST_F(CiderPlanNodeTest, filter) {
6262
}
6363

6464
TEST_F(CiderPlanNodeTest, project) {
65-
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, false);
65+
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, true, false);
6666
createDuckDbTable(vectors);
6767

6868
auto veloxPlan = PlanBuilder()
@@ -86,7 +86,7 @@ TEST_F(CiderPlanNodeTest, project) {
8686

8787
#if 0
8888
TEST_F(CiderPlanNodeTest, Q6) {
89-
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, false);
89+
std::vector<RowVectorPtr> vectors = generator_.generate(rowType_, 10, 100, true, false);
9090
createDuckDbTable(vectors);
9191

9292
auto veloxPlan = PlanBuilder()

cpp/src/cider-velox/test/CiderScalarFunctionTest.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ TEST_F(CiderScalarFunctionMathOpTest, colAndColMathOpWithBigIntTest) {
100100
for (auto& type : types_) {
101101
std::shared_ptr<const RowType> rowType{ROW({"c0", "c1"}, {BIGINT(), type})};
102102
verify(CiderPlanBuilder()
103-
.values(generateTestBatch(rowType, false))
103+
.values(generateTestBatch(rowType, false, false))
104104
.project({"c0 + c1", "c0 - c1", "c0 / c1"})
105105
.planNode(),
106106
" select c0 + c1, c0 - c1, c0 / c1 from tmp");

0 commit comments

Comments
 (0)