Skip to content

Commit 3394ac8

Browse files
authored
enhance: [2.6] add ScalarFieldProto& overload to avoid unnecessary copies (#45742)
1. Array.h: Add an output_data(ScalarFieldProto&) overload to both the Array and ArrayView classes.
2. Use std::string_view instead of std::string for VARCHAR and GEOMETRY types to avoid extra string copies.
3. Call Reserve(length_) before writing to proto objects to reduce memory reallocations.

A simple test shows that these optimizations improve the bulk_subscript performance of Array of Varchar by 20%.

issue: #45679
pr: #45743
Signed-off-by: Buqian Zheng <[email protected]>
1 parent f49951e commit 3394ac8

File tree

5 files changed

+65
-27
lines changed

5 files changed

+65
-27
lines changed

internal/core/src/common/Array.h

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -296,11 +296,12 @@ class Array {
296296
return offsets_ptr_.get();
297297
}
298298

299-
ScalarFieldProto
300-
output_data() const {
301-
ScalarFieldProto data_array;
299+
void
300+
output_data(ScalarFieldProto& data_array) const {
302301
switch (element_type_) {
303302
case DataType::BOOL: {
303+
data_array.mutable_bool_data()->mutable_data()->Reserve(
304+
length_);
304305
for (int j = 0; j < length_; ++j) {
305306
auto element = get_data<bool>(j);
306307
data_array.mutable_bool_data()->add_data(element);
@@ -310,13 +311,16 @@ class Array {
310311
case DataType::INT8:
311312
case DataType::INT16:
312313
case DataType::INT32: {
314+
data_array.mutable_int_data()->mutable_data()->Reserve(length_);
313315
for (int j = 0; j < length_; ++j) {
314316
auto element = get_data<int>(j);
315317
data_array.mutable_int_data()->add_data(element);
316318
}
317319
break;
318320
}
319321
case DataType::INT64: {
322+
data_array.mutable_long_data()->mutable_data()->Reserve(
323+
length_);
320324
for (int j = 0; j < length_; ++j) {
321325
auto element = get_data<int64_t>(j);
322326
data_array.mutable_long_data()->add_data(element);
@@ -325,37 +329,53 @@ class Array {
325329
}
326330
case DataType::STRING:
327331
case DataType::VARCHAR: {
332+
data_array.mutable_string_data()->mutable_data()->Reserve(
333+
length_);
328334
for (int j = 0; j < length_; ++j) {
329-
auto element = get_data<std::string>(j);
330-
data_array.mutable_string_data()->add_data(element);
335+
auto element = get_data<std::string_view>(j);
336+
data_array.mutable_string_data()->add_data(element.data(),
337+
element.size());
331338
}
332339
break;
333340
}
334341
case DataType::FLOAT: {
342+
data_array.mutable_float_data()->mutable_data()->Reserve(
343+
length_);
335344
for (int j = 0; j < length_; ++j) {
336345
auto element = get_data<float>(j);
337346
data_array.mutable_float_data()->add_data(element);
338347
}
339348
break;
340349
}
341350
case DataType::DOUBLE: {
351+
data_array.mutable_double_data()->mutable_data()->Reserve(
352+
length_);
342353
for (int j = 0; j < length_; ++j) {
343354
auto element = get_data<double>(j);
344355
data_array.mutable_double_data()->add_data(element);
345356
}
346357
break;
347358
}
348359
case DataType::GEOMETRY: {
360+
data_array.mutable_geometry_data()->mutable_data()->Reserve(
361+
length_);
349362
for (int j = 0; j < length_; ++j) {
350-
auto element = get_data<std::string>(j);
351-
data_array.mutable_geometry_data()->add_data(element);
363+
auto element = get_data<std::string_view>(j);
364+
data_array.mutable_geometry_data()->add_data(
365+
element.data(), element.size());
352366
}
353367
break;
354368
}
355369
default: {
356370
// empty array
357371
}
358372
}
373+
}
374+
375+
ScalarFieldProto
376+
output_data() const {
377+
ScalarFieldProto data_array;
378+
output_data(data_array);
359379
return data_array;
360380
}
361381

@@ -541,11 +561,12 @@ class ArrayView {
541561
return reinterpret_cast<T*>(data_)[index];
542562
}
543563

544-
ScalarFieldProto
545-
output_data() const {
546-
ScalarFieldProto data_array;
564+
void
565+
output_data(ScalarFieldProto& data_array) const {
547566
switch (element_type_) {
548567
case DataType::BOOL: {
568+
data_array.mutable_bool_data()->mutable_data()->Reserve(
569+
length_);
549570
for (int j = 0; j < length_; ++j) {
550571
auto element = get_data<bool>(j);
551572
data_array.mutable_bool_data()->add_data(element);
@@ -555,13 +576,16 @@ class ArrayView {
555576
case DataType::INT8:
556577
case DataType::INT16:
557578
case DataType::INT32: {
579+
data_array.mutable_int_data()->mutable_data()->Reserve(length_);
558580
for (int j = 0; j < length_; ++j) {
559581
auto element = get_data<int>(j);
560582
data_array.mutable_int_data()->add_data(element);
561583
}
562584
break;
563585
}
564586
case DataType::INT64: {
587+
data_array.mutable_long_data()->mutable_data()->Reserve(
588+
length_);
565589
for (int j = 0; j < length_; ++j) {
566590
auto element = get_data<int64_t>(j);
567591
data_array.mutable_long_data()->add_data(element);
@@ -570,37 +594,53 @@ class ArrayView {
570594
}
571595
case DataType::STRING:
572596
case DataType::VARCHAR: {
597+
data_array.mutable_string_data()->mutable_data()->Reserve(
598+
length_);
573599
for (int j = 0; j < length_; ++j) {
574-
auto element = get_data<std::string>(j);
575-
data_array.mutable_string_data()->add_data(element);
600+
auto element = get_data<std::string_view>(j);
601+
data_array.mutable_string_data()->add_data(element.data(),
602+
element.size());
576603
}
577604
break;
578605
}
579606
case DataType::FLOAT: {
607+
data_array.mutable_float_data()->mutable_data()->Reserve(
608+
length_);
580609
for (int j = 0; j < length_; ++j) {
581610
auto element = get_data<float>(j);
582611
data_array.mutable_float_data()->add_data(element);
583612
}
584613
break;
585614
}
586615
case DataType::DOUBLE: {
616+
data_array.mutable_double_data()->mutable_data()->Reserve(
617+
length_);
587618
for (int j = 0; j < length_; ++j) {
588619
auto element = get_data<double>(j);
589620
data_array.mutable_double_data()->add_data(element);
590621
}
591622
break;
592623
}
593624
case DataType::GEOMETRY: {
625+
data_array.mutable_geometry_data()->mutable_data()->Reserve(
626+
length_);
594627
for (int j = 0; j < length_; ++j) {
595-
auto element = get_data<std::string>(j);
596-
data_array.mutable_geometry_data()->add_data(element);
628+
auto element = get_data<std::string_view>(j);
629+
data_array.mutable_geometry_data()->add_data(
630+
element.data(), element.size());
597631
}
598632
break;
599633
}
600634
default: {
601635
// empty array
602636
}
603637
}
638+
}
639+
640+
ScalarFieldProto
641+
output_data() const {
642+
ScalarFieldProto data_array;
643+
output_data(data_array);
604644
return data_array;
605645
}
606646

internal/core/src/mmap/ChunkedColumn.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -629,16 +629,15 @@ class ChunkedArrayColumn : public ChunkedColumnBase {
629629

630630
void
631631
BulkArrayAt(milvus::OpContext* op_ctx,
632-
std::function<void(ScalarFieldProto&&, size_t)> fn,
632+
std::function<void(const ArrayView&, size_t)> fn,
633633
const int64_t* offsets,
634634
int64_t count) const override {
635635
auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count);
636636
auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids));
637637
for (int64_t i = 0; i < count; i++) {
638-
auto array = static_cast<ArrayChunk*>(ca->get_cell_of(cids[i]))
639-
->View(offsets_in_chunk[i])
640-
.output_data();
641-
fn(std::move(array), i);
638+
auto view = static_cast<ArrayChunk*>(ca->get_cell_of(cids[i]))
639+
->View(offsets_in_chunk[i]);
640+
fn(view, i);
642641
}
643642
}
644643

internal/core/src/mmap/ChunkedColumnGroup.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,7 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
626626

627627
void
628628
BulkArrayAt(milvus::OpContext* op_ctx,
629-
std::function<void(ScalarFieldProto&&, size_t)> fn,
629+
std::function<void(const ArrayView&, size_t)> fn,
630630
const int64_t* offsets,
631631
int64_t count) const override {
632632
if (!IsChunkedArrayColumnDataType(data_type_)) {
@@ -639,10 +639,9 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
639639
for (int64_t i = 0; i < count; i++) {
640640
auto* group_chunk = ca->get_cell_of(cids[i]);
641641
auto chunk = group_chunk->GetChunk(field_id_);
642-
auto array = static_cast<ArrayChunk*>(chunk.get())
643-
->View(offsets_in_chunk[i])
644-
.output_data();
645-
fn(std::move(array), i);
642+
auto view = static_cast<ArrayChunk*>(chunk.get())
643+
->View(offsets_in_chunk[i]);
644+
fn(view, i);
646645
}
647646
}
648647

internal/core/src/mmap/ChunkedColumnInterface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ class ChunkedColumnInterface {
187187

188188
virtual void
189189
BulkArrayAt(milvus::OpContext* op_ctx,
190-
std::function<void(ScalarFieldProto&&, size_t)> fn,
190+
std::function<void(const ArrayView&, size_t)> fn,
191191
const int64_t* offsets,
192192
int64_t count) const {
193193
ThrowInfo(ErrorCode::Unsupported,

internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,8 +1583,8 @@ ChunkedSegmentSealedImpl::bulk_subscript_array_impl(
15831583
google::protobuf::RepeatedPtrField<T>* dst) {
15841584
column->BulkArrayAt(
15851585
op_ctx,
1586-
[dst](ScalarFieldProto&& array, size_t i) {
1587-
dst->at(i) = std::move(array);
1586+
[dst](const ArrayView& view, size_t i) {
1587+
view.output_data(dst->at(i));
15881588
},
15891589
seg_offsets,
15901590
count);

0 commit comments

Comments
 (0)