Skip to content

Commit 3b43f71

Browse files
PSeitzfulmicoton
andauthored
collect_block in QuickwitCollector (#4753)
* collect_block in QuickwitCollector collect_block + using `first_vals` to batch fetch sort values * Isolating the 3 components of the collector. * Apply suggestions from code review Co-authored-by: Paul Masurel <[email protected]> * fix var name --------- Co-authored-by: Paul Masurel <[email protected]>
1 parent 33920d6 commit 3b43f71

File tree

2 files changed

+249
-38
lines changed

2 files changed

+249
-38
lines changed

quickwit/quickwit-search/src/collector.rs

Lines changed: 248 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,20 @@ enum SortingFieldExtractorComponent {
150150
}
151151

152152
impl SortingFieldExtractorComponent {
153+
fn is_fast_field(&self) -> bool {
154+
matches!(self, SortingFieldExtractorComponent::FastField { .. })
155+
}
156+
/// Loads the fast field values for the given doc_ids in its u64 representation. The returned
157+
/// u64 representation maintains the ordering of the original value.
158+
#[inline]
159+
fn extract_typed_sort_values_block(&self, doc_ids: &[DocId], values: &mut [Option<u64>]) {
160+
// In the collect block case we don't have scores to extract
161+
if let SortingFieldExtractorComponent::FastField { sort_column, .. } = self {
162+
let values = &mut values[..doc_ids.len()];
163+
sort_column.first_vals(doc_ids, values);
164+
}
165+
}
166+
153167
/// Returns the sort value for the given element in its u64 representation. The returned u64
154168
/// representation maintains the ordering of the original value.
155169
///
@@ -369,6 +383,23 @@ pub(crate) struct SortingFieldExtractorPair {
369383
}
370384

371385
impl SortingFieldExtractorPair {
386+
/// Returns the list of sort values for the given element
387+
///
388+
/// See also [`SortingFieldExtractorComponent::extract_typed_sort_values_block`] for more
389+
/// information.
390+
#[inline]
391+
fn extract_typed_sort_values(
392+
&self,
393+
doc_ids: &[DocId],
394+
values1: &mut [Option<u64>],
395+
values2: &mut [Option<u64>],
396+
) {
397+
self.first
398+
.extract_typed_sort_values_block(doc_ids, &mut values1[..doc_ids.len()]);
399+
if let Some(second) = self.second.as_ref() {
400+
second.extract_typed_sort_values_block(doc_ids, &mut values2[..doc_ids.len()]);
401+
}
402+
}
372403
/// Returns the list of sort values for the given element
373404
///
374405
/// See also [`SortingFieldExtractorComponent::extract_typed_sort_value_opt`] for more
@@ -469,17 +500,36 @@ enum AggregationSegmentCollectors {
469500

470501
/// Quickwit collector working at the scale of the segment.
471502
pub struct QuickwitSegmentCollector {
503+
timestamp_filter_opt: Option<TimestampFilter>,
504+
segment_top_k_collector: Option<QuickwitSegmentTopKCollector>,
505+
// Caches for block fetching
506+
filtered_docs: Box<[DocId; 64]>,
507+
aggregation: Option<AggregationSegmentCollectors>,
472508
num_hits: u64,
509+
}
510+
511+
impl QuickwitSegmentCollector {
512+
#[inline]
513+
fn accept_document(&self, doc_id: DocId) -> bool {
514+
if let Some(ref timestamp_filter) = self.timestamp_filter_opt {
515+
return timestamp_filter.is_within_range(doc_id);
516+
}
517+
true
518+
}
519+
}
520+
521+
/// Quickwit collector working at the scale of the segment.
522+
struct QuickwitSegmentTopKCollector {
473523
split_id: String,
474524
score_extractor: SortingFieldExtractorPair,
475525
// PartialHits in this heap don't contain a split_id yet.
476526
top_k_hits: TopK<SegmentPartialHit, SegmentPartialHitSortingKey, HitSortingMapper>,
477527
segment_ord: u32,
478-
timestamp_filter_opt: Option<TimestampFilter>,
479-
aggregation: Option<AggregationSegmentCollectors>,
480528
search_after: Option<SearchAfterSegment>,
481529
// Precomputed order for search_after for split_id and segment_ord
482530
precomp_search_after_order: Ordering,
531+
sort_values1: Box<[Option<u64>; 64]>,
532+
sort_values2: Box<[Option<u64>; 64]>,
483533
}
484534

485535
/// Search After, but the sort values are converted to the u64 fast field representation.
@@ -542,16 +592,100 @@ impl SearchAfterSegment {
542592
}
543593
}
544594

545-
impl QuickwitSegmentCollector {
546-
#[inline]
547-
fn collect_top_k(&mut self, doc_id: DocId, score: Score) {
548-
let (sort_value, sort_value2): (Option<u64>, Option<u64>) =
549-
self.score_extractor.extract_typed_sort_value(doc_id, score);
595+
impl QuickwitSegmentTopKCollector {
596+
fn collect_top_k_block(&mut self, docs: &[DocId]) {
597+
self.score_extractor.extract_typed_sort_values(
598+
docs,
599+
&mut self.sort_values1[..],
600+
&mut self.sort_values2[..],
601+
);
602+
if self.search_after.is_some() {
603+
// Search after not optimized for block collection yet
604+
for ((doc_id, sort_value), sort_value2) in docs
605+
.iter()
606+
.cloned()
607+
.zip(self.sort_values1.iter().cloned())
608+
.zip(self.sort_values2.iter().cloned())
609+
{
610+
Self::collect_top_k_vals(
611+
doc_id,
612+
sort_value,
613+
sort_value2,
614+
&self.search_after,
615+
self.precomp_search_after_order,
616+
&mut self.top_k_hits,
617+
);
618+
}
619+
} else {
620+
// Probaly would make sense to check the fence against e.g. sort_values1 earlier,
621+
// before creating the SegmentPartialHit.
622+
//
623+
// Below are different versions to avoid iterating the caches if they are unused.
624+
//
625+
// No sort values loaded. Sort only by doc_id.
626+
if !self.score_extractor.first.is_fast_field() {
627+
for doc_id in docs.iter().cloned() {
628+
let hit = SegmentPartialHit {
629+
sort_value: None,
630+
sort_value2: None,
631+
doc_id,
632+
};
633+
self.top_k_hits.add_entry(hit);
634+
}
635+
return;
636+
}
637+
let has_no_second_sort = !self
638+
.score_extractor
639+
.second
640+
.as_ref()
641+
.map(|extr| extr.is_fast_field())
642+
.unwrap_or(false);
643+
// No second sort values => We can skip iterating the second sort values cache.
644+
if has_no_second_sort {
645+
for (doc_id, sort_value) in
646+
docs.iter().cloned().zip(self.sort_values1.iter().cloned())
647+
{
648+
let hit = SegmentPartialHit {
649+
sort_value,
650+
sort_value2: None,
651+
doc_id,
652+
};
653+
self.top_k_hits.add_entry(hit);
654+
}
655+
return;
656+
}
550657

551-
if let Some(search_after) = &self.search_after {
658+
for ((doc_id, sort_value), sort_value2) in docs
659+
.iter()
660+
.cloned()
661+
.zip(self.sort_values1.iter().cloned())
662+
.zip(self.sort_values2.iter().cloned())
663+
{
664+
let hit = SegmentPartialHit {
665+
sort_value,
666+
sort_value2,
667+
doc_id,
668+
};
669+
self.top_k_hits.add_entry(hit);
670+
}
671+
}
672+
}
673+
#[inline]
674+
/// Generic top k collection, that includes search_after handling
675+
///
676+
/// Outside of the collector to circumvent lifetime issues.
677+
fn collect_top_k_vals(
678+
doc_id: DocId,
679+
sort_value: Option<u64>,
680+
sort_value2: Option<u64>,
681+
search_after: &Option<SearchAfterSegment>,
682+
precomp_search_after_order: Ordering,
683+
top_k_hits: &mut TopK<SegmentPartialHit, SegmentPartialHitSortingKey, HitSortingMapper>,
684+
) {
685+
if let Some(search_after) = &search_after {
552686
let search_after_value1 = search_after.sort_value;
553687
let search_after_value2 = search_after.sort_value2;
554-
let orders = &self.top_k_hits.sort_key_mapper;
688+
let orders = &top_k_hits.sort_key_mapper;
555689
let mut cmp_result = orders
556690
.order1
557691
.compare_opt(&sort_value, &search_after_value1)
@@ -565,7 +699,7 @@ impl QuickwitSegmentCollector {
565699
// default
566700
let order = orders.order1;
567701
cmp_result = cmp_result
568-
.then(self.precomp_search_after_order)
702+
.then(precomp_search_after_order)
569703
// We compare doc_id only if sort_value1, sort_value2, split_id and segment_ord
570704
// are equal.
571705
.then_with(|| order.compare(&doc_id, &search_after.doc_id))
@@ -581,15 +715,21 @@ impl QuickwitSegmentCollector {
581715
sort_value2,
582716
doc_id,
583717
};
584-
self.top_k_hits.add_entry(hit);
718+
top_k_hits.add_entry(hit);
585719
}
586720

587721
#[inline]
588-
fn accept_document(&self, doc_id: DocId) -> bool {
589-
if let Some(ref timestamp_filter) = self.timestamp_filter_opt {
590-
return timestamp_filter.is_within_range(doc_id);
591-
}
592-
true
722+
fn collect_top_k(&mut self, doc_id: DocId, score: Score) {
723+
let (sort_value, sort_value2): (Option<u64>, Option<u64>) =
724+
self.score_extractor.extract_typed_sort_value(doc_id, score);
725+
Self::collect_top_k_vals(
726+
doc_id,
727+
sort_value,
728+
sort_value2,
729+
&self.search_after,
730+
self.precomp_search_after_order,
731+
&mut self.top_k_hits,
732+
);
593733
}
594734
}
595735

@@ -635,17 +775,72 @@ impl SegmentPartialHit {
635775
}
636776
}
637777

778+
pub use tantivy::COLLECT_BLOCK_BUFFER_LEN;
779+
/// Store the filtered docs in `filtered_docs_buffer` if `timestamp_filter_opt` is present.
780+
///
781+
/// Returns the number of docs.
782+
///
783+
/// Ideally we would return just final docs slice, but we can't do that because of the borrow
784+
/// checker.
785+
fn compute_filtered_block<'a>(
786+
timestamp_filter_opt: &Option<TimestampFilter>,
787+
docs: &'a [DocId],
788+
filtered_docs_buffer: &'a mut [DocId; COLLECT_BLOCK_BUFFER_LEN],
789+
) -> &'a [DocId] {
790+
let Some(timestamp_filter) = &timestamp_filter_opt else {
791+
return docs;
792+
};
793+
let mut len = 0;
794+
for &doc in docs {
795+
filtered_docs_buffer[len] = doc;
796+
len += if timestamp_filter.is_within_range(doc) {
797+
1
798+
} else {
799+
0
800+
};
801+
}
802+
&filtered_docs_buffer[..len]
803+
}
804+
638805
impl SegmentCollector for QuickwitSegmentCollector {
639806
type Fruit = tantivy::Result<LeafSearchResponse>;
640807

808+
#[inline]
809+
fn collect_block(&mut self, unfiltered_docs: &[DocId]) {
810+
let filtered_docs: &[DocId] = compute_filtered_block(
811+
&self.timestamp_filter_opt,
812+
unfiltered_docs,
813+
&mut self.filtered_docs,
814+
);
815+
816+
// Update results
817+
self.num_hits += filtered_docs.len() as u64;
818+
819+
if let Some(segment_top_k_collector) = self.segment_top_k_collector.as_mut() {
820+
segment_top_k_collector.collect_top_k_block(filtered_docs);
821+
}
822+
823+
match self.aggregation.as_mut() {
824+
Some(AggregationSegmentCollectors::FindTraceIdsSegmentCollector(collector)) => {
825+
collector.collect_block(filtered_docs)
826+
}
827+
Some(AggregationSegmentCollectors::TantivyAggregationSegmentCollector(collector)) => {
828+
collector.collect_block(filtered_docs)
829+
}
830+
None => (),
831+
}
832+
}
833+
641834
#[inline]
642835
fn collect(&mut self, doc_id: DocId, score: Score) {
643836
if !self.accept_document(doc_id) {
644837
return;
645838
}
646839

647840
self.num_hits += 1;
648-
self.collect_top_k(doc_id, score);
841+
if let Some(segment_top_k_collector) = self.segment_top_k_collector.as_mut() {
842+
segment_top_k_collector.collect_top_k(doc_id, score);
843+
}
649844

650845
match self.aggregation.as_mut() {
651846
Some(AggregationSegmentCollectors::FindTraceIdsSegmentCollector(collector)) => {
@@ -659,19 +854,23 @@ impl SegmentCollector for QuickwitSegmentCollector {
659854
}
660855

661856
fn harvest(self) -> Self::Fruit {
662-
let partial_hits: Vec<PartialHit> = self
663-
.top_k_hits
664-
.finalize()
665-
.into_iter()
666-
.map(|segment_partial_hit: SegmentPartialHit| {
667-
segment_partial_hit.into_partial_hit(
668-
self.split_id.clone(),
669-
self.segment_ord,
670-
&self.score_extractor.first,
671-
&self.score_extractor.second,
672-
)
673-
})
674-
.collect();
857+
let mut partial_hits: Vec<PartialHit> = Vec::new();
858+
if let Some(segment_top_k_collector) = self.segment_top_k_collector {
859+
// TODO put that in a method of segment_top_k_collector
860+
partial_hits = segment_top_k_collector
861+
.top_k_hits
862+
.finalize()
863+
.into_iter()
864+
.map(|segment_partial_hit: SegmentPartialHit| {
865+
segment_partial_hit.into_partial_hit(
866+
segment_top_k_collector.split_id.clone(),
867+
segment_top_k_collector.segment_ord,
868+
&segment_top_k_collector.score_extractor.first,
869+
&segment_top_k_collector.score_extractor.second,
870+
)
871+
})
872+
.collect();
873+
}
675874

676875
let intermediate_aggregation_result = match self.aggregation {
677876
Some(AggregationSegmentCollectors::FindTraceIdsSegmentCollector(collector)) => {
@@ -897,16 +1096,28 @@ impl Collector for QuickwitCollector {
8971096
// Convert search_after into fast field u64
8981097
let search_after =
8991098
SearchAfterSegment::new(self.search_after.clone(), order1, order2, &score_extractor);
1099+
1100+
let segment_top_k_collector = if leaf_max_hits == 0 {
1101+
None
1102+
} else {
1103+
Some(QuickwitSegmentTopKCollector {
1104+
split_id: self.split_id.clone(),
1105+
score_extractor,
1106+
top_k_hits: TopK::new(leaf_max_hits, sort_key_mapper),
1107+
segment_ord,
1108+
search_after,
1109+
precomp_search_after_order,
1110+
sort_values1: Box::new([None; 64]),
1111+
sort_values2: Box::new([None; 64]),
1112+
})
1113+
};
1114+
9001115
Ok(QuickwitSegmentCollector {
901-
num_hits: 0u64,
902-
split_id: self.split_id.clone(),
903-
score_extractor,
904-
top_k_hits: TopK::new(leaf_max_hits, sort_key_mapper),
905-
segment_ord,
1116+
num_hits: 0,
9061117
timestamp_filter_opt,
1118+
segment_top_k_collector,
9071119
aggregation,
908-
search_after,
909-
precomp_search_after_order,
1120+
filtered_docs: Box::new([0; 64]),
9101121
})
9111122
}
9121123

quickwit/rest-api-tests/run_tests.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def check_result_list(result, expected, context_path=""):
147147
display_filtered_result = filtered_result[:5] + ['...'] if len(filtered_result) > 5 else filtered_result
148148
else:
149149
display_filtered_result = filtered_result
150-
raise Exception("Wrong length at context %s. Expected: %s Received: %s,\n Expected \n%s \n Received \n%s" % (context_path, len(expected), len(result), display_filtered_result, expected))
150+
raise Exception("Wrong length at context %s. Expected: %s Received: %s,\n Expected \n%s \n Received \n%s" % (context_path, len(expected), len(result), expected, display_filtered_result))
151151
raise Exception("Wrong length at context %s. Expected: %s Received: %s" % (context_path, len(expected), len(result)))
152152
for (i, (left, right)) in enumerate(zip(result, expected)):
153153
check_result(left, right, context_path + "[%s]" % i)

0 commit comments

Comments
 (0)