@@ -150,6 +150,20 @@ enum SortingFieldExtractorComponent {
150150}
151151
152152impl SortingFieldExtractorComponent {
153+ fn is_fast_field ( & self ) -> bool {
154+ matches ! ( self , SortingFieldExtractorComponent :: FastField { .. } )
155+ }
156+ /// Loads the fast field values for the given doc_ids in its u64 representation. The returned
157+ /// u64 representation maintains the ordering of the original value.
158+ #[ inline]
159+ fn extract_typed_sort_values_block ( & self , doc_ids : & [ DocId ] , values : & mut [ Option < u64 > ] ) {
160+ // In the collect block case we don't have scores to extract
161+ if let SortingFieldExtractorComponent :: FastField { sort_column, .. } = self {
162+ let values = & mut values[ ..doc_ids. len ( ) ] ;
163+ sort_column. first_vals ( doc_ids, values) ;
164+ }
165+ }
166+
153167 /// Returns the sort value for the given element in its u64 representation. The returned u64
154168 /// representation maintains the ordering of the original value.
155169 ///
@@ -369,6 +383,23 @@ pub(crate) struct SortingFieldExtractorPair {
369383}
370384
371385impl SortingFieldExtractorPair {
386+ /// Returns the list of sort values for the given element
387+ ///
388+ /// See also [`SortingFieldExtractorComponent::extract_typed_sort_values_block`] for more
389+ /// information.
390+ #[ inline]
391+ fn extract_typed_sort_values (
392+ & self ,
393+ doc_ids : & [ DocId ] ,
394+ values1 : & mut [ Option < u64 > ] ,
395+ values2 : & mut [ Option < u64 > ] ,
396+ ) {
397+ self . first
398+ . extract_typed_sort_values_block ( doc_ids, & mut values1[ ..doc_ids. len ( ) ] ) ;
399+ if let Some ( second) = self . second . as_ref ( ) {
400+ second. extract_typed_sort_values_block ( doc_ids, & mut values2[ ..doc_ids. len ( ) ] ) ;
401+ }
402+ }
372403 /// Returns the list of sort values for the given element
373404 ///
374405 /// See also [`SortingFieldExtractorComponent::extract_typed_sort_value_opt`] for more
@@ -469,17 +500,36 @@ enum AggregationSegmentCollectors {
469500
470501/// Quickwit collector working at the scale of the segment.
471502pub struct QuickwitSegmentCollector {
503+ timestamp_filter_opt : Option < TimestampFilter > ,
504+ segment_top_k_collector : Option < QuickwitSegmentTopKCollector > ,
505+ // Caches for block fetching
506+ filtered_docs : Box < [ DocId ; 64 ] > ,
507+ aggregation : Option < AggregationSegmentCollectors > ,
472508 num_hits : u64 ,
509+ }
510+
511+ impl QuickwitSegmentCollector {
512+ #[ inline]
513+ fn accept_document ( & self , doc_id : DocId ) -> bool {
514+ if let Some ( ref timestamp_filter) = self . timestamp_filter_opt {
515+ return timestamp_filter. is_within_range ( doc_id) ;
516+ }
517+ true
518+ }
519+ }
520+
521+ /// Quickwit collector working at the scale of the segment.
522+ struct QuickwitSegmentTopKCollector {
473523 split_id : String ,
474524 score_extractor : SortingFieldExtractorPair ,
475525 // PartialHits in this heap don't contain a split_id yet.
476526 top_k_hits : TopK < SegmentPartialHit , SegmentPartialHitSortingKey , HitSortingMapper > ,
477527 segment_ord : u32 ,
478- timestamp_filter_opt : Option < TimestampFilter > ,
479- aggregation : Option < AggregationSegmentCollectors > ,
480528 search_after : Option < SearchAfterSegment > ,
481529 // Precomputed order for search_after for split_id and segment_ord
482530 precomp_search_after_order : Ordering ,
531+ sort_values1 : Box < [ Option < u64 > ; 64 ] > ,
532+ sort_values2 : Box < [ Option < u64 > ; 64 ] > ,
483533}
484534
485535/// Search After, but the sort values are converted to the u64 fast field representation.
@@ -542,16 +592,100 @@ impl SearchAfterSegment {
542592 }
543593}
544594
545- impl QuickwitSegmentCollector {
546- #[ inline]
547- fn collect_top_k ( & mut self , doc_id : DocId , score : Score ) {
548- let ( sort_value, sort_value2) : ( Option < u64 > , Option < u64 > ) =
549- self . score_extractor . extract_typed_sort_value ( doc_id, score) ;
595+ impl QuickwitSegmentTopKCollector {
596+ fn collect_top_k_block ( & mut self , docs : & [ DocId ] ) {
597+ self . score_extractor . extract_typed_sort_values (
598+ docs,
599+ & mut self . sort_values1 [ ..] ,
600+ & mut self . sort_values2 [ ..] ,
601+ ) ;
602+ if self . search_after . is_some ( ) {
603+ // Search after not optimized for block collection yet
604+ for ( ( doc_id, sort_value) , sort_value2) in docs
605+ . iter ( )
606+ . cloned ( )
607+ . zip ( self . sort_values1 . iter ( ) . cloned ( ) )
608+ . zip ( self . sort_values2 . iter ( ) . cloned ( ) )
609+ {
610+ Self :: collect_top_k_vals (
611+ doc_id,
612+ sort_value,
613+ sort_value2,
614+ & self . search_after ,
615+ self . precomp_search_after_order ,
616+ & mut self . top_k_hits ,
617+ ) ;
618+ }
619+ } else {
620+ // Probaly would make sense to check the fence against e.g. sort_values1 earlier,
621+ // before creating the SegmentPartialHit.
622+ //
623+ // Below are different versions to avoid iterating the caches if they are unused.
624+ //
625+ // No sort values loaded. Sort only by doc_id.
626+ if !self . score_extractor . first . is_fast_field ( ) {
627+ for doc_id in docs. iter ( ) . cloned ( ) {
628+ let hit = SegmentPartialHit {
629+ sort_value : None ,
630+ sort_value2 : None ,
631+ doc_id,
632+ } ;
633+ self . top_k_hits . add_entry ( hit) ;
634+ }
635+ return ;
636+ }
637+ let has_no_second_sort = !self
638+ . score_extractor
639+ . second
640+ . as_ref ( )
641+ . map ( |extr| extr. is_fast_field ( ) )
642+ . unwrap_or ( false ) ;
643+ // No second sort values => We can skip iterating the second sort values cache.
644+ if has_no_second_sort {
645+ for ( doc_id, sort_value) in
646+ docs. iter ( ) . cloned ( ) . zip ( self . sort_values1 . iter ( ) . cloned ( ) )
647+ {
648+ let hit = SegmentPartialHit {
649+ sort_value,
650+ sort_value2 : None ,
651+ doc_id,
652+ } ;
653+ self . top_k_hits . add_entry ( hit) ;
654+ }
655+ return ;
656+ }
550657
551- if let Some ( search_after) = & self . search_after {
658+ for ( ( doc_id, sort_value) , sort_value2) in docs
659+ . iter ( )
660+ . cloned ( )
661+ . zip ( self . sort_values1 . iter ( ) . cloned ( ) )
662+ . zip ( self . sort_values2 . iter ( ) . cloned ( ) )
663+ {
664+ let hit = SegmentPartialHit {
665+ sort_value,
666+ sort_value2,
667+ doc_id,
668+ } ;
669+ self . top_k_hits . add_entry ( hit) ;
670+ }
671+ }
672+ }
673+ #[ inline]
674+ /// Generic top k collection, that includes search_after handling
675+ ///
676+ /// Outside of the collector to circumvent lifetime issues.
677+ fn collect_top_k_vals (
678+ doc_id : DocId ,
679+ sort_value : Option < u64 > ,
680+ sort_value2 : Option < u64 > ,
681+ search_after : & Option < SearchAfterSegment > ,
682+ precomp_search_after_order : Ordering ,
683+ top_k_hits : & mut TopK < SegmentPartialHit , SegmentPartialHitSortingKey , HitSortingMapper > ,
684+ ) {
685+ if let Some ( search_after) = & search_after {
552686 let search_after_value1 = search_after. sort_value ;
553687 let search_after_value2 = search_after. sort_value2 ;
554- let orders = & self . top_k_hits . sort_key_mapper ;
688+ let orders = & top_k_hits. sort_key_mapper ;
555689 let mut cmp_result = orders
556690 . order1
557691 . compare_opt ( & sort_value, & search_after_value1)
@@ -565,7 +699,7 @@ impl QuickwitSegmentCollector {
565699 // default
566700 let order = orders. order1 ;
567701 cmp_result = cmp_result
568- . then ( self . precomp_search_after_order )
702+ . then ( precomp_search_after_order)
569703 // We compare doc_id only if sort_value1, sort_value2, split_id and segment_ord
570704 // are equal.
571705 . then_with ( || order. compare ( & doc_id, & search_after. doc_id ) )
@@ -581,15 +715,21 @@ impl QuickwitSegmentCollector {
581715 sort_value2,
582716 doc_id,
583717 } ;
584- self . top_k_hits . add_entry ( hit) ;
718+ top_k_hits. add_entry ( hit) ;
585719 }
586720
587721 #[ inline]
588- fn accept_document ( & self , doc_id : DocId ) -> bool {
589- if let Some ( ref timestamp_filter) = self . timestamp_filter_opt {
590- return timestamp_filter. is_within_range ( doc_id) ;
591- }
592- true
722+ fn collect_top_k ( & mut self , doc_id : DocId , score : Score ) {
723+ let ( sort_value, sort_value2) : ( Option < u64 > , Option < u64 > ) =
724+ self . score_extractor . extract_typed_sort_value ( doc_id, score) ;
725+ Self :: collect_top_k_vals (
726+ doc_id,
727+ sort_value,
728+ sort_value2,
729+ & self . search_after ,
730+ self . precomp_search_after_order ,
731+ & mut self . top_k_hits ,
732+ ) ;
593733 }
594734}
595735
@@ -635,17 +775,72 @@ impl SegmentPartialHit {
635775 }
636776}
637777
778+ pub use tantivy:: COLLECT_BLOCK_BUFFER_LEN ;
779+ /// Store the filtered docs in `filtered_docs_buffer` if `timestamp_filter_opt` is present.
780+ ///
781+ /// Returns the number of docs.
782+ ///
783+ /// Ideally we would return just final docs slice, but we can't do that because of the borrow
784+ /// checker.
785+ fn compute_filtered_block < ' a > (
786+ timestamp_filter_opt : & Option < TimestampFilter > ,
787+ docs : & ' a [ DocId ] ,
788+ filtered_docs_buffer : & ' a mut [ DocId ; COLLECT_BLOCK_BUFFER_LEN ] ,
789+ ) -> & ' a [ DocId ] {
790+ let Some ( timestamp_filter) = & timestamp_filter_opt else {
791+ return docs;
792+ } ;
793+ let mut len = 0 ;
794+ for & doc in docs {
795+ filtered_docs_buffer[ len] = doc;
796+ len += if timestamp_filter. is_within_range ( doc) {
797+ 1
798+ } else {
799+ 0
800+ } ;
801+ }
802+ & filtered_docs_buffer[ ..len]
803+ }
804+
638805impl SegmentCollector for QuickwitSegmentCollector {
639806 type Fruit = tantivy:: Result < LeafSearchResponse > ;
640807
808+ #[ inline]
809+ fn collect_block ( & mut self , unfiltered_docs : & [ DocId ] ) {
810+ let filtered_docs: & [ DocId ] = compute_filtered_block (
811+ & self . timestamp_filter_opt ,
812+ unfiltered_docs,
813+ & mut self . filtered_docs ,
814+ ) ;
815+
816+ // Update results
817+ self . num_hits += filtered_docs. len ( ) as u64 ;
818+
819+ if let Some ( segment_top_k_collector) = self . segment_top_k_collector . as_mut ( ) {
820+ segment_top_k_collector. collect_top_k_block ( filtered_docs) ;
821+ }
822+
823+ match self . aggregation . as_mut ( ) {
824+ Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector ( collector) ) => {
825+ collector. collect_block ( filtered_docs)
826+ }
827+ Some ( AggregationSegmentCollectors :: TantivyAggregationSegmentCollector ( collector) ) => {
828+ collector. collect_block ( filtered_docs)
829+ }
830+ None => ( ) ,
831+ }
832+ }
833+
641834 #[ inline]
642835 fn collect ( & mut self , doc_id : DocId , score : Score ) {
643836 if !self . accept_document ( doc_id) {
644837 return ;
645838 }
646839
647840 self . num_hits += 1 ;
648- self . collect_top_k ( doc_id, score) ;
841+ if let Some ( segment_top_k_collector) = self . segment_top_k_collector . as_mut ( ) {
842+ segment_top_k_collector. collect_top_k ( doc_id, score) ;
843+ }
649844
650845 match self . aggregation . as_mut ( ) {
651846 Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector ( collector) ) => {
@@ -659,19 +854,23 @@ impl SegmentCollector for QuickwitSegmentCollector {
659854 }
660855
661856 fn harvest ( self ) -> Self :: Fruit {
662- let partial_hits: Vec < PartialHit > = self
663- . top_k_hits
664- . finalize ( )
665- . into_iter ( )
666- . map ( |segment_partial_hit : SegmentPartialHit | {
667- segment_partial_hit. into_partial_hit (
668- self . split_id . clone ( ) ,
669- self . segment_ord ,
670- & self . score_extractor . first ,
671- & self . score_extractor . second ,
672- )
673- } )
674- . collect ( ) ;
857+ let mut partial_hits: Vec < PartialHit > = Vec :: new ( ) ;
858+ if let Some ( segment_top_k_collector) = self . segment_top_k_collector {
859+ // TODO put that in a method of segment_top_k_collector
860+ partial_hits = segment_top_k_collector
861+ . top_k_hits
862+ . finalize ( )
863+ . into_iter ( )
864+ . map ( |segment_partial_hit : SegmentPartialHit | {
865+ segment_partial_hit. into_partial_hit (
866+ segment_top_k_collector. split_id . clone ( ) ,
867+ segment_top_k_collector. segment_ord ,
868+ & segment_top_k_collector. score_extractor . first ,
869+ & segment_top_k_collector. score_extractor . second ,
870+ )
871+ } )
872+ . collect ( ) ;
873+ }
675874
676875 let intermediate_aggregation_result = match self . aggregation {
677876 Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector ( collector) ) => {
@@ -897,16 +1096,28 @@ impl Collector for QuickwitCollector {
8971096 // Convert search_after into fast field u64
8981097 let search_after =
8991098 SearchAfterSegment :: new ( self . search_after . clone ( ) , order1, order2, & score_extractor) ;
1099+
1100+ let segment_top_k_collector = if leaf_max_hits == 0 {
1101+ None
1102+ } else {
1103+ Some ( QuickwitSegmentTopKCollector {
1104+ split_id : self . split_id . clone ( ) ,
1105+ score_extractor,
1106+ top_k_hits : TopK :: new ( leaf_max_hits, sort_key_mapper) ,
1107+ segment_ord,
1108+ search_after,
1109+ precomp_search_after_order,
1110+ sort_values1 : Box :: new ( [ None ; 64 ] ) ,
1111+ sort_values2 : Box :: new ( [ None ; 64 ] ) ,
1112+ } )
1113+ } ;
1114+
9001115 Ok ( QuickwitSegmentCollector {
901- num_hits : 0u64 ,
902- split_id : self . split_id . clone ( ) ,
903- score_extractor,
904- top_k_hits : TopK :: new ( leaf_max_hits, sort_key_mapper) ,
905- segment_ord,
1116+ num_hits : 0 ,
9061117 timestamp_filter_opt,
1118+ segment_top_k_collector,
9071119 aggregation,
908- search_after,
909- precomp_search_after_order,
1120+ filtered_docs : Box :: new ( [ 0 ; 64 ] ) ,
9101121 } )
9111122 }
9121123
0 commit comments