diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 399bd8d1a2f5..95ec4699e905 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -270,6 +270,8 @@ Optimizations * GITHUB#15498: ExitableDirectoryReader keeps singleton SortedSetDocValues or SortedNumericDocValues as singletons. (Houston Putman) +* GITHUB#15511: Dynamic pruning for SORTED(_SET) fields with doc values skipper (Pan Guixin) + Bug Fixes --------------------- * GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException diff --git a/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java b/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java index 72066bc5f3e7..b3b955978d10 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DocValuesRangeIterator.java @@ -197,8 +197,7 @@ protected Match match(int level) { @Override public final boolean matches() throws IOException { return switch (approximation.match) { - case YES -> true; - case IF_DOC_HAS_VALUE -> true; + case YES, IF_DOC_HAS_VALUE -> true; case MAYBE -> innerTwoPhase.matches(); case NO -> throw new IllegalStateException("Unpositioned approximation"); }; diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index 6318cac5276f..f02e9d4b3ae1 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -19,6 +19,8 @@ import java.io.IOException; import java.util.ArrayDeque; import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipIndexType; +import org.apache.lucene.index.DocValuesSkipper; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReaderContext; @@ -28,11 +30,13 @@ import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.AbstractDocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesRangeIterator; import org.apache.lucene.search.FieldComparator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.LeafFieldComparator; import org.apache.lucene.search.Pruning; import org.apache.lucene.search.Scorable; +import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.PriorityQueue; @@ -246,6 +250,8 @@ private class TermOrdValLeafComparator implements LeafFieldComparator { } final boolean enableSkipping; + Terms terms = null; + DocValuesSkipper skipper = null; if (canSkipDocuments == false) { dense = false; enableSkipping = false; @@ -257,31 +263,41 @@ private class TermOrdValLeafComparator implements LeafFieldComparator { } dense = false; enableSkipping = true; - } else if (fieldInfo.getIndexOptions() == IndexOptions.NONE) { - // No terms index + } else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) { + terms = context.reader().terms(field); + dense = terms != null && terms.getDocCount() == context.reader().maxDoc(); + enableSkipping = shouldEnableSkipping(dense); + } else if (fieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) { + skipper = context.reader().getDocValuesSkipper(field); + dense = skipper != null && skipper.docCount() == context.reader().maxDoc(); + enableSkipping = shouldEnableSkipping(dense); + } else { dense = false; enableSkipping = false; - } else { - Terms terms = context.reader().terms(field); - dense = terms != null && terms.getDocCount() == context.reader().maxDoc(); - if (dense || topValue != null) { - enableSkipping = true; - } else if (reverse == sortMissingLast) { - // Missing values are always competitive, we can never skip - enableSkipping = false; - } else { - enableSkipping = true; - } } } - if (enableSkipping) { - competitiveState = new CompetitiveState(context, field, dense, values.termsEnum()); + if (enableSkipping && terms != null) { + competitiveState = + new PostingsBasedCompetitiveState(context, field, dense, values.termsEnum()); + } else if (enableSkipping && skipper != null) { + competitiveState = new SkipperBasedCompetitiveState(context, skipper); } else { competitiveState = null; } updateCompetitiveIterator(); } + private boolean shouldEnableSkipping(boolean dense) { + if (dense || topValue != null) { + return true; + } else if (reverse == sortMissingLast) { + // Missing values are always competitive, we can never skip + return false; + } else { + return true; + } + } + private int getOrdForDoc(int doc) throws IOException { if (termsIndex.advanceExact(doc)) { return termsIndex.ordValue(); @@ -477,7 +493,18 @@ public DocIdSetIterator competitiveIterator() { private record PostingsEnumAndOrd(PostingsEnum postings, int ord) {} - private class CompetitiveState { + private abstract class CompetitiveState { + final UpdateableDocIdSetIterator iterator; + + CompetitiveState(LeafReaderContext context) { + iterator = new UpdateableDocIdSetIterator(); + iterator.update(DocIdSetIterator.all(context.reader().maxDoc())); + } + + abstract void update(int minOrd, int maxOrd) throws IOException; + } + + private class PostingsBasedCompetitiveState extends CompetitiveState { private static final int MAX_TERMS = 1024; @@ -490,10 +517,9 @@ private class CompetitiveState { private PriorityQueue disjunction; private final DocIdSetIterator disjunctionIterator; - private final UpdateableDocIdSetIterator iterator; - - CompetitiveState( + PostingsBasedCompetitiveState( LeafReaderContext context, String field, boolean dense, TermsEnum docValuesTerms) { + super(context); this.context = context; this.field = field; this.dense = dense; @@ -529,15 +555,14 @@ public long cost() { return cost; } }; - this.iterator = new UpdateableDocIdSetIterator(); - this.iterator.update(DocIdSetIterator.all(context.reader().maxDoc())); } /** * Update this iterator to only match postings whose term has an ordinal between {@code minOrd} * included and {@code maxOrd} included. */ - private void update(int minOrd, int maxOrd) throws IOException { + @Override + public void update(int minOrd, int maxOrd) throws IOException { final int maxTerms = Math.min(MAX_TERMS, IndexSearcher.getMaxClauseCount()); final int size = Math.max(0, maxOrd - minOrd + 1); if (size > maxTerms) { @@ -599,4 +624,42 @@ private void init(int minOrd, int maxOrd) throws IOException { disjunction.addAll(postings); } } + + private class SkipperBasedCompetitiveState extends CompetitiveState { + private final DocValuesSkipper skipper; + private final TwoPhaseIterator innerTwoPhase; + private int minOrd; + private int maxOrd; + + SkipperBasedCompetitiveState(LeafReaderContext context, DocValuesSkipper skipper) + throws IOException { + super(context); + this.skipper = skipper; + this.iterator.update(DocIdSetIterator.all(context.reader().maxDoc())); + final SortedDocValues docValues = getSortedDocValues(context, field); + this.innerTwoPhase = + new TwoPhaseIterator(docValues) { + @Override + public boolean matches() throws IOException { + final int cur = docValues.ordValue(); + return cur >= minOrd && cur <= maxOrd; + } + + @Override + public float matchCost() { + return 2; + } + }; + } + + @Override + public void update(int minOrd, int maxOrd) throws IOException { + this.minOrd = minOrd; + this.maxOrd = maxOrd; + + final TwoPhaseIterator twoPhaseIterator = + new DocValuesRangeIterator(innerTwoPhase, skipper, minOrd, maxOrd, false); + iterator.update(TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator)); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java index 79b8ed6d6667..bdbb15867728 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java @@ -35,6 +35,7 @@ import org.apache.lucene.document.LongField; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; @@ -1000,14 +1001,23 @@ private void assertNonCompetitiveHitsAreSkipped(long collectedHits, long numDocs } } - public void testStringSortOptimization() throws IOException { + public void testStringSortOptimizationBasedPostings() throws IOException { + testStringSortOptimization((field, value) -> new KeywordField(field, value, Field.Store.NO)); + } + + public void testStringSortOptimizationBasedDVSkipper() throws IOException { + testStringSortOptimization(SortedDocValuesField::indexedField); + } + + private void testStringSortOptimization( + BiFunction fieldsBuilder) throws IOException { final Directory dir = newDirectory(); final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig()); final int numDocs = atLeast(10000); for (int i = 0; i < numDocs; ++i) { final Document doc = new Document(); final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000))); - doc.add(new KeywordField("my_field", value, Field.Store.NO)); + doc.add(fieldsBuilder.apply("my_field", value)); writer.addDocument(doc); if (i == 7000) writer.flush(); // multiple segments } @@ -1019,7 +1029,17 @@ public void testStringSortOptimization() throws IOException { dir.close(); } - public void testStringSortOptimizationWithMissingValues() throws IOException { + public void testStringSortOptimizationWithMissingValuesBasedPostings() throws IOException { + testStringSortOptimizationWithMissingValues( + (field, value) -> new KeywordField(field, value, Field.Store.NO)); + } + + public void testStringSortOptimizationWithMissingValuesBasedDVSkipper() throws IOException { + testStringSortOptimizationWithMissingValues(SortedDocValuesField::indexedField); + } + + private void testStringSortOptimizationWithMissingValues( + BiFunction fieldsBuilder) throws IOException { final Directory dir = newDirectory(); final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig().setMergePolicy(newLogMergePolicy())); @@ -1031,7 +1051,7 @@ public void testStringSortOptimizationWithMissingValues() throws IOException { final Document doc = new Document(); if (random().nextInt(2) == 0) { final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000))); - doc.add(new KeywordField("my_field", value, Field.Store.NO)); + doc.add(fieldsBuilder.apply("my_field", value)); } writer.addDocument(doc); }