Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ Optimizations

* GITHUB#15498: ExitableDirectoryReader keeps singleton SortedSetDocValues or SortedNumericDocValues as singletons. (Houston Putman)

* GITHUB#15511: Dynamic pruning for SORTED(_SET) fields with doc values skipper (Pan Guixin)

Bug Fixes
---------------------
* GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,7 @@ protected Match match(int level) {
@Override
public final boolean matches() throws IOException {
return switch (approximation.match) {
case YES -> true;
case IF_DOC_HAS_VALUE -> true;
case YES, IF_DOC_HAS_VALUE -> true;
case MAYBE -> innerTwoPhase.matches();
case NO -> throw new IllegalStateException("Unpositioned approximation");
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import java.io.IOException;
import java.util.ArrayDeque;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesSkipIndexType;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -28,11 +30,13 @@
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AbstractDocIdSetIterator;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesRangeIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Pruning;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
Expand Down Expand Up @@ -246,6 +250,8 @@ private class TermOrdValLeafComparator implements LeafFieldComparator {
}

final boolean enableSkipping;
Terms terms = null;
DocValuesSkipper skipper = null;
if (canSkipDocuments == false) {
dense = false;
enableSkipping = false;
Expand All @@ -257,31 +263,41 @@ private class TermOrdValLeafComparator implements LeafFieldComparator {
}
dense = false;
enableSkipping = true;
} else if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
// No terms index
} else if (fieldInfo.getIndexOptions() != IndexOptions.NONE) {
terms = context.reader().terms(field);
dense = terms != null && terms.getDocCount() == context.reader().maxDoc();
enableSkipping = shouldEnableSkipping(dense);
} else if (fieldInfo.docValuesSkipIndexType() != DocValuesSkipIndexType.NONE) {
skipper = context.reader().getDocValuesSkipper(field);
dense = skipper != null && skipper.docCount() == context.reader().maxDoc();
enableSkipping = shouldEnableSkipping(dense);
} else {
dense = false;
enableSkipping = false;
} else {
Terms terms = context.reader().terms(field);
dense = terms != null && terms.getDocCount() == context.reader().maxDoc();
if (dense || topValue != null) {
enableSkipping = true;
} else if (reverse == sortMissingLast) {
// Missing values are always competitive, we can never skip
enableSkipping = false;
} else {
enableSkipping = true;
}
}
}
if (enableSkipping) {
competitiveState = new CompetitiveState(context, field, dense, values.termsEnum());
if (enableSkipping && terms != null) {
competitiveState =
new PostingsBasedCompetitiveState(context, field, dense, values.termsEnum());
} else if (enableSkipping && skipper != null) {
competitiveState = new SkipperBasedCompetitiveState(context, skipper);
} else {
competitiveState = null;
}
updateCompetitiveIterator();
}

private boolean shouldEnableSkipping(boolean dense) {
if (dense || topValue != null) {
return true;
} else if (reverse == sortMissingLast) {
// Missing values are always competitive, we can never skip
return false;
} else {
return true;
}
}

private int getOrdForDoc(int doc) throws IOException {
if (termsIndex.advanceExact(doc)) {
return termsIndex.ordValue();
Expand Down Expand Up @@ -477,7 +493,18 @@ public DocIdSetIterator competitiveIterator() {

private record PostingsEnumAndOrd(PostingsEnum postings, int ord) {}

private class CompetitiveState {
private abstract class CompetitiveState {
final UpdateableDocIdSetIterator iterator;

CompetitiveState(LeafReaderContext context) {
iterator = new UpdateableDocIdSetIterator();
iterator.update(DocIdSetIterator.all(context.reader().maxDoc()));
}

abstract void update(int minOrd, int maxOrd) throws IOException;
}

private class PostingsBasedCompetitiveState extends CompetitiveState {

private static final int MAX_TERMS = 1024;

Expand All @@ -490,10 +517,9 @@ private class CompetitiveState {
private PriorityQueue<PostingsEnumAndOrd> disjunction;
private final DocIdSetIterator disjunctionIterator;

private final UpdateableDocIdSetIterator iterator;

CompetitiveState(
PostingsBasedCompetitiveState(
LeafReaderContext context, String field, boolean dense, TermsEnum docValuesTerms) {
super(context);
this.context = context;
this.field = field;
this.dense = dense;
Expand Down Expand Up @@ -529,15 +555,14 @@ public long cost() {
return cost;
}
};
this.iterator = new UpdateableDocIdSetIterator();
this.iterator.update(DocIdSetIterator.all(context.reader().maxDoc()));
}

/**
* Update this iterator to only match postings whose term has an ordinal between {@code minOrd}
* included and {@code maxOrd} included.
*/
private void update(int minOrd, int maxOrd) throws IOException {
@Override
public void update(int minOrd, int maxOrd) throws IOException {
final int maxTerms = Math.min(MAX_TERMS, IndexSearcher.getMaxClauseCount());
final int size = Math.max(0, maxOrd - minOrd + 1);
if (size > maxTerms) {
Expand Down Expand Up @@ -599,4 +624,42 @@ private void init(int minOrd, int maxOrd) throws IOException {
disjunction.addAll(postings);
}
}

private class SkipperBasedCompetitiveState extends CompetitiveState {
private final DocValuesSkipper skipper;
private final TwoPhaseIterator innerTwoPhase;
private int minOrd;
private int maxOrd;

SkipperBasedCompetitiveState(LeafReaderContext context, DocValuesSkipper skipper)
throws IOException {
super(context);
this.skipper = skipper;
this.iterator.update(DocIdSetIterator.all(context.reader().maxDoc()));
final SortedDocValues docValues = getSortedDocValues(context, field);
this.innerTwoPhase =
new TwoPhaseIterator(docValues) {
@Override
public boolean matches() throws IOException {
final int cur = docValues.ordValue();
return cur >= minOrd && cur <= maxOrd;
}

@Override
public float matchCost() {
return 2;
}
};
}

@Override
public void update(int minOrd, int maxOrd) throws IOException {
this.minOrd = minOrd;
this.maxOrd = maxOrd;

final TwoPhaseIterator twoPhaseIterator =
new DocValuesRangeIterator(innerTwoPhase, skipper, minOrd, maxOrd, false);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing that I've been meaning to test with numeric comparators but haven't got round to yet: I wonder if its worth replacing this usage of DocValuesRangeIterator with something that only advances to skip-block boundaries? My reasoning being that if the parent query is reasonably sparse, then calling advance() on this competitive iterator might end up doing a bunch of work to find the precise first matching value within a block that is then immediately thrown away when we leapfrog back and call advance() on the parent iterator. Whereas if we just advance to the first competitive block then we can let the parent query do the work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had the same thought. When I implemented this, I considered whether approximation could serve as a competitive iterator, but I haven't figured out when to do that.

iterator.update(TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
Expand Down Expand Up @@ -1000,14 +1001,23 @@ private void assertNonCompetitiveHitsAreSkipped(long collectedHits, long numDocs
}
}

public void testStringSortOptimization() throws IOException {
public void testStringSortOptimizationBasedPostings() throws IOException {
testStringSortOptimization((field, value) -> new KeywordField(field, value, Field.Store.NO));
}

public void testStringSortOptimizationBasedDVSkipper() throws IOException {
testStringSortOptimization(SortedDocValuesField::indexedField);
}

private void testStringSortOptimization(
BiFunction<String, BytesRef, IndexableField> fieldsBuilder) throws IOException {
final Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
final int numDocs = atLeast(10000);
for (int i = 0; i < numDocs; ++i) {
final Document doc = new Document();
final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
doc.add(new KeywordField("my_field", value, Field.Store.NO));
doc.add(fieldsBuilder.apply("my_field", value));
writer.addDocument(doc);
if (i == 7000) writer.flush(); // multiple segments
}
Expand All @@ -1019,7 +1029,17 @@ public void testStringSortOptimization() throws IOException {
dir.close();
}

public void testStringSortOptimizationWithMissingValues() throws IOException {
public void testStringSortOptimizationWithMissingValuesBasedPostings() throws IOException {
testStringSortOptimizationWithMissingValues(
(field, value) -> new KeywordField(field, value, Field.Store.NO));
}

public void testStringSortOptimizationWithMissingValuesBasedDVSkipper() throws IOException {
testStringSortOptimizationWithMissingValues(SortedDocValuesField::indexedField);
}

private void testStringSortOptimizationWithMissingValues(
BiFunction<String, BytesRef, IndexableField> fieldsBuilder) throws IOException {
final Directory dir = newDirectory();
final IndexWriter writer =
new IndexWriter(dir, new IndexWriterConfig().setMergePolicy(newLogMergePolicy()));
Expand All @@ -1031,7 +1051,7 @@ public void testStringSortOptimizationWithMissingValues() throws IOException {
final Document doc = new Document();
if (random().nextInt(2) == 0) {
final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
doc.add(new KeywordField("my_field", value, Field.Store.NO));
doc.add(fieldsBuilder.apply("my_field", value));
}
writer.addDocument(doc);
}
Expand Down