diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java index 5b6199ade..ff1a80135 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java @@ -76,9 +76,9 @@ public BatchSummarizer setRatioMetric(final String ratioMetric) { } public BatchSummarizer setMaxOrder(final int maxOrder) throws MacroBaseException { - if (maxOrder < 1 || maxOrder > 3) { + if (maxOrder < 1 || maxOrder > 5) { throw new MacroBaseException("Max Order " + maxOrder + - " cannot be less than 1 or greater than 3"); + " cannot be less than 1 or greater than 5"); } this.maxOrder = maxOrder; return this; diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java index 52b316bcf..4e695e43e 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java @@ -40,8 +40,8 @@ public APrioriLinear( for (int i = 0; i < thresholds.size(); i++) { this.thresholds[i] = thresholds.get(i); } - this.setNext = new HashMap<>(3); - this.savedAggregates = new HashMap<>(3); + this.setNext = new HashMap<>(5); + this.savedAggregates = new HashMap<>(5); } public List explain( @@ -60,8 +60,12 @@ public List explain( final boolean useIntSetAsArray; // 2097151 is 2^21 - 1, the largest value that can fit in a length-three IntSetAsLong. // If the cardinality is greater than that, don't use them. - if (cardinality >= 2097151) { - log.warn("Cardinality is extremely high. 
Candidate generation will be slow."); + if (cardinality >= 2097151 || maxOrder > 3) { + if (cardinality >= 2097151) { + log.warn("Cardinality is extremely high. Candidate generation will be slow."); + } else { + log.warn("Experimental: generating summaries for order > 3."); + } useIntSetAsArray = true; } else{ useIntSetAsArray = false; @@ -225,6 +229,100 @@ public List explain( } } } + } else if (curOrderFinal == 4) { + for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) { + int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne % numColumns]; + for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) { + int[] curColumnTwoAttributes = byThreadAttributesTranspose[curThreadNum][colNumTwo % numColumns]; + for (int colnumThree = colNumTwo + 1; colnumThree < numColumns; colnumThree++) { + int[] curColumnThreeAttributes = byThreadAttributesTranspose[curThreadNum][colnumThree % numColumns]; + for (int colnumFour = colnumThree + 1; colnumFour < numColumns; colnumFour++) { + int[] curColumnFourAttributes = byThreadAttributesTranspose[curThreadNum][colnumFour % numColumns]; + for (int rowNum = startIndex; rowNum < endIndex; rowNum++) { + int rowNumInCol = rowNum - startIndex; + // Only construct an order-4 candidate if all its singleton members have minimum support. + if (curColumnOneAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnTwoAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnThreeAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnFourAttributes[rowNumInCol] == AttributeEncoder.noSupport + || !singleNextArray[curColumnFourAttributes[rowNumInCol]] + || !singleNextArray[curColumnThreeAttributes[rowNumInCol]] + || !singleNextArray[curColumnOneAttributes[rowNumInCol]] + || !singleNextArray[curColumnTwoAttributes[rowNumInCol]]) + continue; + // Cascade to arrays. 
Packing to long not supported for order > 3 + List sorted = new ArrayList<>(); + sorted.add(curColumnOneAttributes[rowNumInCol]); + sorted.add(curColumnTwoAttributes[rowNumInCol]); + sorted.add(curColumnThreeAttributes[rowNumInCol]); + sorted.add(curColumnFourAttributes[rowNumInCol]); + Collections.sort(sorted); + curCandidate = new IntSetAsArray(sorted); + + double[] candidateVal = thisThreadSetAggregates.get(curCandidate); + if (candidateVal == null) { + thisThreadSetAggregates.put(curCandidate, + Arrays.copyOf(aRows[rowNum], numAggregates)); + } else { + for (int a = 0; a < numAggregates; a++) { + candidateVal[a] += aRows[rowNum][a]; + } + } + } + } + } + } + } + } else if (curOrderFinal == 5) { + for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) { + int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne % numColumns]; + for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) { + int[] curColumnTwoAttributes = byThreadAttributesTranspose[curThreadNum][colNumTwo % numColumns]; + for (int colnumThree = colNumTwo + 1; colnumThree < numColumns; colnumThree++) { + int[] curColumnThreeAttributes = byThreadAttributesTranspose[curThreadNum][colnumThree % numColumns]; + for (int colnumFour = colnumThree + 1; colnumFour < numColumns; colnumFour++) { + int[] curColumnFourAttributes = byThreadAttributesTranspose[curThreadNum][colnumFour % numColumns]; + for (int colnumFive = colnumFour + 1; colnumFive < numColumns; colnumFive++) { + int[] curColumnFiveAttributes = byThreadAttributesTranspose[curThreadNum][colnumFive % numColumns]; + for (int rowNum = startIndex; rowNum < endIndex; rowNum++) { + int rowNumInCol = rowNum - startIndex; + // Only construct an order-5 candidate if all its singleton members have minimum support. 
+ if (curColumnOneAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnTwoAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnThreeAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnFourAttributes[rowNumInCol] == AttributeEncoder.noSupport + || curColumnFiveAttributes[rowNumInCol] == AttributeEncoder.noSupport + || !singleNextArray[curColumnFiveAttributes[rowNumInCol]] + || !singleNextArray[curColumnFourAttributes[rowNumInCol]] + || !singleNextArray[curColumnThreeAttributes[rowNumInCol]] + || !singleNextArray[curColumnOneAttributes[rowNumInCol]] + || !singleNextArray[curColumnTwoAttributes[rowNumInCol]]) + continue; + // Cascade to arrays. Packing to long not supported for order > 3 + List sorted = new ArrayList<>(); + sorted.add(curColumnOneAttributes[rowNumInCol]); + sorted.add(curColumnTwoAttributes[rowNumInCol]); + sorted.add(curColumnThreeAttributes[rowNumInCol]); + sorted.add(curColumnFourAttributes[rowNumInCol]); + sorted.add(curColumnFiveAttributes[rowNumInCol]); + Collections.sort(sorted); + curCandidate = new IntSetAsArray(sorted); + + double[] candidateVal = thisThreadSetAggregates.get(curCandidate); + if (candidateVal == null) { + thisThreadSetAggregates.put(curCandidate, + Arrays.copyOf(aRows[rowNum], numAggregates)); + } else { + for (int a = 0; a < numAggregates; a++) { + candidateVal[a] += aRows[rowNum][a]; + } + } + } + } + } + } + } + } } else { throw new MacroBaseInternalError("High Order not supported"); } @@ -292,7 +390,8 @@ public List explain( } if (action == QualityMetric.Action.KEEP) { // Make sure the candidate isn't already covered by a pair - if (curOrder != 3 || validateCandidate(curCandidate, setNext.get(2))) { + if (curOrder < 3 || curOrder == 3 && validateCandidateO3(curCandidate, setNext.get(2)) + || curOrder == 4 && validateCandidateO4(curCandidate, setNext.get(3))) { // if a set is already past the threshold on all metrics, // save it and no need for further exploration if we do 
containment curOrderSaved.add(curCandidate); @@ -344,7 +443,7 @@ public List explain( * @param curCandidate An order-3 candidate * @return Boolean */ - private boolean validateCandidate(IntSet curCandidate, + private boolean validateCandidateO3(IntSet curCandidate, HashSet o2Candidates) { IntSet subPair; subPair = new IntSetAsArray( @@ -365,4 +464,41 @@ private boolean validateCandidate(IntSet curCandidate, } return false; } + + /** + * Check if all subsets of an order-4 candidate are order-3 candidates. + * @param o3Candidates All candidates of order 3 with minimum support. + * @param curCandidate An order-4 candidate + * @return Boolean + */ + private boolean validateCandidateO4(IntSet curCandidate, + HashSet o3Candidates) { + IntSet subPair; + subPair = new IntSetAsArray( + curCandidate.getFirst(), + curCandidate.getSecond(), + curCandidate.getThird()); + if (o3Candidates.contains(subPair)) { + subPair = new IntSetAsArray( + curCandidate.getSecond(), + curCandidate.getThird(), + curCandidate.getFourth()); + if (o3Candidates.contains(subPair)) { + subPair = new IntSetAsArray( + curCandidate.getFirst(), + curCandidate.getThird(), + curCandidate.getFourth()); + if (o3Candidates.contains(subPair)) { + subPair = new IntSetAsArray( + curCandidate.getFirst(), + curCandidate.getSecond(), + curCandidate.getFourth()); + if (o3Candidates.contains(subPair)) { + return true; + } + } + } + } + return false; + } } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSet.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSet.java index ef359300e..dc6b6c23a 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSet.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSet.java @@ -6,6 +6,8 @@ public interface IntSet { int getFirst(); int getSecond(); int getThird(); + int getFourth(); + int getFifth(); boolean contains(int query); Set 
getSet(); } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsArray.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsArray.java index 4a952f91a..2c1a393ea 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsArray.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsArray.java @@ -1,5 +1,6 @@ package edu.stanford.futuredata.macrobase.analysis.summary.util; +import java.util.List; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -55,6 +56,12 @@ public IntSetAsArray(int a, int b, int c) { values[2] = c; } + public IntSetAsArray(List list) { + values = new int[list.size()]; + for (int i = 0; i < list.size(); i++) + values[i] = list.get(i); + } + /* * Hand-rolled three-integer sort. Extremely performant and saves a lot of time in the * apriori/aplinear implementation versus just calling sort. @@ -109,6 +116,14 @@ public int getThird() { return values[2]; } + public int getFourth() { + return values[3]; + } + + public int getFifth() { + return values[4]; + } + public Set getSet() { HashSet curSet = new HashSet<>(values.length); for (int v : values) { diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsLong.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsLong.java index 539e8700e..92905d629 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsLong.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/IntSetAsLong.java @@ -125,6 +125,16 @@ public int getThird() { return Math.toIntExact((this.value >>> 42)); } + /** + * !! Only support integer packing up to order 3 + */ + public int getFourth() { return 0; } + + /** + * !! 
Only support integer packing up to order 3 + */ + public int getFifth() { return 0; } + /** * Check if setLong contains queryLong. * @param query An integer. diff --git a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java index afbae7557..6f3f87a19 100644 --- a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java +++ b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java @@ -8,6 +8,7 @@ import com.google.common.collect.Lists; import edu.stanford.futuredata.macrobase.analysis.MBFunction; import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer; +import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGrowthSummarizer; import edu.stanford.futuredata.macrobase.datamodel.DataFrame; import edu.stanford.futuredata.macrobase.datamodel.Schema.ColType; import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameParser; @@ -185,6 +186,14 @@ private DataFrame executeDiffQuerySpec(final DiffQuerySpecification diffQuery) .setOutlierColumn(outlierColName) .setAttributes(explainCols) .setNumThreads(numThreads); + // FP growth summarizer: need to update output format +// FPGrowthSummarizer summarizer = new FPGrowthSummarizer(); +// summarizer.setRatioMetric(ratioMetric) +// .setMaxOrder(order) +// .setMinSupport(minSupport) +// .setMinRatioMetric(minRatioMetric) +// .setOutlierColumn(outlierColName) +// .setAttributes(explainCols); try { summarizer.process(dfToExplain);