Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ public BatchSummarizer setRatioMetric(final String ratioMetric) {
}

public BatchSummarizer setMaxOrder(final int maxOrder) throws MacroBaseException {
if (maxOrder < 1 || maxOrder > 3) {
if (maxOrder < 1 || maxOrder > 5) {
throw new MacroBaseException("Max Order " + maxOrder +
" cannot be less than 1 or greater than 3");
" cannot be less than 1 or greater than 5");
}
this.maxOrder = maxOrder;
return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ public APrioriLinear(
for (int i = 0; i < thresholds.size(); i++) {
this.thresholds[i] = thresholds.get(i);
}
this.setNext = new HashMap<>(3);
this.savedAggregates = new HashMap<>(3);
this.setNext = new HashMap<>(5);
this.savedAggregates = new HashMap<>(5);
}

public List<APLExplanationResult> explain(
Expand All @@ -60,8 +60,12 @@ public List<APLExplanationResult> explain(
final boolean useIntSetAsArray;
// 2097151 is 2^21 - 1, the largest value that can fit in a length-three IntSetAsLong.
// If the cardinality is greater than that, don't use them.
if (cardinality >= 2097151) {
log.warn("Cardinality is extremely high. Candidate generation will be slow.");
if (cardinality >= 2097151 || maxOrder > 3) {
if (cardinality >= 2097151) {
log.warn("Cardinality is extremely high. Candidate generation will be slow.");
} else {
log.warn("Experimental: generating summaries for order > 3.");
}
useIntSetAsArray = true;
} else{
useIntSetAsArray = false;
Expand Down Expand Up @@ -225,6 +229,100 @@ public List<APLExplanationResult> explain(
}
}
}
} else if (curOrderFinal == 4) {
for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) {
int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne % numColumns];
for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) {
int[] curColumnTwoAttributes = byThreadAttributesTranspose[curThreadNum][colNumTwo % numColumns];
for (int colnumThree = colNumTwo + 1; colnumThree < numColumns; colnumThree++) {
int[] curColumnThreeAttributes = byThreadAttributesTranspose[curThreadNum][colnumThree % numColumns];
for (int colnumFour = colnumThree + 1; colnumFour < numColumns; colnumFour++) {
int[] curColumnFourAttributes = byThreadAttributesTranspose[curThreadNum][colnumFour % numColumns];
for (int rowNum = startIndex; rowNum < endIndex; rowNum++) {
int rowNumInCol = rowNum - startIndex;
// Only construct a 4-tuple if all its singleton members have minimum support.
if (curColumnOneAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnTwoAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnThreeAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnFourAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| !singleNextArray[curColumnFourAttributes[rowNumInCol]]
|| !singleNextArray[curColumnThreeAttributes[rowNumInCol]]
|| !singleNextArray[curColumnOneAttributes[rowNumInCol]]
|| !singleNextArray[curColumnTwoAttributes[rowNumInCol]])
continue;
// Cascade to arrays. Packing to long not supported for order > 3
List<Integer> sorted = new ArrayList<>();
sorted.add(curColumnOneAttributes[rowNumInCol]);
sorted.add(curColumnTwoAttributes[rowNumInCol]);
sorted.add(curColumnThreeAttributes[rowNumInCol]);
sorted.add(curColumnFourAttributes[rowNumInCol]);
Collections.sort(sorted);
curCandidate = new IntSetAsArray(sorted);

double[] candidateVal = thisThreadSetAggregates.get(curCandidate);
if (candidateVal == null) {
thisThreadSetAggregates.put(curCandidate,
Arrays.copyOf(aRows[rowNum], numAggregates));
} else {
for (int a = 0; a < numAggregates; a++) {
candidateVal[a] += aRows[rowNum][a];
}
}
}
}
}
}
}
} else if (curOrderFinal == 5) {
for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) {
int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne % numColumns];
for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) {
int[] curColumnTwoAttributes = byThreadAttributesTranspose[curThreadNum][colNumTwo % numColumns];
for (int colnumThree = colNumTwo + 1; colnumThree < numColumns; colnumThree++) {
int[] curColumnThreeAttributes = byThreadAttributesTranspose[curThreadNum][colnumThree % numColumns];
for (int colnumFour = colnumThree + 1; colnumFour < numColumns; colnumFour++) {
int[] curColumnFourAttributes = byThreadAttributesTranspose[curThreadNum][colnumFour % numColumns];
for (int colnumFive = colnumFour + 1; colnumFive < numColumns; colnumFive++) {
int[] curColumnFiveAttributes = byThreadAttributesTranspose[curThreadNum][colnumFive % numColumns];
for (int rowNum = startIndex; rowNum < endIndex; rowNum++) {
int rowNumInCol = rowNum - startIndex;
// Only construct a 5-tuple if all its singleton members have minimum support.
if (curColumnOneAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnTwoAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnThreeAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnFourAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| curColumnFiveAttributes[rowNumInCol] == AttributeEncoder.noSupport
|| !singleNextArray[curColumnFiveAttributes[rowNumInCol]]
|| !singleNextArray[curColumnFourAttributes[rowNumInCol]]
|| !singleNextArray[curColumnThreeAttributes[rowNumInCol]]
|| !singleNextArray[curColumnOneAttributes[rowNumInCol]]
|| !singleNextArray[curColumnTwoAttributes[rowNumInCol]])
continue;
// Cascade to arrays. Packing to long not supported for order > 3
List<Integer> sorted = new ArrayList<>();
sorted.add(curColumnOneAttributes[rowNumInCol]);
sorted.add(curColumnTwoAttributes[rowNumInCol]);
sorted.add(curColumnThreeAttributes[rowNumInCol]);
sorted.add(curColumnFourAttributes[rowNumInCol]);
sorted.add(curColumnFiveAttributes[rowNumInCol]);
Collections.sort(sorted);
curCandidate = new IntSetAsArray(sorted);

double[] candidateVal = thisThreadSetAggregates.get(curCandidate);
if (candidateVal == null) {
thisThreadSetAggregates.put(curCandidate,
Arrays.copyOf(aRows[rowNum], numAggregates));
} else {
for (int a = 0; a < numAggregates; a++) {
candidateVal[a] += aRows[rowNum][a];
}
}
}
}
}
}
}
}
} else {
throw new MacroBaseInternalError("High Order not supported");
}
Expand Down Expand Up @@ -292,7 +390,8 @@ public List<APLExplanationResult> explain(
}
if (action == QualityMetric.Action.KEEP) {
// Make sure the candidate isn't already covered by a pair
if (curOrder != 3 || validateCandidate(curCandidate, setNext.get(2))) {
if (curOrder < 3 || curOrder == 3 && validateCandidateO3(curCandidate, setNext.get(2))
|| curOrder == 4 && validateCandidateO4(curCandidate, setNext.get(3))) {
// if a set is already past the threshold on all metrics,
// save it and no need for further exploration if we do containment
curOrderSaved.add(curCandidate);
Expand Down Expand Up @@ -344,7 +443,7 @@ public List<APLExplanationResult> explain(
* @param curCandidate An order-3 candidate
* @return Boolean
*/
private boolean validateCandidate(IntSet curCandidate,
private boolean validateCandidateO3(IntSet curCandidate,
HashSet<IntSet> o2Candidates) {
IntSet subPair;
subPair = new IntSetAsArray(
Expand All @@ -365,4 +464,41 @@ private boolean validateCandidate(IntSet curCandidate,
}
return false;
}

/**
 * Decide whether an order-4 candidate is viable: each of its four
 * three-element subsets must be present among the order-3 candidates.
 * @param curCandidate An order-4 candidate
 * @param o3Candidates All candidates of order 3 with minimum support.
 * @return true only when every three-element subset is found in o3Candidates
 */
private boolean validateCandidateO4(IntSet curCandidate,
                                    HashSet<IntSet> o3Candidates) {
    // Subsets are probed one at a time; the first miss rejects the candidate,
    // so later elements are only read when the earlier probes succeeded.
    final IntSet withoutFourth = new IntSetAsArray(
            curCandidate.getFirst(),
            curCandidate.getSecond(),
            curCandidate.getThird());
    if (!o3Candidates.contains(withoutFourth)) {
        return false;
    }

    final IntSet withoutFirst = new IntSetAsArray(
            curCandidate.getSecond(),
            curCandidate.getThird(),
            curCandidate.getFourth());
    if (!o3Candidates.contains(withoutFirst)) {
        return false;
    }

    final IntSet withoutSecond = new IntSetAsArray(
            curCandidate.getFirst(),
            curCandidate.getThird(),
            curCandidate.getFourth());
    if (!o3Candidates.contains(withoutSecond)) {
        return false;
    }

    final IntSet withoutThird = new IntSetAsArray(
            curCandidate.getFirst(),
            curCandidate.getSecond(),
            curCandidate.getFourth());
    return o3Candidates.contains(withoutThird);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ public interface IntSet {
int getFirst();
int getSecond();
int getThird();
int getFourth();
int getFifth();
boolean contains(int query);
Set<Integer> getSet();
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package edu.stanford.futuredata.macrobase.analysis.summary.util;

import java.util.List;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
Expand Down Expand Up @@ -55,6 +56,12 @@ public IntSetAsArray(int a, int b, int c) {
values[2] = c;
}

public IntSetAsArray(List<Integer> list) {
values = new int[list.size()];
for (int i = 0; i < list.size(); i++)
values[i] = list.get(i);
}

/*
* Hand-rolled three-integer sort. Extremely performant and saves a lot of time in the
* apriori/aplinear implementation versus just calling sort.
Expand Down Expand Up @@ -109,6 +116,14 @@ public int getThird() {
return values[2];
}

/**
 * Read the value stored at index 3 of the backing array.
 * Indexing fails with an ArrayIndexOutOfBoundsException when the array
 * holds fewer than four values.
 * @return the fourth stored value
 */
public int getFourth() {
return values[3];
}

/**
 * Read the value stored at index 4 of the backing array.
 * Indexing fails with an ArrayIndexOutOfBoundsException when the array
 * holds fewer than five values.
 * @return the fifth stored value
 */
public int getFifth() {
return values[4];
}

public Set<Integer> getSet() {
HashSet<Integer> curSet = new HashSet<>(values.length);
for (int v : values) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ public int getThird() {
return Math.toIntExact((this.value >>> 42));
}

/**
 * Not supported by this representation: integer packing is only supported
 * up to order 3, so there is no fourth element to read.
 * @return always 0
 */
public int getFourth() { return 0; }

/**
 * Not supported by this representation: integer packing is only supported
 * up to order 3, so there is no fifth element to read.
 * @return always 0
 */
public int getFifth() { return 0; }

/**
* Check if setLong contains queryLong.
* @param query An integer.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import com.google.common.collect.Lists;
import edu.stanford.futuredata.macrobase.analysis.MBFunction;
import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer;
import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGrowthSummarizer;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.datamodel.Schema.ColType;
import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameParser;
Expand Down Expand Up @@ -185,6 +186,14 @@ private DataFrame executeDiffQuerySpec(final DiffQuerySpecification diffQuery)
.setOutlierColumn(outlierColName)
.setAttributes(explainCols)
.setNumThreads(numThreads);
// FP growth summarizer: need to update output format
// FPGrowthSummarizer summarizer = new FPGrowthSummarizer();
// summarizer.setRatioMetric(ratioMetric)
// .setMaxOrder(order)
// .setMinSupport(minSupport)
// .setMinRatioMetric(minRatioMetric)
// .setOutlierColumn(outlierColName)
// .setAttributes(explainCols);

try {
summarizer.process(dfToExplain);
Expand Down