
Commit 464a703

add ml code
1 parent adbb16c commit 464a703

26 files changed: +2203 -79 lines changed
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
0 128:51 129:159 130:253
1 159:124 160:253 161:255
1 125:145 126:255 127:211
1 153:5 154:63 155:197 181:20
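
These rows are in LIBSVM sparse format: a numeric label followed by space-separated index:value pairs for the non-zero features only. A minimal Scala sketch of how such a file is read in this repo's Spark 1.x style, assuming an existing SparkContext named sc and assuming this snippet belongs to the sample_libsvm_data.txt input that DTSuite (further down in this commit) loads:

import org.apache.spark.mllib.util.MLUtils

// loadLibSVMFile parses "<label> <index>:<value> ..." lines into an RDD[LabeledPoint];
// the 1-based indices in the file become 0-based positions in each sparse feature vector.
val examples = MLUtils.loadLibSVMFile(sc, "file/data/mllib/input/basic/sample_libsvm_data.txt")
examples.take(2).foreach(println)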
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
//package org.apache.spark.ml;
//
//import java.util.Arrays;
//
//import org.apache.spark.SparkConf;
//import org.apache.spark.api.java.JavaRDD;
//import org.apache.spark.api.java.JavaSparkContext;
//import org.apache.spark.ml.classification.LogisticRegression;
//import org.apache.spark.ml.classification.LogisticRegressionModel;
//import org.apache.spark.ml.param.ParamMap;
//import org.apache.spark.mllib.linalg.Vectors;
//import org.apache.spark.mllib.regression.LabeledPoint;
//import org.apache.spark.sql.DataFrame;
//import org.apache.spark.sql.Row;
//import org.apache.spark.sql.SQLContext;
//
///**
// *
// * @author xingyun.xb
// * @version $Id: ExampleETP.java, v 0.1 2016-07-23 17:58 xingyun.xb Exp $
// */
//public class ExampleETP {
//    public static void main(String[] args) {
//
//        SparkConf conf = new SparkConf().setAppName("Simple Application").setMaster("local[4]");
//        JavaSparkContext sc = new JavaSparkContext(conf);
//        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
//
//        // Prepare training data.
//        // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
//        // into DataFrames, where it uses the bean metadata to infer the schema.
//        DataFrame training = sqlContext
//            .createDataFrame(sc.parallelize(Arrays.asList(new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
//                new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
//                new LabeledPoint(0.0, Vectors.dense(2.0, 1.3, 1.0)),
//                new LabeledPoint(1.0, Vectors.dense(0.0, 1.2, -0.5)))), LabeledPoint.class);
//
//        // Create a LogisticRegression instance. This instance is an Estimator.
//        LogisticRegression lr = new LogisticRegression();
//        // Print out the parameters, documentation, and any default values.
//        System.out.println("LogisticRegression parameters:\n" + lr.explainParams() + "\n");
//
//        // We may set parameters using setter methods.
//        lr.setMaxIter(10).setRegParam(0.01);
//
//        // Learn a LogisticRegression model. This uses the parameters stored in lr.
//        LogisticRegressionModel model1 = lr.fit(training);
//        // Since model1 is a Model (i.e., a Transformer produced by an Estimator),
//        // we can view the parameters it used during fit().
//        // This prints the parameter (name: value) pairs, where names are unique IDs for this
//        // LogisticRegression instance.
//        System.out
//            .println("Model 1 was fit using parameters: " + model1.parent().extractParamMap());
//
//        // We may alternatively specify parameters using a ParamMap.
//        ParamMap paramMap = new ParamMap().put(lr.maxIter().w(20)) // Specify 1 Param.
//            .put(lr.maxIter(), 30) // This overwrites the original maxIter.
//            .put(lr.regParam().w(0.1), lr.threshold().w(0.55)); // Specify multiple Params.
//
//        // One can also combine ParamMaps.
//        ParamMap paramMap2 = new ParamMap().put(lr.probabilityCol().w("myProbability")); // Change output column name
//        ParamMap paramMapCombined = paramMap.$plus$plus(paramMap2);
//
//        // Now learn a new model using the paramMapCombined parameters.
//        // paramMapCombined overrides all parameters set earlier via lr.set* methods.
//        LogisticRegressionModel model2 = lr.fit(training, paramMapCombined);
//        System.out
//            .println("Model 2 was fit using parameters: " + model2.parent().extractParamMap());
//
//        // Prepare test documents.
//        DataFrame test = sqlContext
//            .createDataFrame(sc.parallelize(Arrays.asList(new LabeledPoint(1.0, Vectors.dense(-1.0, 1.5, 1.3)),
//                new LabeledPoint(0.0, Vectors.dense(3.0, 2.0, -0.1)),
//                new LabeledPoint(1.0, Vectors.dense(0.0, 2.2, -1.5)))), LabeledPoint.class);
//
//        // Make predictions on test documents using the Transformer.transform() method.
//        // LogisticRegression.transform will only use the 'features' column.
//        // Note that model2.transform() outputs a 'myProbability' column instead of the usual
//        // 'probability' column since we renamed the lr.probabilityCol parameter previously.
//        DataFrame results = model2.transform(test);
//        for (Row r : results.select("features", "label", "myProbability", "prediction").collect()) {
//            System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
//                + ", prediction=" + r.get(3));
//        }
//    }
//}
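
The commented-out class above walks through the Spark 1.x Estimator/Transformer/ParamMap pattern in Java. For reference, a minimal Scala sketch of the same ParamMap idea, assuming a Spark 1.6-style sqlContext is already in scope; the toy rows mirror the Java example:

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.linalg.Vectors

// Same toy training rows as the Java example, with explicit column names.
val training = sqlContext.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 1.1, 0.1)),
  (0.0, Vectors.dense(2.0, 1.0, -1.0)),
  (0.0, Vectors.dense(2.0, 1.3, 1.0)),
  (1.0, Vectors.dense(0.0, 1.2, -0.5))
)).toDF("label", "features")

val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
// Values passed via a ParamMap at fit() time override the setter calls above.
val paramMap = ParamMap(lr.maxIter -> 30).put(lr.regParam -> 0.1, lr.threshold -> 0.55)
val model = lr.fit(training, paramMap)
println("Model was fit using parameters: " + model.parent.extractParamMap)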
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
//package org.apache.spark.ml.Example;
//
//import org.apache.spark.SparkConf;
//import org.apache.spark.api.java.JavaRDD;
//import org.apache.spark.api.java.JavaSparkContext;
//import org.apache.spark.rdd.RDD;
//import org.apache.spark.sql.SQLContext;
//
//import java.io.Serializable;
//import java.util.Arrays;
//import java.util.List;
//
//import org.apache.spark.ml.Pipeline;
//import org.apache.spark.ml.PipelineModel;
//import org.apache.spark.ml.PipelineStage;
//import org.apache.spark.ml.classification.LogisticRegression;
//import org.apache.spark.ml.feature.HashingTF;
//import org.apache.spark.ml.feature.Tokenizer;
//import org.apache.spark.sql.DataFrame;
//import org.apache.spark.sql.Row;
///**
// *
// * @author xingyun.xb
// * @version $Id: PipelineLearning.java, v 0.1 2016-07-23 18:10 xingyun.xb Exp $
// */
//public class PipelineLearning {
//    public static void main(String[] args) {
//        SparkConf conf = new SparkConf().setAppName("Simple Application").setMaster("local[4]");
//        JavaSparkContext sc = new JavaSparkContext(conf);
//        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
//
//        // Prepare training documents, which are labeled.
//        DataFrame training = sqlContext.createDataFrame(sc.parallelize(Arrays.asList(
//            new LabeledDocument(0L, "a b c d e spark", 1.0),
//            new LabeledDocument(1L, "b d", 0.0),
//            new LabeledDocument(2L, "spark f g h", 1.0),
//            new LabeledDocument(3L, "hadoop mapreduce", 0.0)
//        )), LabeledDocument.class);
//
//        // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
//        Tokenizer tokenizer = new Tokenizer()
//            .setInputCol("text")
//            .setOutputCol("words");
//        HashingTF hashingTF = new HashingTF()
//            .setNumFeatures(1000)
//            .setInputCol(tokenizer.getOutputCol())
//            .setOutputCol("features");
//        LogisticRegression lr = new LogisticRegression()
//            .setMaxIter(10)
//            .setRegParam(0.01);
//        Pipeline pipeline = new Pipeline()
//            .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
//
//        // Fit the pipeline to training documents.
//        PipelineModel model = pipeline.fit(training);
//
//        // Prepare test documents, which are unlabeled.
//        DataFrame test = sqlContext.createDataFrame(sc.parallelize(Arrays.asList(
//            new Document(4L, "spark i j k"),
//            new Document(5L, "l m n"),
//            new Document(6L, "mapreduce spark"),
//            new Document(7L, "apache hadoop")
//        )), Document.class);
//
//        // Make predictions on test documents.
//        DataFrame predictions = model.transform(test);
//        for (Row r : predictions.select("id", "text", "probability", "prediction").collect()) {
//            System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
//                + ", prediction=" + r.get(3));
//        }
//    }
//}
//
//// Labeled and unlabeled instance types.
//// Spark SQL can infer schema from Java Beans.
//class Document implements Serializable {
//    private long id;
//    private String text;
//
//    public Document(long id, String text) {
//        this.id = id;
//        this.text = text;
//    }
//
//    public long getId() { return this.id; }
//    public void setId(long id) { this.id = id; }
//
//    public String getText() { return this.text; }
//    public void setText(String text) { this.text = text; }
//}
//
//class LabeledDocument extends Document implements Serializable {
//    private double label;
//
//    public LabeledDocument(long id, String text, double label) {
//        super(id, text);
//        this.label = label;
//    }
//
//    public double getLabel() { return this.label; }
//    public void setLabel(double label) { this.label = label; }
//}
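
For comparison, the same Tokenizer / HashingTF / LogisticRegression pipeline as a minimal Scala sketch, again assuming a Spark 1.6-style sqlContext; naming the columns directly avoids the Document and LabeledDocument beans used in the Java version:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

// Labeled training documents as (id, text, label).
val training = sqlContext.createDataFrame(Seq(
  (0L, "a b c d e spark", 1.0),
  (1L, "b d", 0.0),
  (2L, "spark f g h", 1.0),
  (3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF()
  .setNumFeatures(1000)
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

// Fitting runs the tokenizer and hashingTF transformers, then trains the logistic regression.
val model = pipeline.fit(training)
model.transform(training).select("id", "text", "probability", "prediction").show()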
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
package org.apache.spark.ml.Example;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;

import java.util.Arrays;
import java.util.List;

/**
 *
 * @author xingyun.xb
 * @version $Id: Pipeline.java, v 0.1 2016-07-23 18:06 xingyun.xb Exp $
 */
public class test {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Simple Application").setMaster("local[4]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> distData = sc.parallelize(data);

        System.out.println(distData.count());

        sc.stop();
    }
}
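
The class above is only a smoke test for local RDD creation. An equivalent check in Scala, assuming an existing SparkContext named sc; the reduce line is an added illustration, not part of the committed file:

val distData = sc.parallelize(Seq(1, 2, 3, 4, 5))
println(distData.count())       // expected: 5
println(distData.reduce(_ + _)) // expected: 15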
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
package org.apache.spark.ml.DecisionTrees

import org.apache.log4j.{Level, Logger}
import org.apache.spark.util.SparkLearningFunSuite

/**
 * Created by xingyun.xb on 2016/7/24.
 */
class DTSuite extends SparkLearningFunSuite {

  test("Classification Suite") {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.DecisionTreeClassifier
    import org.apache.spark.ml.classification.DecisionTreeClassificationModel
    import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
    import org.apache.spark.mllib.util.MLUtils

    // Load and parse the data file, converting it to a DataFrame.
    val dataRDD = MLUtils.loadLibSVMFile(sc, "file/data/mllib/input/basic/sample_libsvm_data.txt")
    val data = sqlContext.createDataFrame(dataRDD)

    data.show()

    // Index labels, adding metadata to the label column.
    // Fit on whole dataset to include all labels in index.
    val labelIndexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("indexedLabel")
      .fit(data)
    // Automatically identify categorical features, and index them.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4) // features with > 4 distinct values are treated as continuous
      .fit(data)

    // Split the data into training and test sets (30% held out for testing)
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a DecisionTree model.
    val dt = new DecisionTreeClassifier()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")

    // Convert indexed labels back to original labels.
    val labelConverter = new IndexToString()
      .setInputCol("prediction")
      .setOutputCol("predictedLabel")
      .setLabels(labelIndexer.labels)

    // Chain indexers and tree in a Pipeline
    val pipeline = new Pipeline()
      .setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))

    // Train model. This also runs the indexers.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("predictedLabel", "label", "features").show(5)

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("indexedLabel")
      .setPredictionCol("prediction")
      .setMetricName("precision")
    val accuracy = evaluator.evaluate(predictions)
    println("Test Error = " + (1.0 - accuracy))

    val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
    println("Learned classification tree model:\n" + treeModel.toDebugString)
  }

  test("Regression Suite") {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.regression.DecisionTreeRegressor
    import org.apache.spark.ml.regression.DecisionTreeRegressionModel
    import org.apache.spark.ml.feature.VectorIndexer
    import org.apache.spark.ml.evaluation.RegressionEvaluator
    import org.apache.spark.mllib.util.MLUtils

    // Load and parse the data file, converting it to a DataFrame.
    val dataRDD = MLUtils.loadLibSVMFile(sc, "file/data/mllib/input/basic/sample_libsvm_data.txt")
    val data = sqlContext.createDataFrame(dataRDD)

    data.show()
    // Automatically identify categorical features, and index them.
    // Here, we treat features with > 4 distinct values as continuous.
    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(4)
      .fit(data)

    // Split the data into training and test sets (30% held out for testing)
    val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))

    // Train a DecisionTree model.
    val dt = new DecisionTreeRegressor()
      .setLabelCol("label")
      .setFeaturesCol("indexedFeatures")

    // Chain indexer and tree in a Pipeline
    val pipeline = new Pipeline()
      .setStages(Array(featureIndexer, dt))

    // Train model. This also runs the indexer.
    val model = pipeline.fit(trainingData)

    // Make predictions.
    val predictions = model.transform(testData)

    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel]
    println("Learned regression tree model:\n" + treeModel.toDebugString)
  }

  test("Suite") {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.ERROR)
  }

}
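
One portability note on the classification test above: "precision" is the metric name accepted by MulticlassClassificationEvaluator in the Spark 1.x line this suite appears to target; in Spark 2.x the overall-accuracy metric is named "accuracy" instead, so running the suite there would need roughly the following (an assumption about a later Spark version, not part of this commit):

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("indexedLabel")
  .setPredictionCol("prediction")
  .setMetricName("accuracy") // Spark 2.x name; this commit uses the 1.x "precision"
val accuracy = evaluator.evaluate(predictions)
println("Test Error = " + (1.0 - accuracy))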
