# Evaluate a decision tree on the iris dataset with 10-fold cross-validation.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# random_state is pinned so repeated runs build the same trees.
clf = DecisionTreeClassifier(random_state=0)
iris = load_iris()
# cv=10 requests ten folds. The returned scores are not stored, so they are
# only visible when this line is evaluated in an interactive session.
cross_val_score(clf, iris.data, iris.target, cv=10)
# Fit a decision tree classifier on the complete iris dataset.
from sklearn.datasets import load_iris
from sklearn import tree

# return_X_y=True yields the feature matrix and the target vector directly.
X, y = load_iris(return_X_y=True)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
# Decision tree learning algorithm for classification
#
# NOTE(review): `spark` and `temp_path` are not defined in this snippet;
# presumably an active SparkSession and a writable directory provided by the
# surrounding session -- confirm before running standalone.
# The trailing `# ...` comments record the value each expression is expected
# to produce.

from pyspark.ml.classification import (
    DecisionTreeClassificationModel,
    DecisionTreeClassifier,
)
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame(
    [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ],
    ["label", "features"],
)

# Index the raw labels into the "indexed" column that the classifier reads.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(df)
td = si_model.transform(df)

dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed")
model = dt.fit(td)
model.numNodes
# 3
model.depth
# 1
model.featureImportances
# SparseVector(1, {0: 1.0})
model.numFeatures
# 1
model.numClasses
# 2
print(model.toDebugString)
# DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes...

# Score a single dense feature vector.
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
result = model.transform(test0).head()
result.prediction
# 0.0
result.probability
# DenseVector([1.0, 0.0])
result.rawPrediction
# DenseVector([1.0, 0.0])

# Score a single sparse feature vector (each row is a one-element tuple).
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
model.transform(test1).head().prediction
# 1.0

# Round-trip the estimator through save/load and check its parameter.
dtc_path = temp_path + "/dtc"
dt.save(dtc_path)
dt2 = DecisionTreeClassifier.load(dtc_path)
dt2.getMaxDepth()
# 2

# Round-trip the fitted model and compare it with the in-memory one.
model_path = temp_path + "/dtc_model"
model.save(model_path)
model2 = DecisionTreeClassificationModel.load(model_path)
model.featureImportances == model2.featureImportances
# True
# Decision tree learning algorithm for regression
#
# NOTE(review): `spark` and `temp_path` are not defined in this snippet;
# presumably an active SparkSession and a writable directory provided by the
# surrounding session -- confirm before running standalone.
# The trailing `# ...` comments record the value each expression is expected
# to produce.

from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import (
    DecisionTreeRegressionModel,
    DecisionTreeRegressor,
)

df = spark.createDataFrame(
    [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ],
    ["label", "features"],
)

# varianceCol names the extra output column read back as `.variance` below.
dt = DecisionTreeRegressor(maxDepth=2, varianceCol="variance")
model = dt.fit(df)
model.depth
# 1
model.numNodes
# 3
model.featureImportances
# SparseVector(1, {0: 1.0})
model.numFeatures
# 1

# Score a single dense and a single sparse feature vector.
test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"])
model.transform(test0).head().prediction
# 0.0
test1 = spark.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"])
model.transform(test1).head().prediction
# 1.0

# Round-trip the estimator through save/load and check its parameter.
dtr_path = temp_path + "/dtr"
dt.save(dtr_path)
dt2 = DecisionTreeRegressor.load(dtr_path)
dt2.getMaxDepth()
# 2

# Round-trip the fitted model and compare it with the in-memory one.
model_path = temp_path + "/dtr_model"
model.save(model_path)
model2 = DecisionTreeRegressionModel.load(model_path)
model.numNodes == model2.numNodes
# True
model.depth == model2.depth
# True
model.transform(test1).head().variance
# 0.0
Training Data Set Accuracy: 0.9610705596107056
Training Data F1 Score: 0.972027972027972
Validation Mean F1 Score: 0.6348494236272646
Validation Mean Accuracy: 0.7030561269468117