Introduction to Machine Learning.
Decision Trees. Nearest Neighbors. Using the scikit-learn library
This is a preliminary version! Any comments are welcome.
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
import graphviz
from sklearn import tree
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]  # two features: sepal length and petal length (cm)
y = iris.target
iris.keys()
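The returned Bunch behaves like a dictionary; a quick sanity check on shapes and class names gives an overview of what was loaded:
print(iris.data.shape, iris.target.shape)  # 150 samples, 4 numeric features
print(iris.target_names)                   # the three iris species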
import pandas as pd
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df
clf1 = DecisionTreeClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=2)
clf3 = DecisionTreeClassifier(max_depth=3)
clf4 = DecisionTreeClassifier(max_depth=5)
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
clf4.fit(X, y)
# Build a grid covering the feature space for decision-boundary plots
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [clf1, clf2, clf3, clf4],
                        ['Decision Tree (depth=1)', 'Decision Tree (depth=2)',
                         'Decision Tree (depth=3)', 'Decision Tree (depth=5)']):
    # Color each grid point by the tree's predicted class
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
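To relate the pictures to numbers, one can print the training accuracy of the trees fitted above; note this is accuracy on the same data the trees were fit on, so it can only grow with depth:
for name, clf in zip(['depth=1', 'depth=2', 'depth=3', 'depth=5'],
                     [clf1, clf2, clf3, clf4]):
    print(name, clf.score(X, y))  # training accuracy, not a generalization estimate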
dot_data = tree.export_graphviz(clf1, out_file=None)
graph = graphviz.Source(dot_data)
#graph.render("iris")
dot_data = tree.export_graphviz(clf1, out_file=None,
feature_names=[iris.feature_names[0],iris.feature_names[2]],
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
dot_data = tree.export_graphviz(clf2, out_file=None,
feature_names=[iris.feature_names[0],iris.feature_names[2]],
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
dot_data = tree.export_graphviz(clf3, out_file=None,
feature_names=[iris.feature_names[0],iris.feature_names[2]],
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
dot_data = tree.export_graphviz(clf4, out_file=None,
feature_names=[iris.feature_names[0],iris.feature_names[2]],
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
clf = DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.15)
clf.fit(iris.data, iris.target)
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=iris.feature_names,
class_names=iris.target_names,
filled=True, rounded=True,
special_characters=True)
graph = graphviz.Source(dot_data)
graph
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature      # index of the feature tested at each node (-2 for leaves)
feature_names = iris.feature_names
threshold = clf.tree_.threshold  # split threshold at each node
# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)] # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1
    # If the left and right children differ, this is an internal (test) node
    if children_left[node_id] != children_right[node_id]:
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True
print("Стрктура бинарного дерева имеет %s узлов и следущий вид:"
% n_nodes)
for i in range(n_nodes):
if is_leaves[i]:
print("%sузел=%s конечный." % (node_depth[i] * "\t", i))
else:
print("%sузел=%s проверка: перейти к узлу %s если %s <= %s иначе к узлу %s."
% (node_depth[i] * "\t",
i,
children_left[i],
feature_names[i],
threshold[i],
children_right[i],
))
print()
sample = 33
# decision_path returns a sparse matrix indicating which nodes the sample visits
node_indicator = clf.decision_path(iris.data[[sample]])
node_indicator
leave_id = clf.apply(iris.data[[sample]])  # id of the leaf the sample ends up in
leave_id[0]
class_id = clf.predict(iris.data[[sample]])
class_id, iris.target_names[class_id]
# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.
sample_id = 0  # row index within node_indicator (a single sample was passed)
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]
print('Rules used to predict sample %s:' % sample)
for node_id in node_index:
    # Skip the leaf node: it carries no test
    if leave_id[sample_id] == node_id:
        continue
    if iris.data[sample, feature[node_id]] <= threshold[node_id]:
        threshold_sign = "<="
    else:
        threshold_sign = ">"
    print("decision tree node %s : (feature %s, (= %s) %s %s)"
          % (node_id,
             feature[node_id],
             iris.data[sample, feature[node_id]],
             threshold_sign,
             threshold[node_id]))
clf.score(iris.data, iris.target)  # accuracy on the data the tree was trained on
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.15)
X_train.shape[0]/iris.data.shape[0], X_test.shape[0]/iris.data.shape[0]
clf = DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.15)
clf.fit( X_train, y_train )
clf.score( X_test, y_test )
clf = DecisionTreeClassifier(max_depth=5, min_impurity_decrease=0.01 )
clf.fit( X_train, y_train )
clf.score( X_test, y_test )
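Instead of trying parameter pairs one by one, a small manual sweep over max_depth makes the fit/overfit tradeoff visible; a minimal sketch (the split above is random, so the numbers vary from run to run):
for depth in range(1, 8):
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_train, y_train)
    print(depth, clf.score(X_train, y_train), clf.score(X_test, y_test))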
Lasso regression
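Lasso is linear regression with an L1 penalty: scikit-learn minimizes (1 / (2 * n_samples)) * ||y - Xw||^2 + alpha * ||w||_1, so larger alpha values drive more coefficients exactly to zero.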
from sklearn.datasets import load_boston  # deprecated in scikit-learn 1.0 and removed in 1.2
boston = load_boston()
print(boston.data.shape)
boston.keys()
import pandas as pd
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
df
boston['target']
targ = pd.DataFrame(boston.target)
targ.columns = ["Price"]
targ
targ["Price"][1]
pd.concat([df, targ], axis=1)
from sklearn.linear_model import Lasso
las = Lasso(alpha=0.5)
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
las.fit( X_train, y_train )
las.score( X_train, y_train )
las.score( X_test, y_test )
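The L1 effect is easy to see in the fitted coefficients: some are exactly zero, meaning the corresponding feature was dropped. A quick look (coef_ and feature_names are standard attributes of the fitted model and the dataset bunch):
for name, w in zip(boston.feature_names, las.coef_):
    print(name, round(w, 3))  # zero weight = feature dropped by the L1 penalty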
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()  # try max_depth=5, min_impurity_decrease=0.01, random_state=0
regressor.fit( boston.data, boston.target )
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
regressor.fit( X_train, y_train )
regressor.score( X_train, y_train )
regressor.score( X_test, y_test )
pred = regressor.predict( X_test )
pred
y_test
plt.hist(pred / y_test)  # ratio of predicted to actual price; ideally concentrated near 1
np.std(pred - y_test)
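The standard helpers from sklearn.metrics give error measures on the original price scale; a minimal sketch:
from sklearn.metrics import mean_absolute_error, mean_squared_error

print(mean_absolute_error(y_test, pred))          # mean absolute error, in $1000s
print(np.sqrt(mean_squared_error(y_test, pred)))  # RMSE penalizes large misses more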
from sklearn.model_selection import cross_val_score
cross_val_score(regressor, boston.data, boston.target, cv=10)
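For a regressor, score (and therefore cross_val_score by default) is the R² coefficient of determination; the ten per-fold values are easier to read as a mean and spread:
scores = cross_val_score(regressor, boston.data, boston.target, cv=10)
print(scores.mean(), scores.std())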
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=5, min_impurity_decrease=0.01)  # add random_state=0 for reproducibility
regressor.fit( boston.data, boston.target )
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
# Train on log-prices: the target is right-skewed, and the log compresses large values
regressor.fit( X_train, np.log(y_train) )
regressor.score( X_train, np.log(y_train) )
regressor.score( X_test, np.log(y_test) )
pred = regressor.predict( X_test )
pred
np.log(y_test)
plt.hist(pred - np.log(y_test))          # errors on the log scale
plt.hist(np.exp(pred - np.log(y_test)))  # the same errors as a price ratio pred/actual
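To compare with the earlier model, the log-scale predictions can be mapped back to prices with np.exp; a short sketch:
pred_price = np.exp(pred)           # back-transform from log-price to price
print(np.std(pred_price - y_test))  # spread of errors on the original scale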
#from sklearn.datasets import fetch_california_housing
#housing = fetch_california_housing()
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.15)
neigh.fit( X_train, y_train )
neigh.score( X_train, y_train )
neigh.score( X_test, y_test )
from sklearn.model_selection import cross_val_score
cross_val_score( neigh, iris.data, iris.target, cv=10)
for n in range(1, 5):
    neigh = KNeighborsClassifier(n_neighbors=n)
    print(np.mean(cross_val_score(neigh, iris.data, iris.target, cv=10)))
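n_neighbors is not the only knob: weighting votes by inverse distance (weights='distance', a standard KNeighborsClassifier option) is a variant worth trying:
for n in range(1, 5):
    neigh = KNeighborsClassifier(n_neighbors=n, weights='distance')
    print(n, np.mean(cross_val_score(neigh, iris.data, iris.target, cv=10)))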
from sklearn.neighbors import RadiusNeighborsClassifier
rad = RadiusNeighborsClassifier(radius=0.75)
cross_val_score( rad, iris.data, iris.target, cv=10)
for r in np.arange(0.75, 5, 0.15):
    rad = RadiusNeighborsClassifier(radius=r)
    print(r, np.mean(cross_val_score(rad, iris.data, iris.target, cv=10)))
X = iris.data[:, [0, 2]]
y = iris.target
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [KNeighborsClassifier(n_neighbors=1),
                         KNeighborsClassifier(n_neighbors=3),
                         RadiusNeighborsClassifier(radius=4.0),
                         DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.1)],
                        ['KNeighborsClassifier, k=1', 'KNeighborsClassifier, k=3',
                         'RadiusNeighborsClassifier, radius=4.0', 'Decision Tree']):
    clf.fit(X, y)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()