Note 4. Decision trees and more,
from the course Introduction to Machine Learning.
Шокуров Антон В.
shokurov.anton.v@yandex.ru
http://машинноезрение.рф
Version 0.10

Abstract

Decision trees. Nearest neighbors. We use the scikit-learn library.

This is a preliminary version! Any comments are welcome.

In [1]:
import numpy as np
import matplotlib.pyplot as plt

from itertools import product

import graphviz
from sklearn import tree

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline
In [2]:
iris = datasets.load_iris()
X = iris.data[:, [0, 2]]
y = iris.target
In [3]:
iris.keys()
Out[3]:
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
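The bunch behaves like a dictionary: data is the 150×4 feature matrix and target holds the integer labels 0–2 for setosa, versicolor and virginica; above we kept only columns 0 and 2, i.e. sepal length and petal length. A quick sanity check (a sketch, not an original cell of this note):

X.shape, np.bincount(y)
# expected: ((150, 2), array([50, 50, 50]))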
In [4]:
import pandas as pd
In [5]:
df = pd.DataFrame( iris.data )
df.columns = iris.feature_names
df
Out[5]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
20 5.4 3.4 1.7 0.2
21 5.1 3.7 1.5 0.4
22 4.6 3.6 1.0 0.2
23 5.1 3.3 1.7 0.5
24 4.8 3.4 1.9 0.2
25 5.0 3.0 1.6 0.2
26 5.0 3.4 1.6 0.4
27 5.2 3.5 1.5 0.2
28 5.2 3.4 1.4 0.2
29 4.7 3.2 1.6 0.2
... ... ... ... ...
120 6.9 3.2 5.7 2.3
121 5.6 2.8 4.9 2.0
122 7.7 2.8 6.7 2.0
123 6.3 2.7 4.9 1.8
124 6.7 3.3 5.7 2.1
125 7.2 3.2 6.0 1.8
126 6.2 2.8 4.8 1.8
127 6.1 3.0 4.9 1.8
128 6.4 2.8 5.6 2.1
129 7.2 3.0 5.8 1.6
130 7.4 2.8 6.1 1.9
131 7.9 3.8 6.4 2.0
132 6.4 2.8 5.6 2.2
133 6.3 2.8 5.1 1.5
134 6.1 2.6 5.6 1.4
135 7.7 3.0 6.1 2.3
136 6.3 3.4 5.6 2.4
137 6.4 3.1 5.5 1.8
138 6.0 3.0 4.8 1.8
139 6.9 3.1 5.4 2.1
140 6.7 3.1 5.6 2.4
141 6.9 3.1 5.1 2.3
142 5.8 2.7 5.1 1.9
143 6.8 3.2 5.9 2.3
144 6.7 3.3 5.7 2.5
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns
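To relate the raw features to the classes, one can attach the species names and aggregate per class. This is a hypothetical extra step, not part of the original notebook:

df2 = df.copy()
df2['species'] = iris.target_names[iris.target]
df2.groupby('species').mean()   # per-class feature means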

In [6]:
clf1 = DecisionTreeClassifier(max_depth=1)
clf2 = DecisionTreeClassifier(max_depth=2)
clf3 = DecisionTreeClassifier(max_depth=3)
clf4 = DecisionTreeClassifier(max_depth=5)
In [7]:
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
clf4.fit(X, y)
Out[7]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [8]:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))

for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [clf1, clf2, clf3, clf4],
                        ['Decision Tree (depth=1)', 'Decision Tree (depth=2)',
                         'Decision Tree (depth=3)', 'Decision Tree (depth=5)']):

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)

plt.show()
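The four panels show the decision boundary in the (sepal length, petal length) plane becoming more detailed as max_depth grows from 1 to 5: one split already separates setosa, while the depth-5 tree carves out small regions around individual points. A quick sketch (not an original cell) to put training accuracies next to the pictures:

for clf in (clf1, clf2, clf3, clf4):
    print(clf.max_depth, clf.score(X, y))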
In [9]:
dot_data = tree.export_graphviz(clf1, out_file=None) 
graph = graphviz.Source(dot_data) 
#graph.render("iris") 
In [10]:
dot_data = tree.export_graphviz(clf1, out_file=None, 
                         feature_names=[iris.feature_names[0],iris.feature_names[2]],
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[10]:
node 0: petal length (cm) ≤ 2.45 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: leaf | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
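Here gini is the Gini impurity G = 1 − Σ_k p_k², where p_k is the fraction of samples of class k in the node. For the root with value = [50, 50, 50] this gives 1 − 3·(1/3)² = 2/3 ≈ 0.667, while a pure leaf such as node 1 has G = 0.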
In [11]:
dot_data = tree.export_graphviz(clf2, out_file=None, 
                         feature_names=[iris.feature_names[0],iris.feature_names[2]],
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[11]:
node 0: petal length (cm) ≤ 2.45 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal length (cm) ≤ 4.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: leaf | gini = 0.043, samples = 45, value = [0, 44, 1], class = versicolor
        False -> node 4: leaf | gini = 0.194, samples = 55, value = [0, 6, 49], class = virginica
In [12]:
dot_data = tree.export_graphviz(clf3, out_file=None, 
                         feature_names=[iris.feature_names[0],iris.feature_names[2]],
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[12]:
node 0: petal length (cm) ≤ 2.45 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal length (cm) ≤ 4.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: sepal length (cm) ≤ 4.95 | gini = 0.043, samples = 45, value = [0, 44, 1], class = versicolor
            True  -> node 4: leaf | gini = 0.5, samples = 2, value = [0, 1, 1], class = versicolor
            False -> node 5: leaf | gini = 0.0, samples = 43, value = [0, 43, 0], class = versicolor
        False -> node 6: petal length (cm) ≤ 5.05 | gini = 0.194, samples = 55, value = [0, 6, 49], class = virginica
            True  -> node 7: leaf | gini = 0.473, samples = 13, value = [0, 5, 8], class = virginica
            False -> node 8: leaf | gini = 0.046, samples = 42, value = [0, 1, 41], class = virginica
In [13]:
dot_data = tree.export_graphviz(clf4, out_file=None, 
                         feature_names=[iris.feature_names[0],iris.feature_names[2]],
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[13]:
node 0: petal length (cm) ≤ 2.45 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal length (cm) ≤ 4.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: sepal length (cm) ≤ 4.95 | gini = 0.043, samples = 45, value = [0, 44, 1], class = versicolor
            True  -> node 4: petal length (cm) ≤ 3.9 | gini = 0.5, samples = 2, value = [0, 1, 1], class = versicolor
                True  -> node 5: leaf | gini = 0.0, samples = 1, value = [0, 1, 0], class = versicolor
                False -> node 6: leaf | gini = 0.0, samples = 1, value = [0, 0, 1], class = virginica
            False -> node 7: leaf | gini = 0.0, samples = 43, value = [0, 43, 0], class = versicolor
        False -> node 8: petal length (cm) ≤ 5.05 | gini = 0.194, samples = 55, value = [0, 6, 49], class = virginica
            True  -> node 9: sepal length (cm) ≤ 6.5 | gini = 0.473, samples = 13, value = [0, 5, 8], class = virginica
                True  -> node 10: petal length (cm) ≤ 4.95 | gini = 0.32, samples = 10, value = [0, 2, 8], class = virginica
                    True  -> node 11: leaf | gini = 0.408, samples = 7, value = [0, 2, 5], class = virginica
                    False -> node 12: leaf | gini = 0.0, samples = 3, value = [0, 0, 3], class = virginica
                False -> node 13: leaf | gini = 0.0, samples = 3, value = [0, 3, 0], class = versicolor
            False -> node 14: sepal length (cm) ≤ 6.05 | gini = 0.046, samples = 42, value = [0, 1, 41], class = virginica
                True  -> node 15: sepal length (cm) ≤ 5.95 | gini = 0.32, samples = 5, value = [0, 1, 4], class = virginica
                    True  -> node 16: leaf | gini = 0.0, samples = 4, value = [0, 0, 4], class = virginica
                    False -> node 17: leaf | gini = 0.0, samples = 1, value = [0, 1, 0], class = versicolor
                False -> node 18: leaf | gini = 0.0, samples = 37, value = [0, 0, 37], class = virginica
In [14]:
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(iris.data, iris.target)
Out[14]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [15]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[15]:
node 0: petal width (cm) ≤ 0.8 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal width (cm) ≤ 1.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: petal length (cm) ≤ 4.95 | gini = 0.168, samples = 54, value = [0, 49, 5], class = versicolor
            True  -> node 4: leaf | gini = 0.041, samples = 48, value = [0, 47, 1], class = versicolor
            False -> node 5: leaf | gini = 0.444, samples = 6, value = [0, 2, 4], class = virginica
        False -> node 6: petal length (cm) ≤ 4.85 | gini = 0.043, samples = 46, value = [0, 1, 45], class = virginica
            True  -> node 7: leaf | gini = 0.444, samples = 3, value = [0, 1, 2], class = virginica
            False -> node 8: leaf | gini = 0.0, samples = 43, value = [0, 0, 43], class = virginica
In [16]:
clf = DecisionTreeClassifier(max_depth=4)
clf.fit(iris.data, iris.target)
Out[16]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [17]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[17]:
node 0: petal length (cm) ≤ 2.45 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal width (cm) ≤ 1.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: petal length (cm) ≤ 4.95 | gini = 0.168, samples = 54, value = [0, 49, 5], class = versicolor
            True  -> node 4: petal width (cm) ≤ 1.65 | gini = 0.041, samples = 48, value = [0, 47, 1], class = versicolor
                True  -> node 5: leaf | gini = 0.0, samples = 47, value = [0, 47, 0], class = versicolor
                False -> node 6: leaf | gini = 0.0, samples = 1, value = [0, 0, 1], class = virginica
            False -> node 7: petal width (cm) ≤ 1.55 | gini = 0.444, samples = 6, value = [0, 2, 4], class = virginica
                True  -> node 8: leaf | gini = 0.0, samples = 3, value = [0, 0, 3], class = virginica
                False -> node 9: leaf | gini = 0.444, samples = 3, value = [0, 2, 1], class = versicolor
        False -> node 10: petal length (cm) ≤ 4.85 | gini = 0.043, samples = 46, value = [0, 1, 45], class = virginica
            True  -> node 11: sepal width (cm) ≤ 3.1 | gini = 0.444, samples = 3, value = [0, 1, 2], class = virginica
                True  -> node 12: leaf | gini = 0.0, samples = 2, value = [0, 0, 2], class = virginica
                False -> node 13: leaf | gini = 0.0, samples = 1, value = [0, 1, 0], class = versicolor
            False -> node 14: leaf | gini = 0.0, samples = 43, value = [0, 0, 43], class = virginica
In [18]:
clf = DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.15)
clf.fit(iris.data, iris.target)
Out[18]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.15, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [19]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=iris.feature_names,
                         class_names=iris.target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)
graph = graphviz.Source(dot_data)
graph
Out[19]:
node 0: petal width (cm) ≤ 0.8 | gini = 0.667, samples = 150, value = [50, 50, 50], class = setosa
    True  -> node 1: leaf | gini = 0.0, samples = 50, value = [50, 0, 0], class = setosa
    False -> node 2: petal width (cm) ≤ 1.75 | gini = 0.5, samples = 100, value = [0, 50, 50], class = versicolor
        True  -> node 3: leaf | gini = 0.168, samples = 54, value = [0, 49, 5], class = versicolor
        False -> node 4: leaf | gini = 0.043, samples = 46, value = [0, 1, 45], class = virginica
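min_impurity_decrease prunes exactly those splits whose weighted impurity decrease, (N_t / N)·(G_t − (N_L / N_t)·G_L − (N_R / N_t)·G_R), falls below the threshold. The split at node 2, for instance, survives because (100/150)·(0.5 − (54/100)·0.168 − (46/100)·0.043) ≈ 0.26 ≥ 0.15, while the deeper splits seen in the previous trees gain less than 0.15 and disappear.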
In [20]:
n_nodes = clf.tree_.node_count
children_left = clf.tree_.children_left
children_right = clf.tree_.children_right
feature = clf.tree_.feature
feature_names = iris.feature_names
threshold = clf.tree_.threshold


# The tree structure can be traversed to compute various properties such
# as the depth of each node and whether or not it is a leaf.
node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
is_leaves = np.zeros(shape=n_nodes, dtype=bool)
stack = [(0, -1)]  # seed is the root node id and its parent depth
while len(stack) > 0:
    node_id, parent_depth = stack.pop()
    node_depth[node_id] = parent_depth + 1

    # If we have a test node
    if (children_left[node_id] != children_right[node_id]):
        stack.append((children_left[node_id], parent_depth + 1))
        stack.append((children_right[node_id], parent_depth + 1))
    else:
        is_leaves[node_id] = True

print("Стрктура бинарного дерева имеет %s узлов и следущий вид:"
      % n_nodes)
for i in range(n_nodes):
    if is_leaves[i]:
        print("%sузел=%s конечный." % (node_depth[i] * "\t", i))
    else:
        print("%sузел=%s проверка: перейти к узлу %s если %s <= %s иначе к узлу %s."
              % (node_depth[i] * "\t",
                 i,
                 children_left[i],
                 feature_names[i],
                 threshold[i],
                 children_right[i],
                 ))
print()
Стрктура бинарного дерева имеет 5 узлов и следущий вид:
узел=0 проверка: перейти к узлу 1 если sepal length (cm) <= 0.800000011921 иначе к узлу 2.
	узел=1 конечный.
	узел=2 проверка: перейти к узлу 3 если petal length (cm) <= 1.75 иначе к узлу 4.
		узел=3 конечный.
		узел=4 конечный.

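In clf.tree_ the arrays encode a leaf as children_left == children_right == -1 (TREE_LEAF), with its feature and threshold entries set to the placeholder -2 (TREE_UNDEFINED); the test children_left[node_id] != children_right[node_id] above distinguishes test nodes from leaves precisely because of this convention.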
In [21]:
sample = 33
In [22]:
node_indicator = clf.decision_path( [iris.data[ sample ]] )
node_indicator
Out[22]:
<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>
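decision_path returns a CSR indicator matrix with one row per sample and one column per tree node; the two stored elements are exactly the nodes visited by sample 33, the root (node 0) and the setosa leaf (node 1). A sketch (not an original cell) to see it densely:

node_indicator.toarray()
# expected: array([[1, 1, 0, 0, 0]])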
In [23]:
leave_id = clf.apply( [iris.data[ sample ]] )
leave_id[0]
Out[23]:
1
In [24]:
class_id = clf.predict( [iris.data[ sample ]] )
class_id, iris.target_names[class_id]
Out[24]:
(array([0]), array(['setosa'],
       dtype='<U10'))
In [25]:
# Now, it's possible to get the tests that were used to predict a sample or
# a group of samples. First, let's make it for the sample.

sample_id = 0  # row index within node_indicator (only one sample was passed)
node_index = node_indicator.indices[node_indicator.indptr[sample_id]:
                                    node_indicator.indptr[sample_id + 1]]

print('Rules used to predict sample %s: ' % sample)
for node_id in node_index:
    # skip the terminal leaf: it carries no test
    if leave_id[sample_id] == node_id:
        continue

    if (iris.data[sample, feature[node_id]] <= threshold[node_id]):
        threshold_sign = "<="
    else:
        threshold_sign = ">"

    print("decision tree node %s : ( feature %s, (= %s) %s %s)"
          % (node_id,
             feature[node_id],
             iris.data[sample, feature[node_id]],
             threshold_sign,
             threshold[node_id]))
Rules used to predict sample 33: 
decision tree node 0 : ( feature 3, (= 0.2) <= 0.800000011921)
In [26]:
clf.score(iris.data, iris.target)
Out[26]:
0.95999999999999996
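score here is plain accuracy on the 150 training samples: the pruned tree's two impure leaves mislabel 5 + 1 = 6 flowers (value = [0, 49, 5] is predicted versicolor, [0, 1, 45] virginica), so 144/150 = 0.96.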
In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.15)
In [28]:
X_train.shape[0]/iris.data.shape[0], X_test.shape[0]/iris.data.shape[0]
Out[28]:
(0.8466666666666667, 0.15333333333333332)
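test_size=0.15 of 150 samples is 22.5, which train_test_split rounds up to 23 test samples, leaving 127 for training: 127/150 ≈ 0.847 and 23/150 ≈ 0.153, as printed.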
In [29]:
clf = DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.15)
clf.fit( X_train, y_train )
Out[29]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.15, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
In [30]:
clf.score( X_test, y_test )
Out[30]:
0.91304347826086951
In [31]:
clf = DecisionTreeClassifier(max_depth=5, min_impurity_decrease=0.01 )
clf.fit( X_train, y_train )
clf.score( X_test, y_test )
Out[31]:
0.86956521739130432
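The deeper, less aggressively pruned tree generalizes worse here (≈0.870 against ≈0.913 on the same held-out set), a small illustration of overfitting; the exact numbers change from run to run because train_test_split was called without a fixed random_state.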

Lasso regression

In [32]:
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.data.shape)
(506, 13)
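The Boston housing data describe 506 neighborhoods by 13 features (per-capita crime rate CRIM, average number of rooms RM, pupil–teacher ratio PTRATIO, share of lower-status population LSTAT, and so on); the regression target is the median home price in thousands of dollars.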
In [33]:
boston.keys()
Out[33]:
dict_keys(['data', 'target', 'feature_names', 'DESCR'])
In [34]:
import pandas as pd
df = pd.DataFrame(boston['data'])
df
Out[34]:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3.0 222.0 18.7 394.12 5.21
6 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 395.60 12.43
7 0.14455 12.5 7.87 0.0 0.524 6.172 96.1 5.9505 5.0 311.0 15.2 396.90 19.15
8 0.21124 12.5 7.87 0.0 0.524 5.631 100.0 6.0821 5.0 311.0 15.2 386.63 29.93
9 0.17004 12.5 7.87 0.0 0.524 6.004 85.9 6.5921 5.0 311.0 15.2 386.71 17.10
10 0.22489 12.5 7.87 0.0 0.524 6.377 94.3 6.3467 5.0 311.0 15.2 392.52 20.45
11 0.11747 12.5 7.87 0.0 0.524 6.009 82.9 6.2267 5.0 311.0 15.2 396.90 13.27
12 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
13 0.62976 0.0 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 21.0 396.90 8.26
14 0.63796 0.0 8.14 0.0 0.538 6.096 84.5 4.4619 4.0 307.0 21.0 380.02 10.26
15 0.62739 0.0 8.14 0.0 0.538 5.834 56.5 4.4986 4.0 307.0 21.0 395.62 8.47
16 1.05393 0.0 8.14 0.0 0.538 5.935 29.3 4.4986 4.0 307.0 21.0 386.85 6.58
17 0.78420 0.0 8.14 0.0 0.538 5.990 81.7 4.2579 4.0 307.0 21.0 386.75 14.67
18 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
19 0.72580 0.0 8.14 0.0 0.538 5.727 69.5 3.7965 4.0 307.0 21.0 390.95 11.28
20 1.25179 0.0 8.14 0.0 0.538 5.570 98.1 3.7979 4.0 307.0 21.0 376.57 21.02
21 0.85204 0.0 8.14 0.0 0.538 5.965 89.2 4.0123 4.0 307.0 21.0 392.53 13.83
22 1.23247 0.0 8.14 0.0 0.538 6.142 91.7 3.9769 4.0 307.0 21.0 396.90 18.72
23 0.98843 0.0 8.14 0.0 0.538 5.813 100.0 4.0952 4.0 307.0 21.0 394.54 19.88
24 0.75026 0.0 8.14 0.0 0.538 5.924 94.1 4.3996 4.0 307.0 21.0 394.33 16.30
25 0.84054 0.0 8.14 0.0 0.538 5.599 85.7 4.4546 4.0 307.0 21.0 303.42 16.51
26 0.67191 0.0 8.14 0.0 0.538 5.813 90.3 4.6820 4.0 307.0 21.0 376.88 14.81
27 0.95577 0.0 8.14 0.0 0.538 6.047 88.8 4.4534 4.0 307.0 21.0 306.38 17.28
28 0.77299 0.0 8.14 0.0 0.538 6.495 94.4 4.4547 4.0 307.0 21.0 387.94 12.80
29 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
... ... ... ... ... ... ... ... ... ... ... ... ... ...
476 4.87141 0.0 18.10 0.0 0.614 6.484 93.6 2.3053 24.0 666.0 20.2 396.21 18.68
477 15.02340 0.0 18.10 0.0 0.614 5.304 97.3 2.1007 24.0 666.0 20.2 349.48 24.91
478 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
479 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
480 5.82401 0.0 18.10 0.0 0.532 6.242 64.7 3.4242 24.0 666.0 20.2 396.90 10.74
481 5.70818 0.0 18.10 0.0 0.532 6.750 74.9 3.3317 24.0 666.0 20.2 393.07 7.74
482 5.73116 0.0 18.10 0.0 0.532 7.061 77.0 3.4106 24.0 666.0 20.2 395.28 7.01
483 2.81838 0.0 18.10 0.0 0.532 5.762 40.3 4.0983 24.0 666.0 20.2 392.92 10.42
484 2.37857 0.0 18.10 0.0 0.583 5.871 41.9 3.7240 24.0 666.0 20.2 370.73 13.34
485 3.67367 0.0 18.10 0.0 0.583 6.312 51.9 3.9917 24.0 666.0 20.2 388.62 10.58
486 5.69175 0.0 18.10 0.0 0.583 6.114 79.8 3.5459 24.0 666.0 20.2 392.68 14.98
487 4.83567 0.0 18.10 0.0 0.583 5.905 53.2 3.1523 24.0 666.0 20.2 388.22 11.45
488 0.15086 0.0 27.74 0.0 0.609 5.454 92.7 1.8209 4.0 711.0 20.1 395.09 18.06
489 0.18337 0.0 27.74 0.0 0.609 5.414 98.3 1.7554 4.0 711.0 20.1 344.05 23.97
490 0.20746 0.0 27.74 0.0 0.609 5.093 98.0 1.8226 4.0 711.0 20.1 318.43 29.68
491 0.10574 0.0 27.74 0.0 0.609 5.983 98.8 1.8681 4.0 711.0 20.1 390.11 18.07
492 0.11132 0.0 27.74 0.0 0.609 5.983 83.5 2.1099 4.0 711.0 20.1 396.90 13.35
493 0.17331 0.0 9.69 0.0 0.585 5.707 54.0 2.3817 6.0 391.0 19.2 396.90 12.01
494 0.27957 0.0 9.69 0.0 0.585 5.926 42.6 2.3817 6.0 391.0 19.2 396.90 13.59
495 0.17899 0.0 9.69 0.0 0.585 5.670 28.8 2.7986 6.0 391.0 19.2 393.29 17.60
496 0.28960 0.0 9.69 0.0 0.585 5.390 72.9 2.7986 6.0 391.0 19.2 396.90 21.14
497 0.26838 0.0 9.69 0.0 0.585 5.794 70.6 2.8927 6.0 391.0 19.2 396.90 14.10
498 0.23912 0.0 9.69 0.0 0.585 6.019 65.3 2.4091 6.0 391.0 19.2 396.90 12.92
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6.0 391.0 19.2 395.77 15.10
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88

506 rows × 13 columns

In [35]:
df.columns = boston['feature_names']
df
Out[35]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3.0 222.0 18.7 394.12 5.21
6 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 395.60 12.43
7 0.14455 12.5 7.87 0.0 0.524 6.172 96.1 5.9505 5.0 311.0 15.2 396.90 19.15
8 0.21124 12.5 7.87 0.0 0.524 5.631 100.0 6.0821 5.0 311.0 15.2 386.63 29.93
9 0.17004 12.5 7.87 0.0 0.524 6.004 85.9 6.5921 5.0 311.0 15.2 386.71 17.10
10 0.22489 12.5 7.87 0.0 0.524 6.377 94.3 6.3467 5.0 311.0 15.2 392.52 20.45
11 0.11747 12.5 7.87 0.0 0.524 6.009 82.9 6.2267 5.0 311.0 15.2 396.90 13.27
12 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
13 0.62976 0.0 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 21.0 396.90 8.26
14 0.63796 0.0 8.14 0.0 0.538 6.096 84.5 4.4619 4.0 307.0 21.0 380.02 10.26
15 0.62739 0.0 8.14 0.0 0.538 5.834 56.5 4.4986 4.0 307.0 21.0 395.62 8.47
16 1.05393 0.0 8.14 0.0 0.538 5.935 29.3 4.4986 4.0 307.0 21.0 386.85 6.58
17 0.78420 0.0 8.14 0.0 0.538 5.990 81.7 4.2579 4.0 307.0 21.0 386.75 14.67
18 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
19 0.72580 0.0 8.14 0.0 0.538 5.727 69.5 3.7965 4.0 307.0 21.0 390.95 11.28
20 1.25179 0.0 8.14 0.0 0.538 5.570 98.1 3.7979 4.0 307.0 21.0 376.57 21.02
21 0.85204 0.0 8.14 0.0 0.538 5.965 89.2 4.0123 4.0 307.0 21.0 392.53 13.83
22 1.23247 0.0 8.14 0.0 0.538 6.142 91.7 3.9769 4.0 307.0 21.0 396.90 18.72
23 0.98843 0.0 8.14 0.0 0.538 5.813 100.0 4.0952 4.0 307.0 21.0 394.54 19.88
24 0.75026 0.0 8.14 0.0 0.538 5.924 94.1 4.3996 4.0 307.0 21.0 394.33 16.30
25 0.84054 0.0 8.14 0.0 0.538 5.599 85.7 4.4546 4.0 307.0 21.0 303.42 16.51
26 0.67191 0.0 8.14 0.0 0.538 5.813 90.3 4.6820 4.0 307.0 21.0 376.88 14.81
27 0.95577 0.0 8.14 0.0 0.538 6.047 88.8 4.4534 4.0 307.0 21.0 306.38 17.28
28 0.77299 0.0 8.14 0.0 0.538 6.495 94.4 4.4547 4.0 307.0 21.0 387.94 12.80
29 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
... ... ... ... ... ... ... ... ... ... ... ... ... ...
476 4.87141 0.0 18.10 0.0 0.614 6.484 93.6 2.3053 24.0 666.0 20.2 396.21 18.68
477 15.02340 0.0 18.10 0.0 0.614 5.304 97.3 2.1007 24.0 666.0 20.2 349.48 24.91
478 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
479 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
480 5.82401 0.0 18.10 0.0 0.532 6.242 64.7 3.4242 24.0 666.0 20.2 396.90 10.74
481 5.70818 0.0 18.10 0.0 0.532 6.750 74.9 3.3317 24.0 666.0 20.2 393.07 7.74
482 5.73116 0.0 18.10 0.0 0.532 7.061 77.0 3.4106 24.0 666.0 20.2 395.28 7.01
483 2.81838 0.0 18.10 0.0 0.532 5.762 40.3 4.0983 24.0 666.0 20.2 392.92 10.42
484 2.37857 0.0 18.10 0.0 0.583 5.871 41.9 3.7240 24.0 666.0 20.2 370.73 13.34
485 3.67367 0.0 18.10 0.0 0.583 6.312 51.9 3.9917 24.0 666.0 20.2 388.62 10.58
486 5.69175 0.0 18.10 0.0 0.583 6.114 79.8 3.5459 24.0 666.0 20.2 392.68 14.98
487 4.83567 0.0 18.10 0.0 0.583 5.905 53.2 3.1523 24.0 666.0 20.2 388.22 11.45
488 0.15086 0.0 27.74 0.0 0.609 5.454 92.7 1.8209 4.0 711.0 20.1 395.09 18.06
489 0.18337 0.0 27.74 0.0 0.609 5.414 98.3 1.7554 4.0 711.0 20.1 344.05 23.97
490 0.20746 0.0 27.74 0.0 0.609 5.093 98.0 1.8226 4.0 711.0 20.1 318.43 29.68
491 0.10574 0.0 27.74 0.0 0.609 5.983 98.8 1.8681 4.0 711.0 20.1 390.11 18.07
492 0.11132 0.0 27.74 0.0 0.609 5.983 83.5 2.1099 4.0 711.0 20.1 396.90 13.35
493 0.17331 0.0 9.69 0.0 0.585 5.707 54.0 2.3817 6.0 391.0 19.2 396.90 12.01
494 0.27957 0.0 9.69 0.0 0.585 5.926 42.6 2.3817 6.0 391.0 19.2 396.90 13.59
495 0.17899 0.0 9.69 0.0 0.585 5.670 28.8 2.7986 6.0 391.0 19.2 393.29 17.60
496 0.28960 0.0 9.69 0.0 0.585 5.390 72.9 2.7986 6.0 391.0 19.2 396.90 21.14
497 0.26838 0.0 9.69 0.0 0.585 5.794 70.6 2.8927 6.0 391.0 19.2 396.90 14.10
498 0.23912 0.0 9.69 0.0 0.585 6.019 65.3 2.4091 6.0 391.0 19.2 396.90 12.92
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6.0 391.0 19.2 395.77 15.10
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88

506 rows × 13 columns

In [36]:
boston['target']
Out[36]:
array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,  18. ,  14.3,  19.2,  19.6,  23. ,  18.4,  15.6,
        18.1,  17.4,  17.1,  13.3,  17.8,  14. ,  14.4,  13.4,  15.6,
        11.8,  13.8,  15.6,  14.6,  17.8,  15.4,  21.5,  19.6,  15.3,
        19.4,  17. ,  15.6,  13.1,  41.3,  24.3,  23.3,  27. ,  50. ,
        50. ,  50. ,  22.7,  25. ,  50. ,  23.8,  23.8,  22.3,  17.4,
        19.1,  23.1,  23.6,  22.6,  29.4,  23.2,  24.6,  29.9,  37.2,
        39.8,  36.2,  37.9,  32.5,  26.4,  29.6,  50. ,  32. ,  29.8,
        34.9,  37. ,  30.5,  36.4,  31.1,  29.1,  50. ,  33.3,  30.3,
        34.6,  34.9,  32.9,  24.1,  42.3,  48.5,  50. ,  22.6,  24.4,
        22.5,  24.4,  20. ,  21.7,  19.3,  22.4,  28.1,  23.7,  25. ,
        23.3,  28.7,  21.5,  23. ,  26.7,  21.7,  27.5,  30.1,  44.8,
        50. ,  37.6,  31.6,  46.7,  31.5,  24.3,  31.7,  41.7,  48.3,
        29. ,  24. ,  25.1,  31.5,  23.7,  23.3,  22. ,  20.1,  22.2,
        23.7,  17.6,  18.5,  24.3,  20.5,  24.5,  26.2,  24.4,  24.8,
        29.6,  42.8,  21.9,  20.9,  44. ,  50. ,  36. ,  30.1,  33.8,
        43.1,  48.8,  31. ,  36.5,  22.8,  30.7,  50. ,  43.5,  20.7,
        21.1,  25.2,  24.4,  35.2,  32.4,  32. ,  33.2,  33.1,  29.1,
        35.1,  45.4,  35.4,  46. ,  50. ,  32.2,  22. ,  20.1,  23.2,
        22.3,  24.8,  28.5,  37.3,  27.9,  23.9,  21.7,  28.6,  27.1,
        20.3,  22.5,  29. ,  24.8,  22. ,  26.4,  33.1,  36.1,  28.4,
        33.4,  28.2,  22.8,  20.3,  16.1,  22.1,  19.4,  21.6,  23.8,
        16.2,  17.8,  19.8,  23.1,  21. ,  23.8,  23.1,  20.4,  18.5,
        25. ,  24.6,  23. ,  22.2,  19.3,  22.6,  19.8,  17.1,  19.4,
        22.2,  20.7,  21.1,  19.5,  18.5,  20.6,  19. ,  18.7,  32.7,
        16.5,  23.9,  31.2,  17.5,  17.2,  23.1,  24.5,  26.6,  22.9,
        24.1,  18.6,  30.1,  18.2,  20.6,  17.8,  21.7,  22.7,  22.6,
        25. ,  19.9,  20.8,  16.8,  21.9,  27.5,  21.9,  23.1,  50. ,
        50. ,  50. ,  50. ,  50. ,  13.8,  13.8,  15. ,  13.9,  13.3,
        13.1,  10.2,  10.4,  10.9,  11.3,  12.3,   8.8,   7.2,  10.5,
         7.4,  10.2,  11.5,  15.1,  23.2,   9.7,  13.8,  12.7,  13.1,
        12.5,   8.5,   5. ,   6.3,   5.6,   7.2,  12.1,   8.3,   8.5,
         5. ,  11.9,  27.9,  17.2,  27.5,  15. ,  17.2,  17.9,  16.3,
         7. ,   7.2,   7.5,  10.4,   8.8,   8.4,  16.7,  14.2,  20.8,
        13.4,  11.7,   8.3,  10.2,  10.9,  11. ,   9.5,  14.5,  14.1,
        16.1,  14.3,  11.7,  13.4,   9.6,   8.7,   8.4,  12.8,  10.5,
        17.1,  18.4,  15.4,  10.8,  11.8,  14.9,  12.6,  14.1,  13. ,
        13.4,  15.2,  16.1,  17.8,  14.9,  14.1,  12.7,  13.5,  14.9,
        20. ,  16.4,  17.7,  19.5,  20.2,  21.4,  19.9,  19. ,  19.1,
        19.1,  20.1,  19.9,  19.6,  23.2,  29.8,  13.8,  13.3,  16.7,
        12. ,  14.6,  21.4,  23. ,  23.7,  25. ,  21.8,  20.6,  21.2,
        19.1,  20.6,  15.2,   7. ,   8.1,  13.6,  20.1,  21.8,  24.5,
        23.1,  19.7,  18.3,  21.2,  17.5,  16.8,  22.4,  20.6,  23.9,
        22. ,  11.9])
In [37]:
targ = pd.DataFrame( boston.target )
targ.columns = ["Price"]
targ
Out[37]:
Price
0 24.0
1 21.6
2 34.7
3 33.4
4 36.2
5 28.7
6 22.9
7 27.1
8 16.5
9 18.9
10 15.0
11 18.9
12 21.7
13 20.4
14 18.2
15 19.9
16 23.1
17 17.5
18 20.2
19 18.2
20 13.6
21 19.6
22 15.2
23 14.5
24 15.6
25 13.9
26 16.6
27 14.8
28 18.4
29 21.0
... ...
476 16.7
477 12.0
478 14.6
479 21.4
480 23.0
481 23.7
482 25.0
483 21.8
484 20.6
485 21.2
486 19.1
487 20.6
488 15.2
489 7.0
490 8.1
491 13.6
492 20.1
493 21.8
494 24.5
495 23.1
496 19.7
497 18.3
498 21.2
499 17.5
500 16.8
501 22.4
502 20.6
503 23.9
504 22.0
505 11.9

506 rows × 1 columns

In [38]:
targ["Price"][1]
Out[38]:
21.600000000000001
In [39]:
pd.concat([df, targ], axis=1) 
Out[39]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT Price
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33 36.2
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3.0 222.0 18.7 394.12 5.21 28.7
6 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 395.60 12.43 22.9
7 0.14455 12.5 7.87 0.0 0.524 6.172 96.1 5.9505 5.0 311.0 15.2 396.90 19.15 27.1
8 0.21124 12.5 7.87 0.0 0.524 5.631 100.0 6.0821 5.0 311.0 15.2 386.63 29.93 16.5
9 0.17004 12.5 7.87 0.0 0.524 6.004 85.9 6.5921 5.0 311.0 15.2 386.71 17.10 18.9
10 0.22489 12.5 7.87 0.0 0.524 6.377 94.3 6.3467 5.0 311.0 15.2 392.52 20.45 15.0
11 0.11747 12.5 7.87 0.0 0.524 6.009 82.9 6.2267 5.0 311.0 15.2 396.90 13.27 18.9
12 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71 21.7
13 0.62976 0.0 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 21.0 396.90 8.26 20.4
14 0.63796 0.0 8.14 0.0 0.538 6.096 84.5 4.4619 4.0 307.0 21.0 380.02 10.26 18.2
15 0.62739 0.0 8.14 0.0 0.538 5.834 56.5 4.4986 4.0 307.0 21.0 395.62 8.47 19.9
16 1.05393 0.0 8.14 0.0 0.538 5.935 29.3 4.4986 4.0 307.0 21.0 386.85 6.58 23.1
17 0.78420 0.0 8.14 0.0 0.538 5.990 81.7 4.2579 4.0 307.0 21.0 386.75 14.67 17.5
18 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69 20.2
19 0.72580 0.0 8.14 0.0 0.538 5.727 69.5 3.7965 4.0 307.0 21.0 390.95 11.28 18.2
20 1.25179 0.0 8.14 0.0 0.538 5.570 98.1 3.7979 4.0 307.0 21.0 376.57 21.02 13.6
21 0.85204 0.0 8.14 0.0 0.538 5.965 89.2 4.0123 4.0 307.0 21.0 392.53 13.83 19.6
22 1.23247 0.0 8.14 0.0 0.538 6.142 91.7 3.9769 4.0 307.0 21.0 396.90 18.72 15.2
23 0.98843 0.0 8.14 0.0 0.538 5.813 100.0 4.0952 4.0 307.0 21.0 394.54 19.88 14.5
24 0.75026 0.0 8.14 0.0 0.538 5.924 94.1 4.3996 4.0 307.0 21.0 394.33 16.30 15.6
25 0.84054 0.0 8.14 0.0 0.538 5.599 85.7 4.4546 4.0 307.0 21.0 303.42 16.51 13.9
26 0.67191 0.0 8.14 0.0 0.538 5.813 90.3 4.6820 4.0 307.0 21.0 376.88 14.81 16.6
27 0.95577 0.0 8.14 0.0 0.538 6.047 88.8 4.4534 4.0 307.0 21.0 306.38 17.28 14.8
28 0.77299 0.0 8.14 0.0 0.538 6.495 94.4 4.4547 4.0 307.0 21.0 387.94 12.80 18.4
29 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98 21.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
476 4.87141 0.0 18.10 0.0 0.614 6.484 93.6 2.3053 24.0 666.0 20.2 396.21 18.68 16.7
477 15.02340 0.0 18.10 0.0 0.614 5.304 97.3 2.1007 24.0 666.0 20.2 349.48 24.91 12.0
478 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03 14.6
479 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11 21.4
480 5.82401 0.0 18.10 0.0 0.532 6.242 64.7 3.4242 24.0 666.0 20.2 396.90 10.74 23.0
481 5.70818 0.0 18.10 0.0 0.532 6.750 74.9 3.3317 24.0 666.0 20.2 393.07 7.74 23.7
482 5.73116 0.0 18.10 0.0 0.532 7.061 77.0 3.4106 24.0 666.0 20.2 395.28 7.01 25.0
483 2.81838 0.0 18.10 0.0 0.532 5.762 40.3 4.0983 24.0 666.0 20.2 392.92 10.42 21.8
484 2.37857 0.0 18.10 0.0 0.583 5.871 41.9 3.7240 24.0 666.0 20.2 370.73 13.34 20.6
485 3.67367 0.0 18.10 0.0 0.583 6.312 51.9 3.9917 24.0 666.0 20.2 388.62 10.58 21.2
486 5.69175 0.0 18.10 0.0 0.583 6.114 79.8 3.5459 24.0 666.0 20.2 392.68 14.98 19.1
487 4.83567 0.0 18.10 0.0 0.583 5.905 53.2 3.1523 24.0 666.0 20.2 388.22 11.45 20.6
488 0.15086 0.0 27.74 0.0 0.609 5.454 92.7 1.8209 4.0 711.0 20.1 395.09 18.06 15.2
489 0.18337 0.0 27.74 0.0 0.609 5.414 98.3 1.7554 4.0 711.0 20.1 344.05 23.97 7.0
490 0.20746 0.0 27.74 0.0 0.609 5.093 98.0 1.8226 4.0 711.0 20.1 318.43 29.68 8.1
491 0.10574 0.0 27.74 0.0 0.609 5.983 98.8 1.8681 4.0 711.0 20.1 390.11 18.07 13.6
492 0.11132 0.0 27.74 0.0 0.609 5.983 83.5 2.1099 4.0 711.0 20.1 396.90 13.35 20.1
493 0.17331 0.0 9.69 0.0 0.585 5.707 54.0 2.3817 6.0 391.0 19.2 396.90 12.01 21.8
494 0.27957 0.0 9.69 0.0 0.585 5.926 42.6 2.3817 6.0 391.0 19.2 396.90 13.59 24.5
495 0.17899 0.0 9.69 0.0 0.585 5.670 28.8 2.7986 6.0 391.0 19.2 393.29 17.60 23.1
496 0.28960 0.0 9.69 0.0 0.585 5.390 72.9 2.7986 6.0 391.0 19.2 396.90 21.14 19.7
497 0.26838 0.0 9.69 0.0 0.585 5.794 70.6 2.8927 6.0 391.0 19.2 396.90 14.10 18.3
498 0.23912 0.0 9.69 0.0 0.585 6.019 65.3 2.4091 6.0 391.0 19.2 396.90 12.92 21.2
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6.0 391.0 19.2 395.77 15.10 17.5
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33 16.8
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88 11.9

506 rows × 14 columns

In [40]:
from sklearn.linear_model import Lasso
las = Lasso( 0.5)
In [41]:
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
las.fit( X_train, y_train )
Out[41]:
Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
In [42]:
las.score( X_train, y_train )
Out[42]:
0.70096592417740755
In [43]:
las.score( X_test, y_test )
Out[43]:
0.75923310974139779
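Lasso minimizes (1/(2n))·||y − Xw||² + α·||w||₁, here with α = 0.5, and for regressors score returns the coefficient of determination R². The ℓ1 penalty tends to drive whole coefficients exactly to zero; a sketch (not an original cell) to see which features survive:

print(boston.feature_names[las.coef_ != 0])
print(las.coef_)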
In [44]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(  ) #max_depth=5, min_impurity_decrease=0.01, random_state=0
In [45]:
regressor.fit( boston.data, boston.target )
Out[45]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [46]:
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
In [47]:
regressor.fit( X_train, y_train )
Out[47]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [48]:
regressor.score( X_train, y_train )
Out[48]:
1.0
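An unconstrained tree keeps splitting until every training leaf is pure (or cannot be split further), so it reproduces the training targets exactly and R² on the training set is trivially 1.0; only the held-out score below is informative.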
In [49]:
regressor.score( X_test, y_test )
Out[49]:
0.78659716751605868
In [50]:
pred = regressor.predict( X_test )
pred
Out[50]:
array([ 10.2,  20.9,  14.1,  16.1,  15. ,  27.5,  18.2,  44.8,  18.5,
        11.9,  15. ,  15.6,  10.2,  24.4,  15.2,  23.9,  13.8,  12.1,
        19.4,  19.9,  28.7,  20.8,  19.2,  13.8,  23. ,  24.1,  31.5,
         6.3,  18.7,  19.5,  48.5,  24.8,  36.1,  22.2,  10.2,  21.2,
        20.4,   6.3,  16.1,  24.6,  35.2,  23.2,  18.2,  50. ,  27.9,
        24.8,   7.2,  13.8,  43.1,  19.3,  20.8,  19.6,  26.4,  23.4,
         6.3,  10.2,  35.2,  17.5,  21.8,   8.8,   8.7,  23.7,  18.7,
        25. ,  23.9,  26.5,  23.2,  20.1,  19.6,  22.5,  20. ,  31.6,
         8.5,  16.1,  28.6,  19.6])
In [51]:
y_test
Out[51]:
array([  7. ,  22. ,  16.7,  23.1,  17.8,  14.1,  14.5,  38.7,  21.7,
        13.9,  10.4,  13.1,  17.9,  25. ,  16.1,  28.4,  13.4,   8.3,
        17.8,  18.2,  29.8,  21.2,  19.3,  15.6,  23.3,  31.6,  33.8,
        10.5,  18.9,  21.2,  50. ,  22. ,  32.2,  20.7,  13.1,  24.5,
        21.7,   7.2,  27.5,  24.8,  21.9,  20.2,  20.9,  50. ,  22.8,
        28.7,  12.3,  17.8,  48.8,  21.8,  20.8,  19.3,  18.9,  22.8,
         8.8,  10.9,  42.8,  19.5,  21.2,  11. ,   8.3,  29. ,  11.9,
        28.7,  23.3,  22.8,  22.6,  23. ,  19.8,  22.6,  17.4,  31.7,
        11.5,  21.9,  32. ,  23.4])
In [52]:
plt.hist( pred/y_test )
Out[52]:
(array([  5.,  13.,  23.,  20.,   8.,   1.,   3.,   2.,   0.,   1.]),
 array([ 0.5698324 ,  0.70788462,  0.84593684,  0.98398906,  1.12204129,
         1.26009351,  1.39814573,  1.53619795,  1.67425017,  1.81230239,
         1.95035461]),
 <a list of 10 Patch objects>)
In [53]:
np.std(pred - y_test)
Out[53]:
4.1640244623430931
In [54]:
from sklearn.model_selection import cross_val_score
cross_val_score(regressor, boston.data, boston.target, cv=10)
Out[54]:
array([ 0.51877491,  0.57361183, -1.38257463,  0.41338543,  0.76374123,
        0.40432617, -0.16870971,  0.35307559, -3.93934968,  0.09842071])
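R² goes negative whenever a model does worse than simply predicting the fold's mean. With cv=10, cross_val_score uses consecutive, unshuffled folds, and the Boston rows are not randomly ordered, so some folds differ systematically from their training data; shuffling usually stabilizes the scores. A sketch of that remedy (an assumption, not an original cell):

from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=True, random_state=0)
cross_val_score(regressor, boston.data, boston.target, cv=cv)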
In [55]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor( max_depth=5, min_impurity_decrease=0.01 ) #random_state=0
In [56]:
regressor.fit( boston.data, boston.target )
Out[56]:
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.01,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [57]:
X_train, X_test, y_train, y_test = train_test_split( boston.data, boston.target, test_size=0.15)
In [58]:
regressor.fit( X_train, np.log(y_train) )
Out[58]:
DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.01,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')
In [59]:
regressor.score( X_train, np.log(y_train) )
Out[59]:
0.68568634697609676
In [60]:
regressor.score( X_test, np.log(y_test) )
Out[60]:
0.72165420750037956
In [61]:
pred = regressor.predict( X_test )
pred
Out[61]:
array([ 2.81782846,  3.1005066 ,  3.1005066 ,  3.1005066 ,  3.1005066 ,
        3.1005066 ,  3.1005066 ,  2.81782846,  3.1005066 ,  2.81782846,
        2.42970674,  3.52698811,  2.42970674,  3.1005066 ,  3.52698811,
        3.1005066 ,  3.52698811,  3.1005066 ,  2.81782846,  3.1005066 ,
        3.52698811,  3.1005066 ,  3.52698811,  3.1005066 ,  3.52698811,
        3.1005066 ,  3.52698811,  3.52698811,  3.1005066 ,  2.81782846,
        3.52698811,  3.1005066 ,  3.52698811,  2.81782846,  3.52698811,
        2.81782846,  3.1005066 ,  2.42970674,  2.42970674,  3.1005066 ,
        2.42970674,  3.1005066 ,  3.1005066 ,  3.1005066 ,  3.1005066 ,
        3.1005066 ,  3.1005066 ,  3.1005066 ,  3.52698811,  2.42970674,
        2.42970674,  3.52698811,  2.81782846,  2.42970674,  3.1005066 ,
        3.52698811,  3.1005066 ,  2.81782846,  2.81782846,  2.42970674,
        3.1005066 ,  2.81782846,  2.42970674,  2.81782846,  2.42970674,
        3.52698811,  3.1005066 ,  3.52698811,  3.1005066 ,  3.1005066 ,
        2.42970674,  2.42970674,  2.42970674,  2.81782846,  3.1005066 ,
        3.1005066 ])
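The predictions take only four distinct values (≈ 2.43, 2.82, 3.10, 3.53 in log-price): the strongly pruned tree has just a handful of leaves, and every test point falling into a leaf receives that leaf's mean log target.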
In [62]:
np.log(y_test)
Out[62]:
array([ 2.72129543,  2.8678989 ,  3.00071982,  2.67414865,  2.9601051 ,
        2.9601051 ,  2.4765384 ,  2.74727091,  3.35689712,  3.07731226,
        2.62466859,  3.77963382,  2.60268969,  3.10009229,  3.63495111,
        3.19458313,  3.5085559 ,  3.02529108,  2.82137889,  3.06339092,
        3.54385368,  3.91202301,  3.763523  ,  3.04927304,  3.55248683,
        3.08190997,  3.16124671,  3.35689712,  3.38099467,  2.52572864,
        3.91202301,  3.21887582,  3.58351894,  2.96527307,  3.78418963,
        2.70136121,  3.20274644,  2.53369681,  2.94968834,  3.12676054,
        2.2617631 ,  3.19047635,  2.97552957,  2.97041447,  3.09104245,
        2.99573227,  2.99573227,  3.39450839,  3.77276094,  1.97408103,
        2.34180581,  3.49953328,  2.87919846,  2.58776404,  3.09557761,
        3.44041809,  3.0056826 ,  2.97041447,  2.99573227,  2.32238772,
        3.37073817,  2.96527307,  2.27212589,  2.62466859,  2.67414865,
        3.09104245,  3.22286785,  3.74478709,  3.07731226,  3.06339092,
        2.70136121,  2.11625551,  2.32238772,  2.99573227,  3.05400118,
        3.12676054])
In [63]:
plt.hist(pred - np.log(y_test) )
Out[63]:
(array([  1.,   0.,   2.,  10.,  16.,  18.,  21.,   3.,   4.,   1.]),
 array([-0.81151641, -0.66796795, -0.52441949, -0.38087103, -0.23732256,
        -0.0937741 ,  0.04977436,  0.19332282,  0.33687128,  0.48041974,
         0.6239682 ]),
 <a list of 10 Patch objects>)
In [64]:
plt.hist(np.exp(pred - np.log(y_test)))
Out[64]:
(array([  1.,   2.,  20.,  17.,  24.,   5.,   2.,   4.,   0.,   1.]),
 array([ 0.44418399,  0.58639752,  0.72861105,  0.87082458,  1.01303811,
         1.15525164,  1.29746517,  1.4396787 ,  1.58189223,  1.72410576,
         1.86631929]),
 <a list of 10 Patch objects>)
In [65]:
#from sklearn.datasets import fetch_california_housing
#housing = fetch_california_housing()
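
Nearest neighbor methods
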
In [66]:
from sklearn.neighbors import KNeighborsClassifier
In [67]:
neigh = KNeighborsClassifier(n_neighbors=3)
In [68]:
X_train, X_test, y_train, y_test = train_test_split( iris.data, iris.target, test_size=0.15)
In [69]:
neigh.fit( X_train, y_train )
Out[69]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
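KNeighborsClassifier predicts by majority vote among the k = 3 training points closest to the query in the Minkowski metric with p = 2, i.e. ordinary Euclidean distance.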
In [70]:
neigh.score( X_train, y_train )
Out[70]:
0.96062992125984248
In [71]:
neigh.score( X_test, y_test )
Out[71]:
0.95652173913043481
In [72]:
from sklearn.model_selection import cross_val_score
cross_val_score( neigh, iris.data, iris.target, cv=10)
Out[72]:
array([ 1.        ,  0.93333333,  1.        ,  0.93333333,  0.86666667,
        1.        ,  0.93333333,  1.        ,  1.        ,  1.        ])
In [73]:
for n in range(1,5):
    neigh = KNeighborsClassifier(n_neighbors=n)
    print(np.mean(cross_val_score( neigh, iris.data, iris.target, cv=10)))
0.96
0.953333333333
0.966666666667
0.966666666667
In [74]:
from sklearn.neighbors import RadiusNeighborsClassifier
rad = RadiusNeighborsClassifier( radius = 0.75 )
cross_val_score( rad, iris.data, iris.target, cv=10)
Out[74]:
array([ 1.        ,  0.93333333,  1.        ,  1.        ,  1.        ,
        0.86666667,  0.93333333,  0.93333333,  1.        ,  1.        ])
In [75]:
for r in np.arange(0.75, 5, 0.15):
    rad = RadiusNeighborsClassifier( radius = r )
    print(r, np.mean(cross_val_score( rad, iris.data, iris.target, cv=10)))
0.75 0.966666666667
0.9 0.953333333333
1.05 0.953333333333
1.2 0.94
1.35 0.92
1.5 0.886666666667
1.65 0.893333333333
1.8 0.886666666667
1.95 0.893333333333
2.1 0.9
2.25 0.88
2.4 0.866666666667
2.55 0.88
2.7 0.893333333333
2.85 0.873333333333
3.0 0.82
3.15 0.786666666667
3.3 0.746666666667
3.45 0.7
3.6 0.633333333333
3.75 0.58
3.9 0.56
4.05 0.52
4.2 0.48
4.35 0.44
4.5 0.413333333333
4.65 0.38
4.8 0.36
4.95 0.333333333333
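Accuracy decays as the radius grows: ever more distant points of other classes enter each vote, and in the limit every query sees (almost) the whole training set, so the prediction collapses to a majority class. On the balanced iris data that drives 10-fold accuracy toward 1/3 ≈ 0.33, exactly what the tail of the sweep shows.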
In [76]:
X = iris.data[:, [0, 2]]
y = iris.target

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))

for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [KNeighborsClassifier(n_neighbors=1), KNeighborsClassifier(n_neighbors=3),
                         RadiusNeighborsClassifier( radius = 4. ), DecisionTreeClassifier(max_depth=4, min_impurity_decrease=0.1)],
                        ['KNeighborsClassifier, k=1', 'KNeighborsClassifier, k=3',
                         'RadiusNeighborsClassifier, radius=4', 'DecisionTreeClassifier, depth=4']):

    clf.fit( X, y )
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)

plt.show()
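The last figure contrasts the four classifiers in the same (sepal length, petal length) plane: 1-NN yields the most fragmented boundary, 3-NN smooths it somewhat, the radius-4 classifier is heavily oversmoothed (consistent with its poor cross-validation score above), and the pruned decision tree produces axis-aligned rectangular regions.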