티스토리 뷰

0장_아이리스

Numpy를 활용한 Iris 데이터 분석

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Read only the first line (the CSV header row). The context manager closes
# the file as soon as the line has been read instead of leaking the handle.
with open('iris.csv') as f:
    s = f.readline()
# Split the header on commas and drop the last column name, keeping only the
# feature names.
# NOTE: if the header fields were wrapped in double quotes, each name would
# additionally need .strip('"').
header = s.strip().split(',')[:-1]
header
Out[2]:
['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
In [3]:
labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']


def _label_to_index(field):
    """Return the position in ``labels`` of the species name read from the CSV.

    Raises ValueError (from ``list.index``) if the name is not in ``labels``.
    """
    # np.loadtxt passes converters either bytes or str depending on the NumPy
    # version and the ``encoding`` argument, so accept both.
    if isinstance(field, bytes):
        field = field.decode()
    return labels.index(field)


# Skip the header row and convert column 4 (the species name) to a numeric
# class index so the whole table loads as one float array.
iris = np.loadtxt('iris.csv', delimiter=',', skiprows=1, converters={4: _label_to_index})
In [4]:
# Show the array's shape followed by its first five rows.
display(iris.shape, iris[:5])
(150, 5)
array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ]])
In [5]:
# Split the loaded table: the first four columns go to X, and column 4
# (the class index produced by the converter) goes to y.
X, y = iris[:, :4], iris[:, 4]
In [6]:
# Show both shapes first, then the first five rows of X and the full y vector.
display(X.shape, y.shape, X[:5], y)
(150, 4)
(150,)
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])
In [7]:
# Pairwise correlation between the columns of X; rowvar=False tells corrcoef
# that each column (not each row) is a variable.
corr = np.corrcoef(X, rowvar=False)
corr
Out[7]:
array([[ 1.        , -0.10936925,  0.87175416,  0.81795363],
       [-0.10936925,  1.        , -0.4205161 , -0.35654409],
       [ 0.87175416, -0.4205161 ,  1.        ,  0.9627571 ],
       [ 0.81795363, -0.35654409,  0.9627571 ,  1.        ]])
In [8]:
# Heat map of the correlation matrix on a fixed [-1, 1] colour scale, with
# the column names from `header` as tick labels on both axes.
tick_positions = range(4)
plt.imshow(corr, cmap='spring', vmin=-1, vmax=1, interpolation='none')
plt.yticks(tick_positions, header)
plt.xticks(tick_positions, header, rotation=90)
plt.colorbar()
Out[8]:
<matplotlib.colorbar.Colorbar at 0x9facb662e8>
In [9]:
# Box plot with one box per column of X.
plt.boxplot(X)
# Label tick positions 1..4 with the column names (presumably matching
# boxplot's default box positions, which start at 1).
plt.xticks(range(1,5), header, rotation=90)
Out[9]:
([<matplotlib.axis.XTick at 0x9faca10be0>,
  <matplotlib.axis.XTick at 0x9faca10518>,
  <matplotlib.axis.XTick at 0x9faca10278>,
  <matplotlib.axis.XTick at 0x9facab5a58>],
 <a list of 4 Text xticklabel objects>)
In [10]:
plt.figure(figsize=[12, 8])

# One panel per column of X: column value on the x axis, class index on the
# y axis.
for col in range(4):
    plt.subplot(2, 2, col + 1)
    # Small Gaussian jitter (sigma 0.03) on the class axis spreads out points
    # that would otherwise sit exactly on 0, 1 or 2.
    jitter = np.random.normal(0, 0.03, size=len(y))
    plt.scatter(X[:, col], y + jitter, c=y, s=30, alpha=0.3)
    plt.yticks([0, 1, 2], ['Setosa', 'Versicolor', 'Virginica'], rotation=90)
    plt.title(header[col], fontsize=15)
In [11]:
# Wrap X in a DataFrame whose columns are named by `header`.
iris_df = pd.DataFrame(data=X, columns=header)
iris_df
Out[11]:
SepalLength SepalWidth PetalLength PetalWidth
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
20 5.4 3.4 1.7 0.2
21 5.1 3.7 1.5 0.4
22 4.6 3.6 1.0 0.2
23 5.1 3.3 1.7 0.5
24 4.8 3.4 1.9 0.2
25 5.0 3.0 1.6 0.2
26 5.0 3.4 1.6 0.4
27 5.2 3.5 1.5 0.2
28 5.2 3.4 1.4 0.2
29 4.7 3.2 1.6 0.2
... ... ... ... ...
120 6.9 3.2 5.7 2.3
121 5.6 2.8 4.9 2.0
122 7.7 2.8 6.7 2.0
123 6.3 2.7 4.9 1.8
124 6.7 3.3 5.7 2.1
125 7.2 3.2 6.0 1.8
126 6.2 2.8 4.8 1.8
127 6.1 3.0 4.9 1.8
128 6.4 2.8 5.6 2.1
129 7.2 3.0 5.8 1.6
130 7.4 2.8 6.1 1.9
131 7.9 3.8 6.4 2.0
132 6.4 2.8 5.6 2.2
133 6.3 2.8 5.1 1.5
134 6.1 2.6 5.6 1.4
135 7.7 3.0 6.1 2.3
136 6.3 3.4 5.6 2.4
137 6.4 3.1 5.5 1.8
138 6.0 3.0 4.8 1.8
139 6.9 3.1 5.4 2.1
140 6.7 3.1 5.6 2.4
141 6.9 3.1 5.1 2.3
142 5.8 2.7 5.1 1.9
143 6.8 3.2 5.9 2.3
144 6.7 3.3 5.7 2.5
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns

In [12]:
# Pairwise scatter plots of every column against every other column, with
# points coloured by the class index in y.
pd.plotting.scatter_matrix(iris_df, c=y, s=60, alpha=0.8, figsize=[12,12])
Out[12]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACCE4C50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACCC3208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACBA3358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACCB7828>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACBB0EB8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACBB0EF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD834C18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD86A2E8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACD83978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACDB9048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACDDF6D8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FACE69D68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD2CB438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD2F1AC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD323198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000009FAD34D828>]],
      dtype=object)
In [13]:
# Print the index range, per-column non-null counts and dtypes, and memory use.
iris_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
SepalLength    150 non-null float64
SepalWidth     150 non-null float64
PetalLength    150 non-null float64
PetalWidth     150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
In [14]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
iris_df.describe()
Out[14]:
SepalLength SepalWidth PetalLength PetalWidth
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
In [15]:
# Histogram plot of the DataFrame's columns, drawn semi-transparent (alpha=0.3).
iris_df.plot(kind='hist', alpha=0.3)
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x9fae94e518>
In [16]:
# Compute 20 shared bins over the whole first column so that the three
# per-class histograms use identical bin edges.
first_column = X[:, 0]
ct, bins = np.histogram(first_column, 20)
for cls in range(3):
    plt.hist(first_column[y == cls], bins=bins, alpha=0.3)
plt.legend(['Setosa', 'Versicolor', 'Virginica'])
Out[16]:
<matplotlib.legend.Legend at 0x9faea4bef0>
In [17]:
titles = ['Setosa', 'Versicolor', 'Virginica']
plt.figure(figsize=[12, 8])

# 2x2 grid, one panel per column of X, each showing the per-class histograms.
for col in range(4):
    plt.subplot(2, 2, col + 1)
    plt.title(header[col], fontsize=15)
    column = X[:, col]
    # Bin edges are computed on the full column so all classes share them.
    ct, bins = np.histogram(column, 20)
    for cls in range(3):
        plt.hist(column[y == cls], bins=bins, alpha=0.3)
    # Same y range in every panel.
    plt.ylim(0, 40)
    # The legend is drawn only in the first panel.
    if col == 0:
        plt.legend(titles)
In [18]:
N = 30

# Each line is one sample traced across the four columns of X. N rows are
# taken starting at offsets 0, 50 and 100 and drawn in red, blue and green.
for start, fmt in zip((0, 50, 100), ('r-', 'b-', 'g-')):
    plt.plot(X[start:start + N].T, fmt, alpha=0.3)
plt.xticks(range(4), header)
Out[18]:
([<matplotlib.axis.XTick at 0x9faeee5b00>,
  <matplotlib.axis.XTick at 0x9faef88828>,
  <matplotlib.axis.XTick at 0x9faef4a7f0>,
  <matplotlib.axis.XTick at 0x9faea98ba8>],
 <a list of 4 Text xticklabel objects>)
In [19]:
# Rows of X grouped by class index: Xs[k] holds the rows where y == k.
Xs = [X[y == cls] for cls in range(3)]
In [20]:
plt.figure(figsize=[8, 12])

# Three stacked panels, one per class index, each plotting every column of
# that class's rows against the row position.
for i in range(3):
    plt.subplot(3, 1, i + 1)
    class_rows = X[y == i]
    plt.plot(class_rows)
    plt.title(titles[i], fontsize=15)
    plt.ylim(0, 10)
    # Legend only on the top panel, x-axis label only on the bottom one.
    if i == 0:
        plt.legend(header)
    if i == 2:
        plt.xlabel('samples')
In [21]:
# Indices of the two columns of X to plot against each other.
col1, col2 = 0, 1

# Scatter of the chosen columns, coloured by the class index in y.
plt.scatter(X[:, col1], X[:, col2], s=60, c=y)
plt.colorbar(shrink=0.5)
plt.xlabel(header[col1])
plt.ylabel(header[col2])
Out[21]:
Text(0,0.5,'SepalWidth')

'beginner > 파이썬 머신러닝 기초' 카테고리의 다른 글

머신러닝 기초_비용함수  (0) 2019.02.22
머신러닝 기초 _ 거리  (0) 2019.02.22
머신러닝과 파이썬  (0) 2019.02.21
Scikit-learn 기초  (0) 2019.02.20
머신러닝 기초  (0) 2019.02.20
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2024/05   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
글 보관함