티스토리 뷰
유방암 데이터 활용¶
- scikit-learn 에는 유방암 데이터가 기본적으로 들어있다. 머신러닝 학습에 많이 사용되는 데이터 이므로 익숙해지자.
- 총 30개의 속성과 malignant(악성), benign(양성) 의 두가지 타겟값을 가지고 있다.
In [1]:
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
In [2]:
# Load the built-in Wisconsin breast-cancer dataset.
# `cancer` is a sklearn Bunch used by every cell below
# (.data: (569, 30) feature matrix, .target: 0=malignant / 1=benign).
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
In [3]:
type(cancer)
Out[3]:
In [4]:
dir(cancer)
Out[4]:
In [5]:
cancer.data.shape
Out[5]:
In [6]:
cancer.feature_names
Out[6]:
In [7]:
cancer.target_names
Out[7]:
In [8]:
cancer.target
Out[8]:
In [9]:
np.bincount(cancer.target) # cancer.target[cancer.target==0].shape
Out[9]:
In [10]:
print(cancer.DESCR)
In [11]:
# Print every feature with a zero-padded index.
for idx, feat in enumerate(cancer.feature_names):
    # %02d left-pads the index with a zero to two digits;
    # plain %d would drop the padding.
    print('%02d : %s' % (idx, feat))
In [12]:
cancer.target_names # malignant(악성), benign(양성)
Out[12]:
In [13]:
# Report the overall shapes, then split the rows by class label:
# target == 0 marks malignant samples, target == 1 benign ones.
print('data =>', cancer.data.shape)
print('target =>', cancer.target.shape)
malignant = cancer.data[cancer.target == 0]
benign = cancer.data[cancer.target == 1]
print('malignant(악성) =>', malignant.shape)
print('benign(양성) =>', benign.shape)
In [14]:
# Fix a shared set of 20 bin edges from the first feature so the two
# class histograms below are directly comparable; the second call just
# echoes the (counts, edges) pair in the notebook.
_, bins = np.histogram(cancer.data[:, 0], bins=20)
np.histogram(cancer.data[:, 0], bins=20)
Out[14]:
In [15]:
# Overlay the class-wise histograms of the first feature on the shared
# bin edges computed above.
plt.hist(malignant[:, 0], bins=bins, alpha=0.3)
plt.hist(benign[:, 0], bins=bins, alpha=0.3)
plt.title(cancer.feature_names[0])
Out[15]:
In [16]:
# One histogram panel per feature, malignant vs. benign overlaid,
# laid out on an 8x4 grid.
plt.figure(figsize=[20, 15])
for col in range(30):
    plt.subplot(8, 4, col + 1)
    # Per-feature bin edges so both classes share the same bins.
    _, bins = np.histogram(cancer.data[:, col], bins=20)
    plt.hist(malignant[:, col], bins=bins, alpha=0.3)
    plt.hist(benign[:, col], bins=bins, alpha=0.3)
    plt.title(cancer.feature_names[col])
    if col == 0:
        plt.legend(cancer.target_names)
    plt.xticks([])
- 10번 반복해서 Logistic Regression 과 선형 SVM 을 적용해 보자. train_test_split() 함수에서 랜덤하게 데이터를 나누기 때문에 매번 점수가 달라진다.
- 선형 SVM 의 결과가 좋지 못하다. 이것은 데이터 정규화를 하지 않았기 때문인데 뒤에서 다루겠다.
In [17]:
# Fit logistic regression on 10 random train/test splits; the score
# varies per trial because train_test_split reshuffles each time.
from sklearn.linear_model import LogisticRegression

scores = []
for trial in range(10):
    X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)
print('scores =', scores)
In [18]:
# Same 10-trial experiment with a linear SVM; scores are noticeably
# worse here because the features are not yet normalized.
from sklearn.svm import LinearSVC

scores = []
for trial in range(10):
    X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
    model = LinearSVC()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)
print('scores =', scores)
In [19]:
# Titled 8x4 grid of per-feature class-conditional histograms, 50 bins,
# with the column index appended to each panel title.
fig = plt.figure(figsize=[14, 14])
fig.suptitle('Breast Cancer - feature analysis', fontsize=20)
for col in range(cancer.feature_names.shape[0]):  # 30 features
    plt.subplot(8, 4, col + 1)
    _, bins = np.histogram(cancer.data[:, col], bins=50)
    plt.hist(malignant[:, col], bins=bins, alpha=0.5, label='malignant', color='red')
    plt.hist(benign[:, col], bins=bins, alpha=0.5, label='benign', color='green')
    plt.title(cancer.feature_names[col] + ('(%d)' % col))
    plt.xticks([])
    plt.yticks([])
    if col == 0:
        plt.legend()
In [20]:
# Same grid, but scatter each feature against the 0/1 target instead of
# histograms; point color encodes the class label.
fig = plt.figure(figsize=[14, 14])
fig.suptitle('Breast Cancer - feature analysis', fontsize=20)
for col in range(cancer.feature_names.shape[0]):  # 30 features
    plt.subplot(8, 4, col + 1)
    plt.scatter(cancer.data[:, col], cancer.target, c=cancer.target, alpha=0.5)
    plt.title(cancer.feature_names[col] + ('(%d)' % col))
    plt.xticks([])
    plt.yticks([])
In [21]:
# Scatter one feature against another, colored by class.
# FIX: col1/col2 were assigned but never used — the scatter hard-coded
# columns 0 and 1; plot the intended feature pair instead.
col1 = 15
col2 = 28
plt.scatter(cancer.data[:, col1], cancer.data[:, col2], c=cancer.target, alpha=0.3)
Out[21]:
In [22]:
# Scatter 'mean radius' (column 0) against every feature on a 5x6 grid
# of axes; 'winter' colormap separates the two classes.
fig, axes = plt.subplots(5, 6, figsize=[12, 20])
fig.suptitle('mean radius vs others', fontsize=20)
flat_axes = axes.ravel()
for i in range(30):
    ax = flat_axes[i]
    ax.scatter(cancer.data[:, 0], cancer.data[:, i], c=cancer.target, cmap='winter', alpha=0.1)
    ax.set_title(cancer.feature_names[i] + ('\n(%d)' % i))
    ax.set_axis_off()
In [23]:
# Too many features to eyeball every pairwise plot, so quantify the
# linear relationships with a correlation matrix instead.
# np.corrcoef treats each ROW as one variable, hence the transpose.
mat = np.corrcoef(cancer.data.T)
mat
# Pearson correlation: the covariance of two columns divided by the
# product of their standard deviations (values in [-1, 1]).
Out[23]:
In [24]:
# Sanity-check the heatmap idea on the small iris correlation matrix.
from sklearn.datasets import load_iris

iris = load_iris()
mat_iris = np.corrcoef(iris.data.T)
plt.imshow(mat_iris, vmin=-1, vmax=1)
# FIX: the bare colorbar() only resolved through the %pylab star import;
# call it through the explicit pyplot namespace so the cell works
# without the magic.
plt.colorbar()
Out[24]:
In [25]:
mat.shape, mat[4,24]
Out[25]:
In [26]:
plt.scatter(cancer.data[:,4], cancer.data[:,24], alpha=0.1)
Out[26]:
In [27]:
# 속성간의 관계를 한 눈에 파악
# Full 30x30 correlation heatmap with feature names on the x axis;
# trailing print('') suppresses the notebook's Out[] echo.
fig = plt.figure(figsize=[14, 14])
plt.title('Breast Cancer - Correlation Coefficient', fontsize=20)
plt.imshow(mat, interpolation='none', vmin=-1, vmax=1)
plt.colorbar(shrink=0.7)
plt.xticks(range(30), cancer.feature_names, rotation=90, ha='center')
plt.yticks(range(30))
print('')
In [28]:
# Box plots of the raw features — they reveal that the feature scales
# differ wildly, motivating normalization below.
fig = plt.figure(figsize=[10, 8])
plt.title('Cancer - boxplot for features', fontsize=15)
plt.boxplot(cancer.data)
plt.xticks(np.arange(30) + 1, cancer.feature_names, rotation=90)
# plt.ylim(0,1)  # try shrinking the y-axis range
plt.xlabel('features')
plt.ylabel('scale')
print('')
단위가 서로 다르다.
cancer.data를 평균 0, 표준편차 1로 변환하자
In [29]:
# Standardize: shift each feature to mean 0 and scale to std 1.
m = cancer.data.mean(axis=0)   # per-feature mean
s = cancer.data.std(axis=0)    # per-feature standard deviation
data2 = (cancer.data - m) / s  # standardized copy, used in later cells
In [30]:
# After standardization every feature sits on a comparable scale.
fig = plt.figure(figsize=[20, 15])
plt.boxplot(data2)
pass
In [31]:
# Min-max scaling: map every feature into the [0, 1] range, then
# check the result with another box plot.
m1 = cancer.data.max(axis=0)  # per-feature maximum
m2 = cancer.data.min(axis=0)  # per-feature minimum
data3 = (cancer.data - m2) / (m1 - m2)
fig = plt.figure(figsize=[20, 15])
plt.boxplot(data3)
pass
크기를 맞췄다.
kNN, LinearSVM, Logistic Regression을 적용¶
- train_test_split() 적용 후 score 확인
- 원본 데이터와 정규화된 데이터 비교
In [32]:
# Baseline: 5-nearest-neighbors on the raw (unscaled) features.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
score
Out[32]:
In [49]:
# Same 5-NN, but on the standardized data (data2) for comparison.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X2_train, X2_test, y_train, y_test = train_test_split(data2, cancer.target)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X2_train, y_train)
score = model.score(X2_test, y_test)
score
Out[49]:
In [34]:
# Linear SVM on the raw split — suffers without feature scaling.
from sklearn.svm import LinearSVC

model = LinearSVC(C=1)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)
In [35]:
# Linear SVM on the standardized split for comparison.
from sklearn.svm import LinearSVC

model = LinearSVC(C=1)
model.fit(X2_train, y_train)
score = model.score(X2_test, y_test)
print(score)
In [36]:
# Logistic regression on the raw split.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
score
Out[36]:
In [50]:
# Logistic regression on the standardized split for comparison.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X2_train, y_train)
score = model.score(X2_test, y_test)
score
Out[50]:
In [38]:
def sigmoid(x, k=10):
    """Logistic sigmoid with adjustable steepness.

    Generalizes the original hard-coded slope of 10: ``sigmoid(x)`` with
    the default ``k=10`` reproduces ``1 / (1 + exp(-10 * x))`` exactly,
    so existing callers are unaffected.

    Parameters
    ----------
    x : float or array-like
        Input value(s).
    k : float, optional
        Steepness of the transition around x = 0 (default 10).

    Returns
    -------
    float or numpy.ndarray
        Value(s) in the open interval (0, 1).
    """
    return 1 / (1 + np.exp(-k * x))
In [39]:
# Fresh split; compare training vs. test accuracy for logistic
# regression to gauge over/under-fitting.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target)
model = LogisticRegression()
model.fit(X_train, y_train)
score_train = model.score(X_train, y_train)
score_test = model.score(X_test, y_test)
print(score_train, score_test)
In [47]:
# Kernel SVM (SVC) on the raw features.
from sklearn.svm import SVC

model = SVC(C=1)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(score)
In [51]:
# Kernel SVM on the standardized split for comparison.
from sklearn.svm import SVC

model = SVC(C=1)
model.fit(X2_train, y_train)
score = model.score(X2_test, y_test)
print(score)
In [44]:
# Standardize using statistics computed on the TRAINING set only, then
# apply the same transform to the test set (avoids test-set leakage).
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_norm = (X_train - X_mean) / X_std
X_test_norm = (X_test - X_mean) / X_std
plt.boxplot(X_train_norm)
print('')
In [45]:
# Kernel SVM again, now on the leak-free standardized features.
from sklearn.svm import SVC

model = SVC(C=1)
model.fit(X_train_norm, y_train)
score = model.score(X_test_norm, y_test)
print(score)
'beginner > 파이썬 머신러닝 기초' 카테고리의 다른 글
유방암 데이터 분석 by SVM (0) | 2019.03.12 |
---|---|
지도학습 - kernel SVM (0) | 2019.03.07 |
지도학습 - 로지스틱회귀 (0) | 2019.03.05 |
지도학습 - LinearSVM_2 (0) | 2019.03.05 |
지도학습 - LinearSVM_1 (0) | 2019.03.05 |