
2장_06_로지스틱회귀

Logistic Regression

  • Like LinearSVM, logistic regression separates the classes with a straight line (or a plane in higher dimensions).
  • The sigmoid function is a smooth stand-in for the step function (think of it as a switch) that reports a probability.
  • Fit a linear regression function and feed it into the sigmoid. This is why the method is still called "regression" even though it is mostly used for classification (see the short sketch after this list).
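A minimal sketch of the idea (the weights below are illustrative values, roughly what the fit further down produces): the linear score w·x + b is squashed by the sigmoid into a probability, and thresholding at 0.5 gives the class.

import numpy as np

def sigmoid(t):
    return 1 / (1 + np.exp(-t))

w, b = np.array([0.67, 0.64]), -2.77   # illustrative weights, close to the fit below
x = np.array([5.0, 5.0])               # a sample near the second blob's center

p = sigmoid(w @ x + b)                 # P(class 1)
print(p, int(p >= 0.5))                # probability and the 0/1 prediction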
In [2]:
import numpy as np
import matplotlib.pyplot as plt
In [3]:
from sklearn.datasets import make_blobs

# 400 samples, 2 features, two clusters centered at (0,0) and (5,5) with std 2 and 3
X, y = make_blobs(n_samples=400, n_features=2, centers=[[0,0],[5,5]], cluster_std=[2,3])

plt.scatter(X[:,0], X[:,1], c=y, s=60, alpha=0.3)
plt.colorbar()
plt.title('make_blobs() - 400 samples')
Out[3]:
Text(0.5,1,'make_blobs() - 400 samples')
In [16]:
%%time

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X, y)
score = model.score(X, y)
score
Wall time: 3 ms
In [5]:
display(model.coef_, model.intercept_)
array([[0.6691802 , 0.63815211]])
array([-2.77155393])
In [6]:
import mglearn

plt.figure(figsize=[8,6])
mglearn.plots.plot_2d_classification(model, X, cm='Reds', alpha=0.3)
mglearn.discrete_scatter(X[:,0], X[:,1], y)
Out[6]:
[<matplotlib.lines.Line2D at 0x1844a5e1240>,
 <matplotlib.lines.Line2D at 0x1844a5e1358>]

Applying LinearSVM

In [15]:
%%time

from sklearn.svm import LinearSVC

model = LinearSVC(C=1)
model.fit(X, y)

score = model.score(X, y)
print(score)
0.9025
Wall time: 17 ms
In [14]:
display(model.coef_, model.intercept_)
array([[0.22655459, 0.21809727]])
array([-0.97457533])

The coefficients look different from the ones obtained above, but in ax + by + c = 0 only the ratio of a to b matters, so the slopes of the two boundaries are effectively the same.
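A quick numeric check of this claim, using the two coefficient vectors printed above (only the slope -w0/w1 of the boundary line matters):

import numpy as np

w_lr  = np.array([0.6691802 , 0.63815211])    # LogisticRegression coef_ from above
w_svc = np.array([0.22655459, 0.21809727])    # LinearSVC coef_ from above

# the boundary line is x2 = -(w0*x1 + b)/w1, so its slope is -w0/w1
print(-w_lr[0] / w_lr[1], -w_svc[0] / w_svc[1])   # roughly -1.05 vs -1.04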

In [9]:
import mglearn

plt.figure(figsize=[8,6])
mglearn.plots.plot_2d_classification(model, X, eps=0.5, cm='spring')
mglearn.discrete_scatter(X[:,0], X[:,1],y)
Out[9]:
[<matplotlib.lines.Line2D at 0x184496c3160>,
 <matplotlib.lines.Line2D at 0x184496c3278>]
In [14]:
def sigmoid(x):
    return 1/(1+np.exp(-x))
In [16]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=[10,8])
ax = Axes3D(fig)

a = np.arange(-4,12,0.2)
b = np.arange(-4,12,0.2)
xx, yy = np.meshgrid(a,b)
ax.plot_surface(xx, yy, model.coef_[0,0]*xx + model.coef_[0,1]*yy + model.intercept_[0],
                shade=True, alpha=0.1, color='b')
ax.plot_wireframe(xx, yy, model.coef_[0,0]*xx + model.coef_[0,1]*yy + model.intercept_[0],
                  rstride=2, cstride=2, color='0.5')

ax.scatter(X[:,0], X[:,1], y, c=y, s=60)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('target')

ax.view_init(60, 70)
In [17]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)

a = np.arange(-4, 12, 0.2)
b = np.arange(-4, 12, 0.2)
xx, yy = np.meshgrid(a,b)

ax.plot_surface(xx, yy, xx + yy, alpha=0.1)

ax.scatter(X[:,0], X[:,1], y, c=y, s=60)
Out[17]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1fb624747b8>
In [71]:
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=[10,8])
ax = Axes3D(fig)

a = np.arange(-4,12,0.2)
b = np.arange(-4,12,0.2)
xx, yy = np.meshgrid(a,b)
ax.plot_surface(xx, yy, sigmoid(model.coef_[0,0]*xx + model.coef_[0,1]*yy + model.intercept_[0]),
                shade=True, alpha=0.3, color='b')
ax.plot_wireframe(xx, yy, sigmoid(model.coef_[0,0]*xx + model.coef_[0,1]*yy + model.intercept_[0]),
                  rstride=2, cstride=2, color='0.5')

ax.scatter(X[:,0], X[:,1], y, c=y, s=60)

ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('target')

ax.view_init(20, -80)

Calling predict_proba returns the probability that each sample belongs to each class.

In [74]:
display(model.predict_proba(X)[:10], y[:10])
array([[9.98664212e-01, 1.33578826e-03],
       [2.96447658e-03, 9.97035523e-01],
       [9.87513032e-03, 9.90124870e-01],
       [9.73755412e-01, 2.62445881e-02],
       [3.40304775e-05, 9.99965970e-01],
       [1.43541711e-06, 9.99998565e-01],
       [1.51971018e-04, 9.99848029e-01],
       [3.31922924e-04, 9.99668077e-01],
       [9.92520641e-01, 7.47935859e-03],
       [1.24521532e-04, 9.99875478e-01]])
array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1])
In [77]:
model.decision_function(X)[:10]
Out[77]:
array([-6.61689702,  5.81808591,  4.60781156, -3.61370036, 10.28822001,
       13.45405364,  8.79166875,  8.0102758 , -4.88810077,  8.99090738])
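For binary logistic regression, predict_proba is just the sigmoid applied to decision_function. A minimal consistency check (this assumes model is the fitted LogisticRegression from above; LinearSVC has no predict_proba):

import numpy as np

z = model.decision_function(X)                 # linear scores w.x + b
p1 = model.predict_proba(X)[:, 1]              # P(class 1)
print(np.allclose(p1, 1 / (1 + np.exp(-z))))   # expected: True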

The case of three classes

In [20]:
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, n_features=2, centers=[[0,0],[-10,10],[10,10]], cluster_std=[1,3,5])
# 300 samples in total, 2 features (axes), clusters centered at (0,0), (-10,10), and (10,10),
# with standard deviations 1, 3, and 5 respectively.


plt.scatter(X[:,0], X[:,1], c=y, alpha=0.5)
plt.colorbar()
Out[20]:
<matplotlib.colorbar.Colorbar at 0x18449b07f28>
In [21]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X, y)
score = model.score(X, y)
score
Out[21]:
0.9933333333333333
In [25]:
display(model.coef_, model.intercept_)

# These are not the three solid lines forming the Y shape in the figure below;
# they are the slopes and intercepts of the three dotted lines, each satisfying
# w0 + w1*x1 + w2*x2 = 0
array([[-0.24512711, -1.5996453 ],
       [-0.78677575,  0.13901978],
       [ 0.85448731,  0.49440711]])
array([ 3.16102611, -3.48958455, -3.54175398])
In [23]:
import mglearn

plt.figure(figsize=[10,8])
mglearn.plots.plot_2d_classification(model, X, cm='Reds', alpha=0.3)
mglearn.discrete_scatter(X[:,0], X[:,1], y)
Out[23]:
[<matplotlib.lines.Line2D at 0x18449b7ac50>,
 <matplotlib.lines.Line2D at 0x18449b7ad68>,
 <matplotlib.lines.Line2D at 0x18449b832b0>]
In [29]:
plt.figure(figsize=[10,8])
mglearn.plots.plot_2d_classification(model, X, cm='spring', alpha=0.3)
mglearn.discrete_scatter(X[:,0], X[:,1], y)

w=model.coef_
b=model.intercept_

rng = np.array([X[:,0].min(), X[:,0].max()])
for i in range(3):
    plt.plot(rng, -(w[i,0]*rng + b[i])/w[i,1], ':', lw=4)
    
    
# The three dotted lines are really cross-sections of three sigmoid surfaces, one per class.
In [88]:
display(model.predict_proba(X)[:10], y[:10])

# Three columns appear because each column is the probability of belonging to that class.
# These numbers explain why the figure above is drawn the way it is.
array([[9.08637445e-01, 8.79690768e-03, 8.25656473e-02],
       [9.43407738e-07, 1.47955517e-06, 9.99997577e-01],
       [4.43659230e-13, 1.41196151e-04, 9.99858804e-01],
       [1.23996289e-09, 7.38273103e-05, 9.99926171e-01],
       [6.89961417e-01, 1.09487949e-02, 2.99089788e-01],
       [9.13915063e-01, 2.56159636e-02, 6.04689737e-02],
       [9.97607700e-06, 9.84529047e-01, 1.54609766e-02],
       [1.24226334e-02, 9.82392858e-01, 5.18450821e-03],
       [6.16672824e-06, 1.28825548e-06, 9.99992545e-01],
       [2.14608768e-01, 3.85901847e-06, 7.85387373e-01]])
array([0, 2, 2, 2, 0, 0, 1, 1, 2, 2])
In [36]:
9.08637445e-01+ 8.79690768e-03+ 8.25656473e-02
Out[36]:
0.99999999998
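The same check for all rows at once: every row of predict_proba should sum to 1 up to floating-point error.

import numpy as np

print(np.allclose(model.predict_proba(X).sum(axis=1), 1.0))   # expected: True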
In [89]:
model.decision_function(X)[:10]
Out[89]:
array([[  3.849533  ,  -4.64908729,  -2.32623558],
       [-13.87382384, -13.42382511,   9.72216291],
       [-28.44359127,  -8.86509091,  11.26204184],
       [-20.50822   ,  -9.51374362,   9.1200293 ],
       [  0.27512045,  -4.69936507,  -1.11805343],
       [  2.77143898,  -3.60848176,  -2.7120079 ],
       [-11.50643948,   4.99918745,  -4.14484329],
       [ -4.39417824,   3.30673565,  -5.27519282],
       [-11.99642906, -13.56231341,   9.20475404],
       [ -0.99125071, -12.23302721,   4.65037743]])
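With three classes, the prediction is simply the class whose decision_function score (one per class, one-vs-rest) is largest. A small check, assuming model is the three-class LogisticRegression fitted above:

import numpy as np

scores = model.decision_function(X)                # shape (n_samples, 3)
pred = model.classes_[np.argmax(scores, axis=1)]
print(np.array_equal(pred, model.predict(X)))      # expected: True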

Iris data - restricted to two features

In [90]:
from sklearn.datasets import load_iris

iris = load_iris()

col1 = 0
col2 = 1

X = iris.data[:, [col1,col2]]
y = iris.target

X.shape, y.shape
Out[90]:
((150, 2), (150,))
In [92]:
plt.scatter(X[:,0], X[:,1], c=y, s=60)
plt.colorbar()
Out[92]:
<matplotlib.colorbar.Colorbar at 0x1ceeaa07208>
In [93]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X, y)
score = model.score(X, y)
score
Out[93]:
0.7666666666666667
In [94]:
display(model.coef_, model.intercept_)
array([[-2.49579289,  4.01011301],
       [ 0.49709451, -1.63380222],
       [ 1.15921404, -1.77736568]])
array([ 0.81713932,  1.22543562, -2.22516119])
In [95]:
import mglearn

plt.figure(figsize=[10,8])
mglearn.plots.plot_2d_classification(model, X, cm='Reds', alpha=0.3)
mglearn.discrete_scatter(X[:,0], X[:,1], y)
Out[95]:
[<matplotlib.lines.Line2D at 0x1ceed21e470>,
 <matplotlib.lines.Line2D at 0x1ceed21e588>,
 <matplotlib.lines.Line2D at 0x1ceed21ea90>]
In [96]:
plt.figure(figsize=[10,8])
mglearn.plots.plot_2d_classification(model, X, cm='Reds', alpha=0.3)
mglearn.discrete_scatter(X[:,0], X[:,1], y)

rng = np.array([X[:,0].min(), X[:,0].max()])
for i in range(3):
    plt.plot(rng, -(model.coef_[i,0]*rng + model.intercept_[i])/model.coef_[i,1], ':', lw=4)

Iris data - using all features

In [43]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

X_train.shape, X_test.shape
Out[43]:
((112, 4), (38, 4))
In [44]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

score_train = model.score(X_train, y_train)
score_test = model.score(X_test, y_test)

print(score_train, score_test)
0.9553571428571429 0.9473684210526315
In [45]:
display(model.coef_, model.intercept_)
array([[ 0.41420916,  1.31408619, -2.13728907, -0.95475576],
       [ 0.28368395, -1.42662002,  0.69332266, -1.51121812],
       [-1.54075724, -1.27706019,  2.13037186,  2.46068165]])
array([ 0.26110458,  1.12373588, -1.23692182])

Changing the C option and checking for overfitting

In [46]:
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

s1 = []
s2 = []

for c in Cs:
    model = LogisticRegression(C=c)
    model.fit(X_train, y_train)

    score_train = model.score(X_train, y_train)
    score_test = model.score(X_test, y_test)

    s1.append(score_train)
    s2.append(score_test)
    
plt.plot(s1,'bo:')
plt.plot(s2,'rs-')
plt.legend(['train','test'])
plt.xticks(range(len(Cs)),Cs)
plt.ylim(0,1)
plt.xlabel('C')
plt.ylabel('score')
Out[46]:
Text(0,0.5,'score')
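A single train/test split can be noisy, so a more careful way to pick C is cross-validation. A hedged sketch over the same grid (cross_val_score with 5 folds is a standard scikit-learn tool, not part of the original notebook):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
cv_means = [cross_val_score(LogisticRegression(C=c), X_train, y_train, cv=5).mean()
            for c in Cs]
print(Cs[int(np.argmax(cv_means))], max(cv_means))   # best C and its mean CV score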

Applying the sigmoid function

In [109]:
plt.figure(figsize=[12,8])

for col in range(4):
    plt.subplot(2,2,col+1)
    plt.scatter(iris.data[:,col], iris.target + np.random.normal(0,0.03,size=len(y)), c=iris.target, s=30, alpha=0.3)
    plt.yticks([0,1,2], ['Setosa', 'Versicolor', 'Virginica'], rotation=90)
    plt.title(iris.feature_names[col], fontsize=15)
In [111]:
X = iris.data[:,[2]]
y = iris.target.copy(); y[y==2]=1

plt.scatter(X[:,0], y, c=y, s=30)
plt.colorbar()
Out[111]:
<matplotlib.colorbar.Colorbar at 0x1ceeb8ffa58>
In [112]:
def sigmoid(t):
    return 1/(1+np.exp(-t))
In [122]:
rng = np.arange(-5,5,0.1)
plt.plot(rng, sigmoid(rng))

plt.hlines([0,0.5,1],-5,5,linestyles='dotted')
plt.vlines([0],0,1,linestyles='dotted')
plt.title('Sigmoid')
Out[122]:
Text(0.5,1,'Sigmoid')
In [137]:
plt.scatter(X[:,0], y, c=y, s=30)
plt.colorbar()

rng = np.arange(1,7,0.1)
plt.plot(rng, sigmoid(2*(rng-2.5)), 'r--')
Out[137]:
[<matplotlib.lines.Line2D at 0x1ceed7b7518>]
In [126]:
model = LogisticRegression()
model.fit(X, y)

display(model.score(X, y), model.coef_, model.intercept_)
1.0
array([[1.72960591]])
array([-4.28674842])
In [136]:
plt.scatter(X[:,0], y, c=y, s=30)
plt.colorbar()

rng = np.arange(1,7,0.1)
plt.plot(rng, sigmoid(model.coef_[0,0]*rng+model.intercept_[0]), 'r--')
plt.vlines([-model.intercept_[0]/model.coef_[0,0]],0,1,linestyles='dotted')
plt.text(3, 0.5, 'boundary = %.3f' % (-model.intercept_[0]/model.coef_[0,0]))
Out[136]:
Text(3,0.5,'boundary = 2.478')
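The boundary printed above follows directly from the fitted coefficients: the sigmoid crosses 0.5 exactly where the linear score is zero.

$$ w x + b = 0 \;\Rightarrow\; x = -\frac{b}{w} = \frac{4.28674842}{1.72960591} \approx 2.478 $$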

Related techniques

  • Formula $$ sigmoid(t) = \frac{1}{1 + e^{-t}} $$
    $$ t = w \cdot x + b $$
    $$ sigmoid(t) = \frac{1}{1 + e^{-(w \cdot x + b)}} $$
    $$ t = w_0 \cdot x_0 + w_1 \cdot x_1 + ... + b $$
  • Softmax function
  • Cross entropy

Softmax: given scores $w_1, w_2, w_3$, each is mapped to $R_1 = \frac{e^{w_1}}{e^{w_1}+e^{w_2}+e^{w_3}}$ (and likewise for $R_2$, $R_3$), so that $R_1 + R_2 + R_3 = 1$: the scores are normalized to sum to 1.
Cross entropy: the cost-function concept used in logistic regression (it pairs well with logarithms).
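A minimal numpy sketch of both ideas (the function names are just for illustration): softmax turns raw scores into probabilities that sum to 1, and cross entropy is the loss that pairs with those probabilities.

import numpy as np

def softmax(w):
    e = np.exp(w - w.max())        # subtract the max for numerical stability
    return e / e.sum()

def cross_entropy(p_true, p_pred):
    return -np.sum(p_true * np.log(p_pred))    # smaller is better

r = softmax(np.array([1.0, 2.0, 3.0]))
print(r, r.sum())                               # three probabilities summing to 1
print(cross_entropy(np.array([0, 0, 1]), r))    # loss when the true class is the third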

