티스토리 뷰

2019.02.18

구간나누기

  • np.arange() # a부터 b까지 c단위로 나눈다.
  • np.linspace() # 시작점과 끝점 사이를 지정한 개수의 점으로 균등하게 나눈다. ex) 히스토그램의 구간 경계
  • np.bincount() # 음이 아닌 정수 값별로 등장 횟수를 셈. 값을 구간 번호(정수)로 바꿔서 넣으면 구간별 개수도 셀 수 있다. ex) 140~150은 1명 150~160은 3명 160~170은 5명 170~180은 7명 180~190은 4명
  • np.digitize() # 구간을 나눠서 mapping 시킴 ex) 143, 142는 0, 152, 155, 156은 1, ....
  • np.histogram() => plt.hist()
In [12]:
import numpy as np
import matplotlib.pyplot as plt
In [15]:
np.arange(-1,1,0.1) # 끝값 1은 포함되지 않는다. 1 대신 1과 1.1 사이의 어떤 값을 집어넣으면 1도 포함시킬 수 있다. (20개에서 21개가 된다.)
Out[15]:
array([-1.00000000e+00, -9.00000000e-01, -8.00000000e-01, -7.00000000e-01,
       -6.00000000e-01, -5.00000000e-01, -4.00000000e-01, -3.00000000e-01,
       -2.00000000e-01, -1.00000000e-01, -2.22044605e-16,  1.00000000e-01,
        2.00000000e-01,  3.00000000e-01,  4.00000000e-01,  5.00000000e-01,
        6.00000000e-01,  7.00000000e-01,  8.00000000e-01,  9.00000000e-01])
In [17]:
np.linspace(-1,1,21) # arange와는 다르게 개수를 준다. (끝점이 포함되므로 1도 들어감)
Out[17]:
array([-1. , -0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1,  0. ,
        0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])
In [20]:
x = np.linspace(-1,1,101)
y = x**3

plt.plot(x,y)
plt.xlabel('x')
plt.ylabel('y')
Out[20]:
Text(0,0.5,'y')

bincount()

  • 정수가 나타난 횟수를 센다
In [22]:
a = np.random.randint(10,size=100)
a
Out[22]:
array([6, 2, 4, 4, 7, 2, 4, 6, 7, 8, 7, 7, 9, 6, 0, 2, 8, 4, 6, 0, 8, 3,
       1, 6, 7, 2, 7, 9, 9, 4, 7, 4, 4, 3, 6, 3, 0, 0, 4, 3, 7, 8, 9, 3,
       0, 0, 8, 7, 0, 9, 5, 5, 1, 4, 8, 4, 1, 0, 2, 5, 4, 0, 6, 6, 4, 9,
       9, 9, 3, 5, 8, 0, 5, 3, 5, 2, 2, 4, 2, 7, 2, 9, 5, 3, 7, 6, 4, 7,
       6, 3, 0, 8, 1, 4, 0, 8, 9, 0, 1, 7])
In [28]:
np.bincount(a)      # 0 13개, 1 5개, 2 9개, 3 9개, 4 15개, ..... 
Out[28]:
array([13,  5,  9,  9, 15,  7, 10, 13,  9, 10], dtype=int64)
In [29]:
plt.plot(np.bincount(a))
#plt.ylim(0,16)  # y의 범위를 0부터 16으로 한다.
Out[29]:
[<matplotlib.lines.Line2D at 0x74342d4ac8>]
In [113]:
a = np.random.randint(5,10,size=100)
a
Out[113]:
array([9, 6, 8, 5, 7, 8, 5, 8, 6, 8, 8, 9, 5, 7, 7, 7, 6, 8, 8, 7, 5, 5,
       7, 8, 8, 5, 9, 7, 8, 5, 9, 6, 9, 6, 7, 6, 9, 6, 5, 5, 8, 7, 9, 6,
       7, 8, 5, 7, 6, 9, 7, 7, 9, 7, 7, 6, 8, 7, 6, 8, 7, 8, 7, 5, 5, 5,
       5, 6, 5, 7, 5, 8, 7, 7, 7, 8, 9, 8, 6, 7, 5, 6, 9, 7, 6, 5, 9, 8,
       5, 7, 6, 9, 7, 5, 9, 7, 7, 5, 7, 7])
In [27]:
np.bincount([7,5,5,6,7])  # 5 2개, 6 1개, 7 2개, 항상 0부터 시작해서 최댓값까지 각 값의 개수를 출력한다.
Out[27]:
array([0, 0, 0, 0, 0, 2, 1, 2], dtype=int64)

histogram()

  • 설정한 구간별로 그 구간에 속하는 항목의 개수를 센다.

help(np.histogram)

histogram(a, bins=10, range=None, normed=False, weights=None, density=None)

Compute the histogram of a set of data.

Parameters
----------
a : array_like
    Input data. The histogram is computed over the flattened array.
bins : int or sequence of scalars or str, optional
    If `bins` is an int, it defines the number of equal-width
    bins in the given range (10, by default). If `bins` is a
    sequence, it defines the bin edges, including the rightmost
    edge, allowing for non-uniform bin widths.
In [34]:
a = np.random.rand(100)  # 0과 1 사이의 실수
a
Out[34]:
array([0.58351652, 0.26852529, 0.1546097 , 0.57100784, 0.76492878,
       0.67196103, 0.93500219, 0.97272469, 0.56549907, 0.84773544,
       0.25896203, 0.56070327, 0.772981  , 0.2355588 , 0.80398823,
       0.84951858, 0.66164223, 0.22187544, 0.32135696, 0.97469582,
       0.88887494, 0.09408865, 0.76412771, 0.6034494 , 0.47768411,
       0.49454183, 0.18538133, 0.61491967, 0.58838206, 0.33220437,
       0.15008645, 0.92957487, 0.45807553, 0.12792326, 0.29480085,
       0.89107701, 0.84806509, 0.16537209, 0.14680848, 0.82544415,
       0.98374588, 0.41990583, 0.70541355, 0.58168093, 0.85064365,
       0.70455574, 0.41239867, 0.25408897, 0.45401597, 0.6035087 ,
       0.55282197, 0.68832801, 0.95752577, 0.18447558, 0.53299123,
       0.95095092, 0.17615794, 0.99522962, 0.49123557, 0.0150139 ,
       0.88940165, 0.50621334, 0.537828  , 0.1119445 , 0.69261009,
       0.54739696, 0.73451652, 0.80526732, 0.93112206, 0.77828737,
       0.28017595, 0.17198014, 0.32803294, 0.59985658, 0.70632095,
       0.02027084, 0.7220679 , 0.56495156, 0.63629864, 0.14305452,
       0.51856299, 0.34137121, 0.15955661, 0.49352367, 0.34254149,
       0.83800904, 0.93326777, 0.57832154, 0.35626851, 0.03114831,
       0.38006955, 0.79657931, 0.38770488, 0.88126649, 0.95314178,
       0.58935647, 0.57294819, 0.49794401, 0.41308279, 0.69441778])
In [35]:
a.min(), a.max()
Out[35]:
(0.015013898184105035, 0.9952296208937229)
In [36]:
plt.hist(a)  # bins를 지정하지 않으면 기본값인 10개 구간으로 나눈다.
Out[36]:
(array([ 5., 11.,  7.,  8., 10., 17.,  9., 10., 12., 11.]),
 array([0.0150139 , 0.11303547, 0.21105704, 0.30907861, 0.40710019,
        0.50512176, 0.60314333, 0.7011649 , 0.79918648, 0.89720805,
        0.99522962]),
 <a list of 10 Patch objects>)
In [37]:
np.histogram(a)
Out[37]:
(array([ 5, 11,  7,  8, 10, 17,  9, 10, 12, 11], dtype=int64),
 array([0.0150139 , 0.11303547, 0.21105704, 0.30907861, 0.40710019,
        0.50512176, 0.60314333, 0.7011649 , 0.79918648, 0.89720805,
        0.99522962]))
In [45]:
n, b = np.histogram(a, bins=np.arange(0,1,0.1))  # 구간을 9개로 나눴다 (경계가 0~0.9까지라 0.9보다 큰 값 11개는 집계에서 빠진다. 1.0까지 포함하려면 np.linspace(0,1,11)을 쓴다.)
n, b
Out[45]:
(array([ 4, 12,  7,  8, 10, 17,  9, 10, 12], dtype=int64),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]))
In [46]:
plt.hist(a, bins=b)
Out[46]:
(array([ 4., 12.,  7.,  8., 10., 17.,  9., 10., 12.]),
 array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
 <a list of 9 Patch objects>)

digitize()

  • 각 값이 설정한 구간 중 몇 번째 구간에 속하는지(구간 인덱스)를 반환한다.

help(np.digitize)

digitize(x, bins, right=False)

Return the indices of the bins to which each value in input array belongs.

Each index ``i`` returned is such that ``bins[i-1] <= x < bins[i]`` if
`bins` is monotonically increasing, or ``bins[i-1] > x >= bins[i]`` if
`bins` is monotonically decreasing. If values in `x` are beyond the
bounds of `bins`, 0 or ``len(bins)`` is returned as appropriate. If right
is True, then the right bin is closed so that the index ``i`` is such
that ``bins[i-1] < x <= bins[i]`` or ``bins[i-1] >= x > bins[i]`` if `bins`
is monotonically increasing or decreasing, respectively.

Parameters
----------
x : array_like
    Input array to be binned. Prior to NumPy 1.10.0, this array had to
    be 1-dimensional, but can now have any shape.
bins : array_like
    Array of bins. It has to be 1-dimensional and monotonic.
right : bool, optional
    Indicating whether the intervals include the right or the left bin
    edge. Default behavior is (right==False) indicating that the interval
    does not include the right edge. The left bin end is open in this
    case, i.e., bins[i-1] <= x < bins[i] is the default behavior for
    monotonically increasing bins.
In [53]:
scores=[93, 78, 77, 89, 100, 67, 51, 82, 99, 71] # 93은 4구간, 78은 2구간, 77은 2구간, ... (60 미만의 구간, 90 이상의 구간이 더 있다.)

d = np.digitize(scores, bins= [60,70,80,90])
d
Out[53]:
array([4, 2, 2, 3, 4, 1, 0, 3, 4, 2], dtype=int64)
In [54]:
grade = ['가', '양', '미', '우', '수']
[grade[i] for i in d]
Out[54]:
['수', '미', '미', '우', '수', '양', '가', '우', '수', '미']

연습문제

iris 데이터에서 첫번째 칼럼의 값을 구간값으로 바꾸시오. (어떻게 구간을 나누는게 좋을지 고민해보자.)

In [55]:
# Load iris.csv into a NumPy array.
# The first line is treated as a header; every following row is parsed as
# four numeric fields followed by a species name.
labels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

data = []
with open('iris.csv') as f:  # context manager closes the file even if parsing fails
    line = f.readline()
    features = line.strip().split(',')[:4]  # names of the four measurement columns

    for line in f:
        row = line.strip().split(',')
        if len(row) < 5:
            # Skip blank or truncated lines (e.g. a trailing newline at EOF),
            # which would otherwise fail in float('') below.
            continue

        row[:4] = [float(v) for v in row[:4]]
        # Encode the species name as its position in `labels`;
        # an unknown name raises ValueError.
        row[4] = labels.index(row[4])

        data.append(row)

iris = np.array(data)
In [111]:
list(iris[:,0])

r = np.digitize(list(iris[:,0]), bins = [4,5,6,7])
r
Out[111]:
array([2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2,
       2, 1, 2, 1, 2, 2, 4, 3, 3, 2, 3, 2, 3, 1, 3, 2, 2, 2, 3, 3, 2, 3,
       2, 2, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 3, 2, 3, 3, 3,
       2, 2, 2, 3, 2, 2, 2, 2, 2, 3, 2, 2, 3, 2, 4, 3, 3, 4, 1, 4, 3, 4,
       3, 3, 3, 2, 2, 3, 3, 4, 4, 3, 3, 2, 4, 3, 3, 4, 3, 3, 3, 4, 4, 4,
       3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 2], dtype=int64)
In [61]:
len(d)
Out[61]:
150
In [71]:
c=np.sort(list(iris[:,0]))
In [73]:
c[49], c[99]
Out[73]:
(5.4, 6.3)
In [105]:
list(iris[:,0])

d = np.digitize(list(iris[:,0]), bins = [5.4, 6.3])
d
Out[105]:
array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1], dtype=int64)
In [106]:
sum(d==1)
Out[106]:
53
In [107]:
sum(d==2)
Out[107]:
51
In [108]:
sum(d==0)
Out[108]:
46
In [109]:
plt.plot(list(iris[:,0]))
Out[109]:
[<matplotlib.lines.Line2D at 0x7434626940>]
In [110]:
plt.plot(d)
Out[110]:
[<matplotlib.lines.Line2D at 0x74346c0208>]
In [112]:
plt.plot(r)
Out[112]:
[<matplotlib.lines.Line2D at 0x743470cda0>]
  • 풀이
In [115]:
sepal_length = iris[:,0]
sepal_length
Out[115]:
array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])
In [116]:
plt.plot(sepal_length)
plt.grid()
In [117]:
plt.hist(sepal_length, bins=100)
pass
In [118]:
t1 = sepal_length[:50]
t2 = sepal_length[50:100]
t3 = sepal_length[100:]

t1.max(), t1.min(), t1.mean()
Out[118]:
(5.8, 4.3, 5.006)
In [119]:
t2.max(), t2.min(), t2.mean()
Out[119]:
(7.0, 4.9, 5.936)
In [120]:
t3.max(), t3.min(), t3.mean()
Out[120]:
(7.9, 4.9, 6.587999999999998)
In [121]:
b = [t1.mean(), (t2.mean()+t1.mean())/2, t2.mean(), (t3.mean()+t2.mean())/2, t3.mean()]
In [122]:
s2 = np.digitize(sepal_length, bins=b)
s2
Out[122]:
array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 2, 1, 1, 2, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 5, 4, 5, 2, 4, 2, 4, 0, 5, 1, 0, 2, 3, 3, 2, 5,
       2, 2, 3, 2, 2, 3, 4, 3, 4, 5, 5, 5, 3, 2, 2, 2, 2, 3, 1, 3, 5, 4,
       2, 2, 2, 3, 2, 0, 2, 2, 2, 3, 1, 2, 4, 2, 5, 4, 4, 5, 0, 5, 5, 5,
       4, 4, 5, 2, 2, 4, 4, 5, 5, 3, 5, 2, 5, 4, 5, 5, 3, 3, 4, 5, 5, 5,
       4, 4, 3, 5, 4, 4, 3, 5, 5, 5, 2, 5, 5, 5, 4, 4, 3, 2], dtype=int64)
In [123]:
plt.plot(s2, 'ro')
Out[123]:
[<matplotlib.lines.Line2D at 0x74358e3080>]


'beginner > 파이썬 기초' 카테고리의 다른 글

Matplotlib  (0) 2019.02.19
NumPy_기타  (0) 2019.02.19
NumPy_정렬  (0) 2019.02.15
NumPy_파일입출력  (0) 2019.02.14
NumPy_함수  (0) 2019.02.13
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2024/05   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
글 보관함