Loading data from file
Let's solve this using NumPy.
import numpy as np
from PIL import Image
Image.open('data-01-test-score.png')  # preview image of the data file (rendered in the notebook)
xy = np.loadtxt('data-01-test-score.csv', delimiter=',', dtype=np.float32)  # works only if every column has the same dtype
x_data = xy[:, 0:-1]
y_data = xy[:, [-1]]
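Note the difference between xy[:, -1] and xy[:, [-1]]: indexing with a plain -1 drops a dimension, while indexing with the list [-1] keeps the result 2-D, which is what the shape=[None, 1] placeholder below expects. A minimal sketch (the two rows are illustrative):
import numpy as np
xy = np.array([[73., 80., 75., 152.],
               [93., 88., 93., 185.]], dtype=np.float32)
print(xy[:, -1].shape)    # (2,)   -- 1-D, dimension dropped
print(xy[:, [-1]].shape)  # (2, 1) -- 2-D column vector, matches shape=[None, 1]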
##### Slicing #####
nums = list(range(5))  # range is a built-in that yields integers; list() materializes it
print(nums)            # Prints "[0, 1, 2, 3, 4]"
print(nums[2:4])       # Get a slice from index 2 to 4 (exclusive); prints "[2, 3]"
print(nums[2:])        # Get a slice from index 2 to the end; prints "[2, 3, 4]"
print(nums[:2])        # Get a slice from the start to index 2 (exclusive); prints "[0, 1]"
print(nums[:])         # Get a slice of the whole list; prints "[0, 1, 2, 3, 4]"
print(nums[:-1])       # Slice indices can be negative; prints "[0, 1, 2, 3]"
nums[2:4] = [8, 9]     # Assign a new sublist to a slice
print(nums)            # Prints "[0, 1, 8, 9, 4]"
Indexing, Slicing, Iterating
- Arrays can be indexed, sliced, and iterated over much like lists and other sequence types in Python
- As with Python lists, slicing in NumPy is done with the colon (:) syntax
- A colon (:) can be replaced with an ellipsis (...)
b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# array([[1, 2, 3, 4],
# [5, 6, 7, 8],
# [9, 10, 11, 12]])
b[:, 1]
# array([2, 6, 10])
b[-1]
# array([9, 10, 11, 12])
b[-1, :]
# array([9, 10, 11, 12])
b[-1, ...]
# array([9, 10, 11, 12])
b[0:2, :]
# array([[1, 2, 3, 4],
# [5, 6, 7, 8]])
# Make sure the shape and data are OK
print(x_data.shape, x_data, len(x_data))
print(y_data.shape, y_data)
import tensorflow as tf
# placeholders for tensors that will always be fed.
X = tf.placeholder(tf.float32, shape=[None, 3])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([3, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')
# Hypothesis
hypothesis = tf.matmul(X, W) + b
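# Shape check: X is [m, 3] and W is [3, 1], so tf.matmul(X, W) is [m, 1]; b ([1]) broadcasts over the rows.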
# Simplified cost/loss function
cost = tf.reduce_mean(tf.square(hypothesis - Y))
# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-5)
train = optimizer.minimize(cost)
# Launch the graph in a session
sess = tf.Session()
# Initializes global variables in the graph.
sess.run(tf.global_variables_initializer())
# Set up feed_dict variables inside the loop.
for step in range(2001):
    cost_val, hy_val, _ = sess.run(
        [cost, hypothesis, train],
        feed_dict={X: x_data, Y: y_data})
    if step % 10 == 0:
        print(step, "Cost: ", cost_val,
              "\nPrediction:\n", hy_val)
# Ask my score
print("Your score will be ", sess.run(hypothesis,
feed_dict={X: [[100, 70, 101]]}))
print("Other scores will be", sess.run(hypothesis,
feed_dict={X: [[60, 70, 110], [90, 100, 80]]}))
# Only part of the output is shown
0 Cost: 2838.0637
Prediction:
[[198.27751]
[245.33527]
[238.06161]
[260.92917]
[187.29489]
[141.09808]]
10 Cost: 9.312655
Prediction:
[[148.70442 ]
[185.71544 ]
[179.3366 ]
[196.97311 ]
[141.81572 ]
[107.563705]]
20 Cost: 9.143283
Prediction:
[[148.38025 ]
[185.30626 ]
[178.94374 ]
[196.54228 ]
[141.50127 ]
[107.324936]]
30 Cost: 9.103845
Prediction:
[[148.38957 ]
[185.29811 ]
[178.94594 ]
[196.54164 ]
[141.49272 ]
[107.311676]]
40 Cost: 9.064709
Prediction:
[[148.40112]
[185.29271]
[178.95079]
[196.54393]
[141.48627]
[107.3 ]]
50 Cost: 9.025783
Prediction:
[[148.41266]
[185.28734]
[178.95567]
[196.54623]
[141.47986]
[107.28837]]
..........................................
1960 Cost: 4.66599
Prediction:
[[150.04318 ]
[184.53668 ]
[179.65231 ]
[196.83426 ]
[140.62003 ]
[105.607704]]
1970 Cost: 4.6537633
Prediction:
[[150.0493 ]
[184.53392 ]
[179.65495 ]
[196.83513 ]
[140.61705 ]
[105.601204]]
1980 Cost: 4.6415877
Prediction:
[[150.0554 ]
[184.53116]
[179.65762]
[196.836 ]
[140.6141 ]
[105.59472]]
1990 Cost: 4.629454
Prediction:
[[150.06151]
[184.52844]
[179.66028]
[196.83688]
[140.61119]
[105.58825]]
2000 Cost: 4.6174397
Prediction:
[[150.06758]
[184.52571]
[179.66292]
[196.83775]
[140.60826]
[105.58182]]
Your score will be [[191.97649]]
Other scores will be [[174.5262 ]
[172.73979]]
Image.open('Queue.png')  # diagram of TensorFlow's queue runner pipeline (rendered in the notebook)
When a file is too large to load into memory all at once, NumPy will report that it has run out of memory. For this situation TensorFlow provides a system called Queue Runners. When reading from multiple files, the filenames are stacked in a queue, a reader is connected to it, the records are read and decoded, and the decoded examples are stacked in another queue. Training then reads one batch at a time from that queue, pulling out a little at a time, which makes this very useful for memory management.
Step 1
filename_queue = tf.train.string_input_producer(
    ['data-01-test-score.csv', 'data-02-test-score.csv', ...],
    shuffle=False, name='filename_queue')
Build the queue from the list of files we have.
Step 2
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
Define the reader that will read from the files. Besides text lines, there are also cases where binary and other formats are read.
The reader reads each record split into a key and a value (this is the usual way a text file is read).
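For a concrete picture, the pair might look like this (a sketch: the key format is filename:line-number, and the sample row assumes the file begins with 73,80,75,152; the queue runners must already be started, as in the full example below):
print(sess.run(key))    # b'data-01-test-score.csv:1' -- filename:line number
print(sess.run(value))  # b'73,80,75,152'             -- the raw CSV line, still a string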
Step 3
record_defaults = [[0.],[0.],[0.],[0.]]
xy = tf.decode_csv(value, record_defaults=record_defaults)
decode_csv determines how the value that was read will be parsed.
The type of each field can be specified as it is read, using record_defaults.
tf.train.batch
# collect batches of csv rows
train_x_batch, train_y_batch = \
    tf.train.batch([xy[0:-1], xy[-1:]], batch_size=10)
# tf.train.batch acts like a pump, drawing data out of the queue a little at a time.
# xy[0:-1] is delivered as train_x_batch and xy[-1:] as train_y_batch, 10 rows at once.
sess = tf.Session()
...
# Start populating the filename queue.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for step in range(2001):
    x_batch, y_batch = sess.run([train_x_batch, train_y_batch])
    ...
coord.request_stop()
coord.join(threads)
import tensorflow as tf
filename_queue = tf.train.string_input_producer(
['data-01-test-score.csv'], shuffle=False, name='filename_queue')
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
# Default values, in case of empty columns. Also specifies the type of the decoded result.
record_defaults = [[0.],[0.],[0.],[0.]]
xy = tf.decode_csv(value, record_defaults=record_defaults)
# collect batches of csv in
train_x_batch, train_y_batch = \
tf.train.batch([xy[0:-1], xy[-1:]], batch_size=10)
# placeholders for tensors that will always be fed.
X = tf.placeholder(tf.float32, shape=[None, 3])
Y = tf.placeholder(tf.float32, shape=[None, 1])
W = tf.Variable(tf.random_normal([3, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')
# Hypothesis
hypothesis = tf.matmul(X, W) + b
# Simplified cost/loss function
cost = tf.reduce_mean(tf.square(hypothesis - Y))
# Minimize
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-5)
train = optimizer.minimize(cost)
# Launch the graph in a session.
sess = tf.Session()
# Initializes global variables in the graph.
sess.run(tf.global_variables_initializer())
# start populating the filename queue.
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
for step in range(2001):
    x_batch, y_batch = sess.run([train_x_batch, train_y_batch])
    cost_val, hy_val, _ = sess.run(
        [cost, hypothesis, train],
        feed_dict={X: x_batch, Y: y_batch})
    if step % 10 == 0:
        print(step, "Cost: ", cost_val,
              "\nPrediction:\n", hy_val)
coord.request_stop()
coord.join(threads)
# Ask my score
print("Your score will be ",
sess.run(hypothesis, feed_dict={X: [[100, 70, 101]]}))
print("Other scores will be ",
sess.run(hypothesis, feed_dict={X: [[60, 70, 110], [90, 100, 80]]}))
# At first this raised an OutOfRangeError; removing the comment line from the CSV file fixed it.
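As an alternative to editing the file by hand, TextLineReader can skip lines at the top of each file; a sketch, assuming the comment sits on the first line:
reader = tf.TextLineReader(skip_header_lines=1)  # skip the first line of every file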
0 Cost: 42634.477
Prediction:
[[-36.364662]
[-38.877457]
[-41.10543 ]
[-42.42621 ]
[-30.120293]
[-17.512817]
[-36.364662]
[-38.877457]
[-41.10543 ]
[-42.42621 ]]
10 Cost: 13.605237
Prediction:
[[139.1753 ]
[107.281555]
[148.2732 ]
[183.07416 ]
[177.57092 ]
[195.71326 ]
[139.1753 ]
[107.281555]
[148.2732 ]
[183.07416 ]]
20 Cost: 11.773229
Prediction:
[[178.94379]
[197.2052 ]
[140.23022]
[108.05257]
[149.43839]
[184.45905]
[178.94379]
[197.2052 ]
[140.23022]
[108.05257]]
30 Cost: 7.232764
Prediction:
[[149.3255 ]
[184.30748]
[178.80287]
[197.04868]
[140.11324]
[107.95967]
[149.3255 ]
[184.30748]
[178.80287]
[197.04868]]
40 Cost: 12.135092
Prediction:
[[140.2409 ]
[108.04736]
[149.47902]
[184.47667]
[178.97774]
[197.23611]
[140.2409 ]
[108.04736]
[149.47902]
[184.47667]]
50 Cost: 11.688535
Prediction:
[[178.96616]
[197.2204 ]
[140.22394]
[108.02818]
[149.47536]
[184.4563 ]
[178.96616]
[197.2204 ]
[140.22394]
[108.02818]]
...
1960 Cost: 8.0693035
Prediction:
[[139.60762]
[106.5494 ]
[150.79176]
[183.83481]
[179.53098]
[197.34142]
[139.60762]
[106.5494 ]
[150.79176]
[183.83481]]
1970 Cost: 7.996835
Prediction:
[[179.57143]
[197.38324]
[139.6352 ]
[106.56528]
[150.82909]
[183.87128]
[179.57143]
[197.38324]
[139.6352 ]
[106.56528]]
1980 Cost: 4.633001
Prediction:
[[150.72296]
[183.73543]
[179.44203]
[197.2401 ]
[139.53136]
[106.4843 ]
[150.72296]
[183.73543]
[179.44203]
[197.2401 ]]
1990 Cost: 8.031915
Prediction:
[[139.60168]
[106.53191]
[150.80644]
[183.82791]
[179.53738]
[197.3418 ]
[139.60168]
[106.53191]
[150.80644]
[183.82791]]
2000 Cost: 7.961212
Prediction:
[[179.57838]
[197.3842 ]
[139.62973]
[106.54816]
[150.8442 ]
[183.86499]
[179.57838]
[197.3842 ]
[139.62973]
[106.54816]]
Your score will be [[190.59384]]
Other scores will be [[188.88681]
[169.67206]]
shuffle_batch
# min_after_dequeue defines how big a buffer we will randomly sample
# from -- bigger means better shuffling but slower start up and more
# memory used.
# capacity must be larger than min_after_dequeue and the amount larger
# determines the maximum we will prefetch. Recommendation:
# min_after_dequeue + (num_threads + a small safety margin) * batch_size
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
example_batch, label_batch = tf.train.shuffle_batch(
[example, label], batch_size=batch_size, capacity=capacity,
min_after_dequeue=min_after_dequeue)
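To wire this into the CSV pipeline above, the tf.train.batch call could be swapped for tf.train.shuffle_batch like this (a sketch; batch_size=10 and the buffer sizes are illustrative choices, not values from the original code):
batch_size = 10
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * batch_size
train_x_batch, train_y_batch = tf.train.shuffle_batch(
    [xy[0:-1], xy[-1:]], batch_size=batch_size,
    capacity=capacity, min_after_dequeue=min_after_dequeue)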
A variety of example code can be found at https://www.tensorflow.org/guide/datasets
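That guide covers the tf.data API, which replaces the queue runner machinery above in later TensorFlow versions. A minimal sketch of the same CSV pipeline, assuming TensorFlow 1.4 or later where tf.data is available:
def parse(line):
    # decode_csv returns one scalar tensor per CSV column
    fields = tf.decode_csv(line, record_defaults=[[0.], [0.], [0.], [0.]])
    return tf.stack(fields[0:-1]), tf.stack(fields[-1:])

dataset = tf.data.TextLineDataset('data-01-test-score.csv').map(parse).batch(10)
iterator = dataset.make_one_shot_iterator()
x_batch, y_batch = iterator.get_next()  # shapes [batch, 3] and [batch, 1]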