# How to Use Datasets Efficiently in TensorFlow

Published 2018-03-13 15:59:52

Using a Dataset comes down to three steps (a minimal end-to-end sketch follows this list):

• Load the data: create a Dataset instance from your data.

• Create an iterator: build an iterator from the created dataset to step through it.

• Consume the data: use the iterator to fetch the dataset elements that will be fed to the model.
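To make the workflow concrete, here is a minimal sketch of all three steps together, assuming the TF 1.x graph-mode API used throughout this article:

```python
import numpy as np
import tensorflow as tf

# 1. load the data: build a Dataset from a numpy array
x = np.random.sample((100, 2))
dataset = tf.data.Dataset.from_tensor_slices(x)
# 2. create an iterator over the dataset
iter = dataset.make_one_shot_iterator()
# 3. consume the data: each sess.run(el) returns one element
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))
```

Each of these steps is covered in turn below.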

## Importing data

From numpy: this is the common case, feeding a numpy array into the dataset:

```python
# create a random vector of shape (100,2)
x = np.random.sample((100, 2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
```

You can also pass more than one numpy array; a classic example is when the data is split into features and labels:

```python
features, labels = (np.random.sample((100, 2)), np.random.sample((100, 1)))
dataset = tf.data.Dataset.from_tensor_slices((features, labels))
```

We can, of course, also initialise the dataset from a tensor:

```python
# using a tensor
dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))
```

Or from a placeholder, which is useful when we want to change the data inside the dataset dynamically (more on this below):

```python
# using a placeholder
x = tf.placeholder(tf.float32, shape=[None, 2])
dataset = tf.data.Dataset.from_tensor_slices(x)
```

We can also initialise a Dataset from a generator, which is useful when the elements have different sizes (e.g. sequences):

```python
# from a generator: elements may have different lengths
sequence = np.array([[1], [2, 3], [3, 4]])

def generator():
    for el in sequence:
        yield el

dataset = tf.data.Dataset.from_generator(generator,
                                         output_types=tf.float32,
                                         output_shapes=tf.TensorShape([None]))
```
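The post stops at the dataset definition here; as a hedged usage sketch, consuming this dataset works the same way as the other cases, and each returned element may have a different length:

```python
# iterate over the generator-backed dataset
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))  # [1.]
    print(sess.run(el))  # [2. 3.]
```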

## One-shot iterator

This is the simplest iterator: it iterates through the dataset only once, and you cannot feed it any new values at run time.

```python
x = np.random.sample((100, 2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
# create the iterator
iter = dataset.make_one_shot_iterator()
```

Then call `get_next()` to obtain the tensor that will contain the next element:

```python
...
# create the iterator
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
```

Finally, run `el` inside a session to fetch its value:

```python
with tf.Session() as sess:
    print(sess.run(el))  # output: [ 0.42116176  0.40666069]
```

## Initializable iterator

If we want to change the data inside the dataset at run time, we can use a placeholder together with an initializable iterator, and feed the actual data when the iterator is initialised:

```python
# using a placeholder
x = tf.placeholder(tf.float32, shape=[None, 2])
dataset = tf.data.Dataset.from_tensor_slices(x)
data = np.random.sample((100, 2))
iter = dataset.make_initializable_iterator()  # create the iterator
el = iter.get_next()
with tf.Session() as sess:
    # feed the placeholder with data
    sess.run(iter.initializer, feed_dict={x: data})
    print(sess.run(el))  # output [ 0.52374458  0.71968478]
```

This is very useful when we have a training set and a test set:

```python
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.array([[1, 2]]), np.array([[0]]))
```

```python
# initializable iterator to switch between datasets
EPOCHS = 10
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
dataset = tf.data.Dataset.from_tensor_slices((x, y))
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.array([[1, 2]]), np.array([[0]]))
iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()
with tf.Session() as sess:
    # initialise the iterator with the train data
    sess.run(iter.initializer, feed_dict={x: train_data[0], y: train_data[1]})
    for _ in range(EPOCHS):
        sess.run([features, labels])
    # switch to the test data
    sess.run(iter.initializer, feed_dict={x: test_data[0], y: test_data[1]})
    print(sess.run([features, labels]))
```

## Reinitializable iterator

The idea is similar to before, but instead of feeding new data into the same dataset, we switch between Dataset objects themselves. As before, we want one dataset for training and one for testing:

```python
# making fake data using numpy
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((10, 2)), np.random.sample((10, 1)))
```

```python
# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
```

Then we create a generic iterator from the structure (types and shapes) of the datasets:

```python
# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)
```

and one initialisation operation per dataset:

```python
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
```

```python
features, labels = iter.get_next()
```

Putting it all together:

```python
# Reinitializable iterator to switch between Datasets
EPOCHS = 10
# making fake data using numpy
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((10, 2)), np.random.sample((10, 1)))
# create two datasets, one for training and one for test
train_dataset = tf.data.Dataset.from_tensor_slices(train_data)
test_dataset = tf.data.Dataset.from_tensor_slices(test_data)
# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types,
                                       train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
with tf.Session() as sess:
    sess.run(train_init_op)  # switch to the train dataset
    for _ in range(EPOCHS):
        sess.run([features, labels])
    sess.run(test_init_op)  # switch to the test dataset
    print(sess.run([features, labels]))
```

## Consuming data

To pass the data to the model, we simply use the tensors returned by `get_next()`; every call to `sess.run` yields the next element of the dataset:

```python
...
next_el = iter.get_next()
...
print(sess.run(next_el))  # will output the current element
```

```python
# using two numpy arrays
features, labels = (np.array([np.random.sample((100, 2))]),
                    np.array([np.random.sample((100, 1))]))
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(BATCH_SIZE)
```

```python
iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()
```

We can then build a model that consumes `x` and `y` directly, with no placeholders and no feed-dict:

```python
# make a simple model
net = tf.layers.dense(x, 8)  # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8)
prediction = tf.layers.dense(net, 1)
loss = tf.losses.mean_squared_error(prediction, y)  # pass the second value from iter.get_next() as label
```

The complete example (the training op is never defined in the snippet above; a plain `tf.train.AdamOptimizer` step is assumed here):

```python
# make a simple model
EPOCHS = 10
BATCH_SIZE = 16
# using two numpy arrays
features, labels = (np.array([np.random.sample((100, 2))]),
                    np.array([np.random.sample((100, 1))]))
dataset = tf.data.Dataset.from_tensor_slices((features, labels)).repeat().batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()
# make a simple model
net = tf.layers.dense(x, 8, activation=tf.tanh)  # pass the first value from iter.get_next() as input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, y)  # pass the second value from iter.get_next() as label
train_op = tf.train.AdamOptimizer().minimize(loss)  # assumed: an optimizer step, not given in the source
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(EPOCHS):
        _, loss_value = sess.run([train_op, loss])
        print("Iter: {}, Loss: {:.4f}".format(i, loss_value))
```

```
Iter: 0, Loss: 0.1328
Iter: 1, Loss: 0.1312
Iter: 2, Loss: 0.1296
Iter: 3, Loss: 0.1281
Iter: 4, Loss: 0.1267
Iter: 5, Loss: 0.1254
Iter: 6, Loss: 0.1242
Iter: 7, Loss: 0.1231
Iter: 8, Loss: 0.1220
Iter: 9, Loss: 0.1210
```

## Batch

`batch(BATCH_SIZE)` automatically batches the dataset with the provided size:

```python
# BATCHING
BATCH_SIZE = 4
x = np.random.sample((100, 2))
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))
```

```
[[ 0.65686128  0.99373963]
 [ 0.69690451  0.32446826]
 [ 0.57148422  0.68688242]
 [ 0.20335116  0.82473219]]
```

## Repeat

Using `.repeat()` we can specify how many times the dataset should be iterated. If no argument is passed, it loops forever, which is usually fine, since you can then control the number of epochs directly in your training loop.
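The post gives no snippet for `.repeat()`, so the following is a small sketch under the same TF 1.x API:

```python
# REPEAT: iterate the two elements twice
x = np.array([[1], [2]])
dataset = tf.data.Dataset.from_tensor_slices(x).repeat(2)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    for _ in range(4):
        print(sess.run(el))  # prints [1] [2] [1] [2]
```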

## Shuffle

`shuffle()` shuffles the dataset; the `buffer_size` argument sets the size of the buffer from which the next element is uniformly sampled. Remember to shuffle the training set so the model does not learn the order of the data:

```python
# SHUFFLE
BATCH_SIZE = 4
x = np.array([[1], [2], [3], [4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.shuffle(buffer_size=100)
dataset = dataset.batch(BATCH_SIZE)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    print(sess.run(el))
```

First run:

```
[[4]
 [2]
 [3]
 [1]]
```

Second run, with a different order:

```
[[3]
 [1]
 [2]
 [4]]
```

## Map

`map()` applies a custom function to each element of the dataset:

```python
# MAP
x = np.array([[1], [2], [3], [4]])
# make a dataset from a numpy array
dataset = tf.data.Dataset.from_tensor_slices(x)
dataset = dataset.map(lambda x: x * 2)
iter = dataset.make_one_shot_iterator()
el = iter.get_next()
with tf.Session() as sess:
    # iterate over every element in the dataset
    for _ in range(len(x)):
        print(sess.run(el))
```

```
[2]
[4]
[6]
[8]
```

Other resources:

• TensorFlow Dataset tutorial: https://www.tensorflow.org/programmers_guide/datasets

• Dataset documentation: https://www.tensorflow.org/api_docs/python/tf/data/Dataset
