[ML] TensorFlow Datasets Summary

2 minute read

TensorFlow Datasets(TFDS)

tensorflow-v2.6.0 tfds-v4.4.0



Load Datasets via tf.keras.datasets

import pprint
import tensorflow as tf


# Turn on memory growth for every GPU that TensorFlow reports.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)


# Show every attribute name exposed by the tf.keras.datasets module.
dataset_module_names = dir(tf.keras.datasets)
pprint.pprint(dataset_module_names)
['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_sys',
 'boston_housing',
 'cifar10',
 'cifar100',
 'fashion_mnist',
 'imdb',
 'mnist',
 'reuters']


from tensorflow.keras.datasets import mnist
from tensorflow.keras.datasets import fashion_mnist


# Load each dataset through the same Keras API and report the array shapes.
# The banner strings are kept verbatim so the printed output is unchanged.
for banner, keras_dataset in (
    ('* ====== MNIST ======', mnist),
    ('\n\n* ====== Fashion MNIST ======', fashion_mnist),
):
    (train_imgs, train_labels), (test_imgs, test_labels) = keras_dataset.load_data()
    print(banner)
    print(f'train_imgs.shape: {train_imgs.shape}')
    print(f'train_labels.shape: {train_labels.shape}')
    print()
    print(f'test_imgs.shape: {test_imgs.shape}')
    print(f'test_labels.shape: {test_labels.shape}')
* ====== MNIST ======
train_imgs.shape: (60000, 28, 28)
train_labels.shape: (60000,)

test_imgs.shape: (10000, 28, 28)
test_labels.shape: (10000,)


* ====== Fashion MNIST ======
train_imgs.shape: (60000, 28, 28)
train_labels.shape: (60000,)

test_imgs.shape: (10000, 28, 28)
test_labels.shape: (10000,)



Install TFDS

  • The stable version, released every few months.
pip install tensorflow-datasets
  • Released every day, contains the latest versions of the datasets.
pip install tfds-nightly



Find available datasets

import tensorflow_datasets as tfds


# List the names of the dataset builders that TFDS knows about.
tfds.list_builders()

tf.keras.datasets보다 더 많은 종류의 데이터셋을 확인할 수 있다.



Dataset Build

1. tfds.load

import tensorflow_datasets as tfds


# No `split` argument here: the returned object is indexed by split name below.
dataset, ds_info = tfds.load(
    name='mnist',
    shuffle_files=False,  # If True, shuffle the order of each split's input files (Default: False)
    with_info=True,       # Also return the tfds.core.DatasetInfo object (Default: False)
)

# Example counts are read from the DatasetInfo metadata.
split_info = ds_info.splits
n_train = split_info['train'].num_examples
n_test = split_info['test'].num_examples

# Shuffle with a buffer as large as the train split, then keep one batch of 8.
train_ds = (
    dataset['train']
    .shuffle(n_train)
    .batch(8)
    .take(1)
)
test_ds = dataset['test']

# Each element is a dict; the batch is read through its 'image' and 'label' keys.
for batch in train_ds:
    imgs = batch['image']
    labels = batch['label']
    print('imgs.shape: ', imgs.shape)
    print('labels.shape: ', labels.shape)
imgs.shape:  (8, 28, 28, 1)
labels.shape:  (8,)


  • as_supervised=True 활용
import tensorflow_datasets as tfds


# Same load as before, but each element is an (imgs, labels) tuple.
dataset, ds_info = tfds.load(
    name='mnist',
    shuffle_files=False,  # If True, shuffle the order of each split's input files (Default: False)
    as_supervised=True,   # Return each item as an (imgs, labels) tuple instead of a dict
    with_info=True,       # Also return the tfds.core.DatasetInfo object (Default: False)
)

# Example counts are read from the DatasetInfo metadata.
split_info = ds_info.splits
n_train = split_info['train'].num_examples
n_test = split_info['test'].num_examples

# Shuffle with a buffer as large as the train split, then keep one batch of 8.
train_ds = (
    dataset['train']
    .shuffle(n_train)
    .batch(8)
    .take(1)
)
test_ds = dataset['test']

# Tuple elements can be unpacked directly in the loop header.
for imgs, labels in train_ds:
    print('imgs.shape: ', imgs.shape)
    print('labels.shape: ', labels.shape)
imgs.shape:  (8, 28, 28, 1)
labels.shape:  (8,)


  • split 활용
import tensorflow_datasets as tfds


# Passing a list to `split` yields one dataset per requested split, in order.
(train_ds, test_ds), ds_info = tfds.load(
    name='mnist',
    split=['train', 'test'],
    shuffle_files=False,  # If True, shuffle the order of each split's input files (Default: False)
    as_supervised=True,   # Return each item as an (imgs, labels) tuple instead of a dict
    with_info=True,       # Also return the tfds.core.DatasetInfo object (Default: False)
)

# Shuffle with a buffer as large as the train split, then keep one batch of 8.
n_train = ds_info.splits['train'].num_examples
train_ds = (
    train_ds
    .shuffle(n_train)
    .batch(8)
    .take(1)
)

for imgs, labels in train_ds:
    print('imgs.shape: ', imgs.shape)
    print('labels.shape: ', labels.shape)
imgs.shape:  (8, 28, 28, 1)
labels.shape:  (8,)


  • numpy 객체로 반환(tfds.as_numpy(ds))
import tensorflow_datasets as tfds


# Passing a list to `split` yields one dataset per requested split, in order.
(train_ds, test_ds), ds_info = tfds.load(
    name='mnist',
    split=['train', 'test'],
    shuffle_files=False,  # If True, shuffle the order of each split's input files (Default: False)
    as_supervised=True,   # Return each item as an (imgs, labels) tuple instead of a dict
    with_info=True,       # Also return the tfds.core.DatasetInfo object (Default: False)
)

# Shuffle with a buffer as large as the train split, then keep one batch of 8.
n_train = ds_info.splits['train'].num_examples
train_ds = (
    train_ds
    .shuffle(n_train)
    .batch(8)
    .take(1)
)

# Wrap the dataset with tfds.as_numpy and report the type of each yielded item.
numpy_batches = tfds.as_numpy(train_ds)
for imgs, labels in numpy_batches:
    print('imgs type: ', type(imgs))
    print('labels type: ', type(labels))
imgs type:  <class 'numpy.ndarray'>
labels type:  <class 'numpy.ndarray'>



2. tfds.builder

tfds.load is a thin wrapper around tfds.core.DatasetBuilder.
You can get the same output using the tfds.core.DatasetBuilder API directly.

import tensorflow_datasets as tfds


# Look up the builder for Fashion-MNIST by name.
builder = tfds.builder('fashion_mnist')

# Step 1: write the tfrecord files (does nothing when they already exist).
builder.download_and_prepare()

# Step 2: build the `tf.data.Dataset` objects, one per requested split.
train_ds, test_ds = builder.as_dataset(
    split=['train', 'test'],
    shuffle_files=True,
)

tfds.load와 달리 with_info와 같은 param이 없다는 차이점이 있다. (DatasetInfo는 builder.info로 확인할 수 있다.)


3. CLI

tfds build ${DatasetName}



Visualization

tfds.as_dataframe

import tensorflow_datasets as tfds


# Load only the train split together with its DatasetInfo.
dataset, ds_info = tfds.load(
    'mnist',
    split='train',
    with_info=True,
)
# Render the first six examples as a DataFrame.
preview = dataset.take(6)
tfds.as_dataframe(preview, ds_info)


tfds.show_examples

import tensorflow_datasets as tfds


# Load only the train split together with its DatasetInfo.
dataset, ds_info = tfds.load(
    'mnist',
    split='train',
    with_info=True,
)
# Plot sample examples; the return value is kept in `fig`.
fig = tfds.show_examples(dataset, ds_info)



Reference

Leave a comment