# Source code for carpedm.data.providers

#
# Copyright (c) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.


"""Data providers for Task input function.

This module provides a generic interface for providing data useable
by machine learning algorithms.

A provider may either (1) receive data from the method that initialized
it, or (2) receive a directory path where the data to load is stored.

Todo:
    * Generator
        * numpy
        * pandas DataFrame

"""
import os
import functools
import abc

import tensorflow as tf

from carpedm.data import ops, preproc


class DataProvider(abc.ABCMeta('ABC', (object,), {})):
    """Data provider abstract class.

    Subclasses must implement ``make_batch`` and the ``format``
    property. The base class is created through ``abc.ABCMeta`` directly
    so that abstract members are enforced on both Python 2 and Python 3
    (a ``__metaclass__`` class attribute is ignored by Python 3).
    """

    def __init__(self, target_id):
        """Initializer.

        Args:
            target_id (str): Feature to use as the target value.

        """
        self._target_id = target_id

    @abc.abstractmethod
    def make_batch(self, batch_size):
        """Generator method that returns a new batch with each call.

        Args:
            batch_size (int): Number of examples per batch.

        Returns:
            dict: Batch features.
            array_like: Batch targets.

        """

    @property
    @abc.abstractmethod
    def format(self):
        """str: Data format identifier.

        The default implementation returns ``'channels_last'``;
        subclasses must override this property.
        """
        return 'channels_last'


class TFDataSet(DataProvider):
    """TensorFlow DataSet provider from TFRecords stored on disk."""

    def __init__(self,
                 target_id,
                 data_dir,
                 subset,
                 num_examples,
                 pad_shape,
                 sparse_labels):
        """Initializer.

        Extends DataProvider.

        Args:
            target_id (str): Feature to use as the target value.
            data_dir (str): Directory containing (sharded) tfrecord
                files.
            subset (str): One of {'train', 'dev', 'test'}.
            num_examples (int): Number of examples in subset.
            pad_shape (tuple): Shape (height, width) of padded images.
            sparse_labels (bool): If true, a serialized sparse version
                of the character ID sequence is added to each example
                under ``'image/seq/char/id_sparse'``.
        """
        self.data_dir = data_dir
        self.subset = subset
        self.num_examples = num_examples
        self.pad_shape = pad_shape
        self.sparse_labels = sparse_labels
        super(TFDataSet, self).__init__(target_id)

        height = self.pad_shape[0]
        width = self.pad_shape[1]
        channels = tf.Dimension(1)  # Converting to grayscale in _preproc
        # Per-feature shapes handed to ``padded_batch`` in ``make_batch``.
        self._padding = {
            "image/data": tf.TensorShape([tf.Dimension(height),
                                          tf.Dimension(width),
                                          channels]),
            "image/height": tf.TensorShape([]),
            "image/width": tf.TensorShape([]),
            "image/char/count": tf.TensorShape([]),
            "image/line/count": tf.TensorShape([]),
            "image/mask/char": tf.TensorShape([
                tf.Dimension(height), tf.Dimension(width), 1
            ]),
            "image/mask/line": tf.TensorShape([
                tf.Dimension(height), tf.Dimension(width), 1
            ]),
            "image/seq/char/id": tf.TensorShape([None]),
            "image/seq/char/id_sparse": tf.TensorShape([None]),
            "image/seq/char/bbox": tf.TensorShape([None, 4]),
            "image/seq/line/bbox": tf.TensorShape([None, 4]),
        }

    @property
    def format(self):
        """str: Data format identifier; always ``'channels_last'``."""
        return 'channels_last'

    def make_batch(self, batch_size, single_char=False):
        """Build the TFRecord input pipeline and return one batch.

        Args:
            batch_size (int): Number of examples per batch. Also used
                as ``num_parallel_calls`` for the parsing map.
            single_char (bool): Requests distortion for the 'train'
                subset. Currently has no effect because ``_parser``
                does not implement distortion.

        Returns:
            dict: Batch features keyed by feature name, with the
                target feature removed.
            Batch targets (the feature stored under the target id), or
                None if that feature is absent.

        Raises:
            ValueError: If ``subset`` is not 'train', 'dev' or 'test'.

        """
        filenames = self._get_filenames()
        dataset = tf.data.TFRecordDataset(filenames).repeat()

        distort = single_char and self.subset == 'train'
        dataset = dataset.map(
            functools.partial(self._parser, distort=distort),
            num_parallel_calls=batch_size)

        if self.subset == 'train':
            min_q_exs = 0.4 * self.num_examples
            dataset = dataset.shuffle(
                buffer_size=int(min_q_exs + 3 * batch_size)
            )

        # ``self.feat_keys`` is assigned inside ``_parser``; this relies
        # on ``Dataset.map`` having invoked the parser by this point.
        padded_shapes = tuple(self._padding[k] for k in self.feat_keys)
        dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)

        iterator = dataset.make_one_shot_iterator()
        batch = iterator.get_next()

        features = dict(zip(self.feat_keys, batch))
        # Sparse features were serialized in ``_parser`` so that they
        # could be batched; restore them here.
        for key in list(features):
            if 'sparse' in key:
                features[key] = tf.deserialize_many_sparse(features[key],
                                                           dtype=tf.int32)
        labels = features.pop(self._target_id, None)

        return features, labels

    def _get_filenames(self):
        """Return paths of the files in ``data_dir`` for this subset.

        A file is selected when its name contains the subset string.
        The result is sorted because ``os.listdir`` returns entries in
        arbitrary order, which would otherwise make the shard read
        order differ between runs and machines.

        Returns:
            list of str: Sorted file paths joined with ``data_dir``.

        Raises:
            ValueError: If ``subset`` is not 'train', 'dev' or 'test'.

        """
        if self.subset not in ('train', 'dev', 'test'):
            raise ValueError('Invalid data subset "%s"' % self.subset)
        return sorted(os.path.join(self.data_dir, name)
                      for name in os.listdir(self.data_dir)
                      if self.subset in name)

    def _parser(self, serialized, distort=False):
        """Parse one serialized example into a tuple of feature tensors.

        Side effect: stores the feature key order in ``self.feat_keys``
        so that ``make_batch`` can match shapes and names to the
        returned tuple.

        Args:
            serialized: Serialized example passed in by ``Dataset.map``.
            distort (bool): Currently unused; distortion is not
                implemented.

        Returns:
            tuple: Feature values, in the same order as
                ``self.feat_keys``.

        """
        tensor_dict = ops.parse_sequence_example(serialized)
        tensor_dict['image/data'] = self._preproc(tensor_dict['image/data'])
        # The image and both bounding-box sequences are adjusted
        # together with respect to ``pad_shape``.
        (tensor_dict['image/data'],
         tensor_dict['image/seq/char/bbox'],
         tensor_dict['image/seq/line/bbox']) = preproc.pad_borders_or_shrink(
            tensor_dict['image/data'], tensor_dict['image/seq/char/bbox'],
            tensor_dict['image/seq/line/bbox'], self.pad_shape)

        if self.sparse_labels:
            sparse_ids = ops.sparsify_label(tensor_dict['image/seq/char/id'],
                                            tensor_dict['image/char/count'])
            tensor_dict['image/seq/char/id_sparse'] = tf.serialize_sparse(
                sparse_ids)

        # TODO: apply image distortion when ``distort`` is true.
        self.feat_keys, features = zip(*tensor_dict.items())
        return features

    def _preproc(self, image):
        """Convert ``image`` to grayscale, then normalize it."""
        return preproc.normalize(preproc.convert_to_grayscale(image))