# Source code for carpedm.data.io

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
#
# The DataWriter helper class of this module is based on the TensorFlow
# "im2txt" models input pipeline, so here is their license:
#
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Input and output.

This module provides functionality for reading and writing data.

Todo:
    * Tests
        * DataWriter
        * CSVParser

"""
import csv
import functools
import os
import random
import sys
from datetime import datetime
from threading import Thread

import numpy as np

from carpedm.data import ops
from carpedm.data.util import Character, ImageMeta
from carpedm.data.util import image_path


class DataWriter(object):
    """Utility for writing data to disk in various formats.

    Work is split across threads: each thread receives a contiguous
    range of the images and writes it with the writer selected by
    ``format_out``.

    Attributes:
        available_formats (list): The available formats.

    References:
        Heavy modification of ``_process_dataset`` in the
        `input pipeline`_ for the TensorFlow `im2txt` models.

    ..  _input pipeline: https://github.com/tensorflow/models/blob/
        master/research/im2txt/im2txt/data/build_mscoco_data.py

    """

    available_formats = ['tfrecords', 'jpeg', 'jpg', 'png']

    def __init__(self, format_out, images, image_shape, vocab,
                 chunk, character, line, label, bbox, subdirs):
        """Initializer.

        ``image_shape``, ``chunk``, ``character``, ``vocab``, ``line``,
        ``label`` and ``bbox`` are forwarded unchanged to
        ``ImageMeta.generate_features`` for every image.

        Args:
            format_out (str): Output format; must be one of
                ``available_formats``.
            images (list of ImageMeta): Images to write.
            image_shape (tuple or None): Passed to feature generation.
            vocab (Vocabulary): Passed to feature generation; also used
                to convert character IDs back to characters when
                writing raw images.
            chunk (bool): Passed to feature generation.
            character (bool): Passed to feature generation; also
                required for labels and bounding boxes to be written
                to the raw-image CSV.
            line (bool): Passed to feature generation.
            label (bool): Passed to feature generation; together with
                ``character`` enables the ``unicodes`` CSV column.
            bbox (str or None): If not None, include bbox in features
                as unit (e.g. 'pixel', 'ratio' [of image]))
            subdirs (bool): Generate a subdirectory for each class.

        """
        # Every writer shares the signature
        # (index, num_shards, fname_prefix, ranges, num_examples).
        self._writer_types = {
            'tfrecords': self._write_tfrecords,
            'jpeg': functools.partial(self._write_raw_images,
                                      image_format='.jpeg'),
            'jpg': functools.partial(self._write_raw_images,
                                     image_format='.jpg'),
            'png': functools.partial(self._write_raw_images,
                                     image_format='.png')
        }

        assert format_out in self.available_formats
        self._write = self._writer_types[format_out]
        self._images = images
        self._shape = image_shape
        self._vocab = vocab
        self._chunk = chunk
        self._char = character
        self._line = line
        self._label = label
        self._bbox = bbox
        self._subdirs = subdirs

    def write(self, fname_prefix, num_threads, num_shards):
        """Write data to disk.

        Args:
            fname_prefix (str): Path base for data files.
            num_threads (int): Number of threads to run in parallel.
            num_shards (int): Total number of shards to write, if any.

        Returns:
            int: Total number of examples written.

        """
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is
        # the equivalent dtype.
        spacing = np.linspace(0, len(self._images),
                              num_threads + 1).astype(int)
        # Plain Python ints keep the log message readable and are valid
        # arguments for ``range`` in the writer threads.
        ranges = [[int(first), int(last)]
                  for first, last in zip(spacing[:-1], spacing[1:])]
        # One slot per thread; each writer stores its own count here.
        num_examples = [0] * num_threads
        print("Launching %d threads for spacings: %s" % (
            num_threads, ranges
        ))
        threads = []
        for thread_ix in range(len(ranges)):
            args = (thread_ix, num_shards, fname_prefix, ranges, num_examples)
            t = Thread(target=self._write, args=args)
            t.start()
            threads.append(t)

        # Try joining threads with TensorFlow
        try:
            import tensorflow as tf
        except ImportError:
            for thread in threads:
                thread.join()
        else:
            coord = tf.train.Coordinator()
            coord.join(threads)

        print("%s: Finished writing all %d image-target pairs." %
              (datetime.now(), len(self._images)))
        return sum(num_examples)

    def _write_tfrecords(self, index, num_shards,
                         fname_prefix, ranges, num_examples):
        """Write TFRecords.

        Emits a warning and writes nothing if TensorFlow is not
        installed.

        Args:
            index (int): Thread identifier in [0, len(ranges)).
            num_shards (int): Total number of shards over all threads;
                must be divisible by the number of threads.
            fname_prefix (str): Path base for the shard files.
            ranges (list): Pairs of integers specifying the ranges of
                the dataset to process in parallel
            num_examples (list): Structure for storing number of examples
                written. The count for this thread is stored at
                ``num_examples[index]``; nothing is returned.

        """
        try:
            import tensorflow as tf
        except ImportError:
            import warnings
            warnings.warn("The tfrecords output format is not available. "
                          "Please install tensorflow if you wish to use it.")
            return

        # ``tf.python_io`` no longer exists in TensorFlow 2; prefer the
        # ``tf.io`` name and fall back for older releases.
        try:
            record_writer = tf.io.TFRecordWriter
        except AttributeError:
            record_writer = tf.python_io.TFRecordWriter

        num_threads = len(ranges)
        assert not num_shards % num_threads
        num_shards_per_batch = int(num_shards / num_threads)
        # Split this thread's range evenly among its shards.
        shard_ranges = np.linspace(ranges[index][0],
                                   ranges[index][1],
                                   num_shards_per_batch + 1).astype(int)
        written_count = 0
        for s in range(num_shards_per_batch):
            shard = index * num_shards_per_batch + s
            output_file = "%s-%.5d-of-%.5d" % (fname_prefix, shard+1,
                                               num_shards)
            images_in_shard = [
                self._images[i]
                for i in range(int(shard_ranges[s]),
                               int(shard_ranges[s + 1]))
            ]

            shard_counter = 0
            writer = record_writer(output_file)
            # Close the shard file even if feature generation fails.
            try:
                for img_meta in images_in_shard:
                    feature_dicts = img_meta.generate_features(
                        image_shape=self._shape, chunk=self._chunk,
                        character=self._char, vocab=self._vocab,
                        line=self._line, label=self._label,
                        bbox=self._bbox)
                    for fd in feature_dicts:
                        example = ops.to_sequence_example(fd)
                        if example is not None:
                            writer.write(example.SerializeToString())
                            written_count += 1
                            shard_counter += 1

                    if not shard_counter % 1000:
                        print("%s [thread %d]: Processed %d items in "
                              "thread batch."
                              % (datetime.now(), index, shard_counter))
                        sys.stdout.flush()
            finally:
                writer.close()

            print("%s [thread %d]: Wrote %d image-target pairs to %s" %
                  (datetime.now(), index, shard_counter, output_file))
            sys.stdout.flush()

        num_examples[index] = written_count

    @staticmethod
    def _image_id(filepath):
        """Return the file name of ``filepath`` without a '.jpg' suffix.

        ``str.strip('.jpg')`` is deliberately not used: it strips any of
        the characters '.', 'j', 'p', 'g' from both ends of the name
        (e.g. 'page_001.jpg' would become 'age_001').
        """
        name = filepath.split('/')[-1]
        suffix = '.jpg'
        if name.endswith(suffix):
            name = name[:-len(suffix)]
        return name

    def _write_raw_images(self, index, num_shards, fname_prefix, ranges,
                          num_examples, image_format):
        """Write structured directories of images.

        Images are saved under ``fname_prefix`` (in a per-label
        subdirectory when subdirs, character and label are all enabled)
        and described by one row each in ``targets_<index>.csv``.

        Args:
            index (int): Thread identifier in [0, len(ranges)).
            num_shards (int): Unused; present so that all writers share
                one signature.
            fname_prefix (str): Output directory.
            ranges (list): Pairs of integers specifying the ranges of
                the dataset to process in parallel.
            num_examples (list): Structure for storing number of examples
                written; this thread's count goes to
                ``num_examples[index]``.
            image_format (str): File extension, including the dot.

        """
        from PIL import Image

        os.makedirs(fname_prefix, exist_ok=True)

        # NOTE(review): the last header says "h:w" but the values below
        # are written as x:y:(xmax - xmin):(ymax - ymin), i.e. width
        # before height. Left unchanged because consumers may rely on
        # the header text; confirm which order is intended.
        fieldnames = [
            'file_id',
            'image_id',
            'unicodes',
            'bounding_boxes (x:y:h:w)'
        ]

        written_count = 0
        first, last = ranges[index]
        csv_path = os.path.join(fname_prefix, 'targets_{}.csv'.format(index))

        # newline='' is what the csv module requires of files it writes
        # to; the ``with`` block closes the file even on error.
        with open(csv_path, 'w+', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
                                    delimiter=',')
            writer.writeheader()

            def write_features(meta, feats, count):
                """Save one image and append its row to the CSV."""
                row_dict = dict()
                row_dict['image_id'] = self._image_id(meta.filepath)

                im = Image.fromarray(feats['image/data'])
                im_id = '{}_{}'.format(index, count)
                row_dict['file_id'] = im_id

                dir_img = fname_prefix
                if self._char and self._label:
                    label = "_".join([self._vocab.id_to_char(c)
                                      for c in feats['image/seq/char/id']])
                    if self._subdirs:
                        # One directory per distinct label.
                        dir_img = os.path.join(fname_prefix, label)
                        os.makedirs(dir_img, exist_ok=True)
                    row_dict['unicodes'] = label
                fname = os.path.join(dir_img, im_id + image_format)

                if self._char and self._bbox:
                    xmin = feats['image/seq/char/bbox/xmin']
                    ymin = feats['image/seq/char/bbox/ymin']
                    xmax = feats['image/seq/char/bbox/xmax']
                    ymax = feats['image/seq/char/bbox/ymax']

                    row_dict['bounding_boxes (x:y:h:w)'] = "_".join(
                        ["{}:{}:{}:{}".format(xmin[i],
                                              ymin[i],
                                              xmax[i] - xmin[i],
                                              ymax[i] - ymin[i])
                         for i in range(feats['image/char/count'])]
                    )

                im.save(fname)
                writer.writerow(row_dict)

            for img in self._images[first:last]:
                feature_dicts = img.generate_features(
                    image_shape=self._shape, chunk=self._chunk,
                    character=self._char, vocab=self._vocab,
                    line=self._line, label=self._label, bbox=self._bbox)
                for fd in feature_dicts:
                    write_features(meta=img, feats=fd, count=written_count)
                    written_count += 1

                if not written_count % 1000:
                    print("%s [thread %d]: Processed %d items in thread batch."
                          % (datetime.now(), index, written_count))
                    sys.stdout.flush()

        num_examples[index] = written_count


class CSVParser(object):
    """Utility class for parsing coordinate CSV files.

    The parser wraps a single ``csv.DictReader``; every ``parse_*``
    method consumes rows from it, so each instance can be parsed once.
    """

    # Column headers looked up in each CSV row.
    x = 'X'
    y = 'Y'
    w = 'Width'
    h = 'Height'
    image = 'Image'
    label = 'Unicode'
    bid = 'Block ID'
    cid = 'Char ID'

    def __init__(self, csv_file, data_dir, bib_id):
        """Initializer.

        Args:
            csv_file (file): Opened CSV file to parse.
            data_dir (str): Path to directory containing raw data.
            bib_id (str): Bibliography ID.
        """
        # Columns beyond the header are collected under 'comments'.
        self._reader = csv.DictReader(csv_file, restkey='comments')
        self._data_dir = data_dir
        self._bib_id = bib_id

    def character(self, row):
        """Convert CSV row to a Character object.

        Args:
            row (dict): One row as produced by ``csv.DictReader``.

        Returns:
            Character: The character described by ``row``.
        """
        # A byte order mark at the start of the file ends up in the
        # first header name; remove it so the column lookups succeed.
        row = {k.strip('\ufeff'): v for k, v in row.items()}

        return Character(
            x=row[self.x], y=row[self.y], w=row[self.w], h=row[self.h],
            label=row[self.label], block_id=row[self.bid],
            char_id=row[self.cid], image_id=image_path(self._data_dir,
                                                       self._bib_id,
                                                       row[self.image])
        )

    def characters(self):
        """Generates rest of characters in CSV.

        Yields:
            :obj:`carpedm.data.util.Character`: The next character.
        """
        for row in self._reader:
            yield self.character(row)

    def _next_character(self):
        """Return the next character, or None when no rows remain."""
        row = next(self._reader, None)
        if row is None:
            return None
        return self.character(row)

    def parse_pages(self):
        """Generate metadata for full page images.

        Includes every character on page. Characters not in character
        set or vocabulary will be labeled as unknown when converted to
        integer IDs.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Page image meta data. Empty if the CSV has no rows.
        """
        char = self._next_character()
        if char is None:
            # Nothing to parse; avoid StopIteration on an empty file.
            return []

        pages = []
        image_id = char.image_id
        image = ImageMeta(
            filepath=image_id, full_image=True, first_char=char)
        for char in self.characters():
            if char.image_id == image_id:
                image.add_char(char)
            else:
                # A different image ID starts a new page.
                pages.append(image)
                image_id = char.image_id
                image = ImageMeta(
                    filepath=image_id, full_image=True, first_char=char)
        # Add last image.
        pages.append(image)
        return pages

    def parse_lines(self):
        """Generate metadata for vertical lines of characters.

        Characters not in character set or vocabulary will be labeled as
        unknown when converted to integer IDs.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Line image meta data. Empty if the CSV has no rows.
        """
        c = self._next_character()
        if c is None:
            # Nothing to parse; avoid StopIteration on an empty file.
            return []

        lines = []
        image = ImageMeta(filepath=c.image_id, first_char=c)

        for c in self.characters():
            # ``valid_char`` decides whether ``c`` continues the current
            # line; otherwise ``c`` becomes the first char of a new one.
            if image.valid_char(c, same_line=True):
                image.add_char(c)
            else:
                lines.append(image)
                image = ImageMeta(filepath=c.image_id, first_char=c)
        # Add last line.
        lines.append(image)
        return lines

    def parse_sequences(self, charset, len_min, len_max):
        """Generate metadata for images of character sequences.

        Only includes sequences of chars in the desired character set.
        If ``len_min == len_max``, sequence length is deterministic, else
        each sequence is of random length from [len_min, len_max].

        Args:
            charset (CharacterSet): The character set.
            len_min (int): Minimum sequence length.
            len_max (int): Maximum sequence length.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Sequence image meta data.

        """
        sequences = []
        # Target length of the sequence currently being built.
        length = random.randint(len_min, len_max)
        image = None
        for c in self.characters():
            if image is None:
                # No open sequence: start one only on an in-set char.
                if charset.in_charset(c.label):
                    image = ImageMeta(filepath=c.image_id, first_char=c)
            elif (image.valid_char(c, same_line=True)
                  and charset.in_charset(c.label)
                  and image.num_chars < length):
                image.add_char(c)
            else:
                # The open sequence ends here; keep it only if its size
                # is acceptable, and draw a new target length if so.
                if len_min <= image.num_chars <= len_max:
                    sequences.append(image)
                    length = random.randint(len_min, len_max)
                if charset.in_charset(c.label):
                    image = ImageMeta(filepath=c.image_id, first_char=c)
                else:
                    image = None
        # Add the sequence still open at the end of the file, if valid.
        if image is not None and len_min <= image.num_chars <= len_max:
            sequences.append(image)
        return sequences

    def parse_characters(self, charset):
        """Generate metadata for single character images.

        A more efficient implementation of ``parse_sequences`` when
        ``image_scope='seq'`` and ``seq_len=1``.

        Only characters in the character set are included.

        Args:
            charset (CharacterSet): Character set.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Single character image meta data.
        """
        return [ImageMeta(filepath=c.image_id, first_char=c)
                for c in self.characters() if charset.in_charset(c.label)]