Source code for carpedm.data.util

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
#
#
# Portions of this module are based on or taken from the TensorFlow
# models "im2text" data pipeline, so here is their license.
#
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Data utilities.

This module provides utility methods/classes used by other data modules.

Todo:
    * Tests
        * ``generate_features``
    * Refactor ``generate_features``
    * Fix ``class_mask`` for overlapping characters.
"""
import itertools
import os
from re import match

import numpy as np
import tensorflow as tf

from carpedm.data import ops


[docs]def image_path(data_dir, bib_id, image_id): """Generate path to a specified image. Args: data_dir (str): Path to top-level data directory. bib_id (str): Bibliography ID. image_id (str): Image ID. Returns: String """ return os.path.join(data_dir, bib_id, 'images', image_id + '.jpg')
[docs]class BBox(object): """Bounding box helper class.""" def __init__(self, xmin, xmax, ymin, ymax): """Initializer. Args: xmin: xmax: ymin: ymax: """ self.xmin = xmin self.xmax = xmax self.ymin = ymin self.ymax = ymax self.box = (self.xmin, self.xmax, self.ymin, self.ymax) def __getitem__(self, item): return self.box[item] def __len__(self): return len(self.box)
[docs]class Character(object): """Helper class for storing a single character.""" def __init__(self, label, image_id, x, y, block_id, char_id, w, h): """Initializer. Argument order matches csv format. Args: label (str): Unicode-like label for the character. image_id (str): Identifier (e.g. filepath) for image from which the character comes. x (str or int): X-coordinate (column) of character's top- left corner, relative to left (col[0]) of parent image. y (str or int): Y-coordinate (row) of character's top-left corner, relative to top (row[0]) of parent image. block_id (str): ID for the character's block. char_id (str): Unique ID for character (token) in an image. w (str or int): Width (in pixels). h (str or int): Height (in pixels). """ assert match(r'^U\+[0-9A-Fa-f]{4,5}$', label), ( "Invalid label %s" % label ) assert match(r'^B[0-9]{4}$', block_id), ( "Invalid block ID %s" % block_id ) assert match(r'^C[0-9]{4}$', char_id), ( "Invalid character ID %s" % char_id ) self.label = label self.image_id = image_id self.block_id = block_id self.id = char_id self.x = int(x) self.y = int(y) self.w = int(w) self.h = int(h)
[docs]class ImageTFOps(object): """Helper class for decoding and resizing images.""" _sess = tf.Session() _encoded_jpeg = tf.placeholder(dtype=tf.string) _decode_jpeg = tf.image.decode_jpeg(_encoded_jpeg, channels=3) _image_orig = tf.placeholder(dtype=tf.uint8, shape=(None, None, 3)) _shape = tf.placeholder(dtype=tf.int32, shape=(2,)) _image_resize = tf.cast(tf.image.resize_images(_image_orig, size=_shape), tf.uint8) def decode_jpeg(self, encoded_jpeg): image = self._sess.run(self._decode_jpeg, feed_dict={self._encoded_jpeg: encoded_jpeg}) assert len(image.shape) == 3 assert image.shape[2] == 3 return image def resize(self, image, shape): image = self._sess.run( self._image_resize, feed_dict={ self._image_orig: image, self._shape: shape } ) return image
[docs]class ImageMeta(object): """Class for storing and manipulating image metadata.""" _image_helper = ImageTFOps() def __init__(self, filepath, full_image=False, first_char=None): """Initializer Args: filepath (str): Path to parent image. full_image (bool): Use full parent image. first_char (Character or None): First character. """ self.filepath = filepath self._full = full_image self._labels = [] self._blocks = [] self._ids = [] self._x_raw = [] self._y_raw = [] self._w_raw = [] self._h_raw = [] self._w_full = None self._h_full = None self._out_shape = (None, None) if first_char: self._x_raw.append(first_char.x) self._y_raw.append(first_char.y) self._w_raw.append(first_char.w) self._h_raw.append(first_char.h) self._labels.append(first_char.label) self._blocks.append(first_char.block_id) self._ids.append(first_char.id) @property def full_w(self): """Width (in pixels) of full raw parent image. Returns: int: The return value. """ if self._w_full is None: im = self._load_image() self._w_full = im.shape[1] self._h_full = im.shape[0] return self._w_full @property def full_h(self): """Height (in pixels) of full raw parent image. Returns: int: The return value. """ if self._h_full is None: im = self._load_image() self._h_full = im.shape[0] self._w_full = im.shape[1] return self._h_full @property def xmin(self): """Image's minimum x-coordinate (column) in raw parent image. Returns: int: The return value. """ if self._full or len(self._x_raw) == 0: return 0 else: return min(self._x_raw) @property def xmax(self): """Image's maximum x-coordinate (column) in raw parent image. Returns: int: The return value. """ if self._full or len(self._char_xmax) == 0: return self.full_w else: return max(self._char_xmax) @property def ymin(self): """Image's minimum y-coordinate (row) in raw parent image. Returns: int: The return value. """ if self._full or len(self._y_raw) == 0: return 0 else: return min(self._y_raw) @property def ymax(self): """Image's maximum y-coordinate (row) in raw parent image. Returns: int: The return value. """ if self._full or len(self._char_ymax) == 0: return self.full_h else: return max(self._char_ymax) @property def width(self): """Width (in pixels) in full parent image original scale. Returns: int: The return value. """ return self.xmax - self.xmin @property def height(self): """Height (in pixels) in full parent image original scale. Returns: int: The return value. """ return self.ymax - self.ymin @property def num_chars(self): """Number of characters in the image. Returns: int: The return value. """ return len(self._labels) @property def char_labels(self): """Character labels Returns: :obj:`list` of :obj:`str`: The return value. """ return self._labels @property def char_bboxes(self): """Bounding boxes for characters. Returned bounding boxes are relative to (:meth:`xmin`, :meth:`ymin`). Returns: :obj:`list` of :obj:`carpedm.data.util.BBox`: The return values. """ scale_h, scale_w = self.new_shape(self._out_shape, ratio=True) adjusted_bboxes = [ BBox(xmin=(self._x_raw[i] - self.xmin) * scale_w, xmax=(self._char_xmax[i] - self.xmin) * scale_w, ymin=(self._y_raw[i] - self.ymin) * scale_h, ymax=(self._char_ymax[i] - self.ymin) * scale_h) for i in range(self.num_chars) ] return adjusted_bboxes @property def line_bboxes(self): """Bounding boxes for lines in the image, Note: Currently only meaningful when using full page image. Returns: :obj:`list` of :obj:`BBox`: The return values. """ result = [] if self._full: bboxes = self.char_bboxes b = bboxes[0] xmin, xmax, ymin, ymax = [b.xmin], [b.xmax], [b.ymin], [b.ymax] for b in bboxes[1:]: if not ops.in_line(xmin_line=xmin, xmax_line=xmax, ymin_line=min(ymin), xmin_new=b.xmin, xmax_new=b.xmax, ymax_new=b.ymax): result.append(BBox(min(xmin), max(xmax), min(ymin), max(ymax))) xmin, xmax = [b.xmin], [b.xmax] ymin, ymax = [b.ymin], [b.ymax] else: xmin.append(b.xmin) xmax.append(b.xmax) ymin.append(b.ymin) ymax.append(b.ymax) # Add last line result.append(BBox(min(xmin), max(xmax), min(ymin), max(ymax))) return result @property def char_mask(self): """Generate pseudo-pixel-level character mask. Pixels within character bounding boxes are assigned to positive class (1), others assigned negative class (0). Returns: :obj:`numpy.ndarray`: Character mask of shape (height, width, 1) """ mask = np.zeros(self._out_shape[:2]) for b in self.char_bboxes: mask[b.ymin:b.ymax, b.xmin:b.xmax] = 1 mask = np.expand_dims(mask, 2) return mask.astype(dtype=np.float32) @property def line_mask(self): """Generate pseudo-pixel-level line mask. Pixels within line bounding boxes are assigned to positive class (1), others assigned negative class (0). Returns: :obj:`numpy.ndarray`: Line mask of shape (height, width, 1) """ mask = np.zeros(self._out_shape[:2]) for b in self.line_bboxes: mask[b.ymin:b.ymax, b.xmin:b.xmax] = 1 mask = np.expand_dims(mask, 2) return mask.astype(dtype=np.float32)
[docs] def class_mask(self, vocab): """Generate a character class image mask. Note: Where characters overlap, the last character added is arbitrarily the one that will be represented in the mask. This should be fixed in a future version. Args: vocab (Vocabulary): The vocabulary for converting to ID. Returns: :obj:`numpy.ndarray`: Class mask of shape (height, width, 1) """ mask = np.zeros(self._out_shape[:2]) for label, b in zip(self.char_labels, self.char_bboxes): mask[b.ymin:b.ymax, b.xmin:b.xmax] = vocab.char_to_id(label) mask = np.expand_dims(mask, 2) return mask.astype(dtype=np.float32)
[docs] def generate_features(self, image_shape, vocab, chunk, character, line, label, bbox): """ Args: image_shape (tuple or None): Shape (height, width) to which images are resized, or the size of each chunk if chunks == True. vocab (Vocabulary or None): Vocabulary for converting characters to IDs. Required ``if character and label``. chunk (bool): Instead of using the original image, return a list of image chunks and corresponding features extracted from the original image on a regular grid. The original image is padded to divide evenly by chunk shape. character (bool): Include character info (ID, bbox). line (bool): Include line info (bbox) in features. label (bool): Include label IDs in features. bbox (str or None): If not None, include bbox in features as unit (e.g. 'pixel', 'ratio' [of image])) Returns: :obj:`list` of :obj:`dict`: Feature dictionaries. """ if character and label: assert vocab, "Must provide vocab." features = {} shape = self.new_shape(image_shape) full_shape = None if chunk else shape im = self.load_image(full_shape) h, w, c = im.shape features['image/data'] = im features['image/height'] = h features['image/width'] = w features['image/channels'] = c if character: features['image/char/count'] = self.num_chars if label: features['image/seq/char/id'] = [vocab.char_to_id(c) for c in self.char_labels] if bbox: bboxes = self.char_bboxes xmin, ymin, xmax, ymax = ops.seq_norm_bbox_values(bboxes, h, w) features['image/seq/char/bbox/xmin'] = xmin features['image/seq/char/bbox/ymin'] = ymin features['image/seq/char/bbox/xmax'] = xmax features['image/seq/char/bbox/ymax'] = ymax if line and self._full: if bbox: bboxes = self.line_bboxes features['image/line/count'] = len(bboxes) xmin, ymin, xmax, ymax = ops.seq_norm_bbox_values(bboxes, h, w) features['image/seq/line/bbox/xmin'] = xmin features['image/seq/line/bbox/ymin'] = ymin features['image/seq/line/bbox/xmax'] = xmax features['image/seq/line/bbox/ymax'] = ymax if chunk: result = [] img = features['image/data'] h_diff = features['image/height'] % shape[0] w_diff = features['image/width'] % shape[1] padding = ((0, shape[0] - h_diff), (0, shape[1] - w_diff), (0, 0)) img = np.pad(img, pad_width=padding, mode='mean') ys = np.arange(0, img.shape[0], shape[0]) xs = np.arange(0, img.shape[1], shape[1]) # top left of each block coordinates = list(itertools.product(ys, xs)) for coord in coordinates: y1, y2 = coord[0], coord[0] + shape[0] x1, x2 = coord[1], coord[1] + shape[1] region = dict() region['image/data'] = img[y1:y2, x1:x2, :] region['image/height'] = shape[0] region['image/width'] = shape[1] region['image/channels'] = img.shape[2] if character: char_ixs = ops.ixs_in_region( features['image/seq/char/bbox'], y1, y2, x1, x2) characters = list(map(lambda i: self.char_labels[i], char_ixs)) if label: region['image/seq/char/id'] = [vocab.char_to_id(c) for c in characters] region['image/char/count'] = len(char_ixs) if bbox: bboxes = list(map(lambda i: features['image/seq/char/bbox'][i], char_ixs)) xmin, ymin, xmax, ymax = ops.seq_norm_bbox_values( bboxes, height=shape[0], width=shape[1]) region['image/seq/char/bbox/xmin'] = xmin region['image/seq/char/bbox/ymin'] = ymin region['image/seq/char/bbox/xmax'] = xmax region['image/seq/char/bbox/ymax'] = ymax if line and self._full: line_ixs = ops.ixs_in_region(features['image/seq/line/bbox'], y1, y2, x1, x2) if bbox: bboxes = list(map( lambda i: features['image/seq/line/bbox'][i], line_ixs)) xmin, ymin, xmax, ymax = ops.seq_norm_bbox_values( bboxes, height=shape[0], width=shape[1]) region['image/seq/char/bbox/xmin'] = xmin region['image/seq/char/bbox/ymin'] = ymin region['image/seq/char/bbox/xmax'] = xmax region['image/seq/char/bbox/ymax'] = ymax result.append(region) else: result = [features] return result
[docs] def load_image(self, shape): """Load image and resize to shape. If ``shape`` is None or (None, None), original size is maintained. Args: shape (tuple or None): Output dimensions (height, width). Returns: :obj:`numpy.ndarray`: Resized image. """ if shape: assert len(shape) == 2 new_shape = self.new_shape(shape) image = self._load_image() image = image[self.ymin:self.ymax, self.xmin:self.xmax, :] image = self._image_helper.resize(image, new_shape) image = np.array(image, dtype=np.uint8) self._out_shape = image.shape return image
[docs] def valid_char(self, char, same_line=False): """Check if char is a valid character to include in image. Args: char (Character): The character to validate. same_line (bool): Consider whether char is in the same line as those already in the image example. Returns: bool: True for valid, False otherwise. """ valid = True if same_line and len(self._x_raw) > 0: if not ops.in_line(xmin_line=self._x_raw, xmax_line=self._char_xmax, ymin_line=min(self._y_raw), xmin_new=char.x, xmax_new=char.x + char.w, ymax_new=char.y + char.h): valid = False if char.image_id not in self.filepath: # not in same image valid = False return valid
[docs] def add_char(self, char): """Add a character to the image. Args: char (Character): The character to add. """ if self.valid_char(char): self._x_raw.append(char.x) self._y_raw.append(char.y) self._w_raw.append(char.w) self._h_raw.append(char.h) self._labels.append(char.label) self._blocks.append(char.block_id) self._ids.append(char.id) else: raise ValueError( "Invalid image id '{}'.".format(char.image_id), "Must be within {}.".format(self.filepath) )
[docs] def combine_with(self, images): """ Args: images (list of ImageMeta): """ raise NotImplementedError
def _load_image(self): """Loads the raw parent image.""" with tf.gfile.FastGFile(self.filepath, 'rb') as f: encoded_image = f.read() try: image = self._image_helper.decode_jpeg(encoded_image) except (tf.errors.InvalidArgumentError, AssertionError): print("Skipping file with invalid JPEG data: %s" % image_path) return return image @property def _char_xmax(self): """Maximum x-coordinate (column) of each character.""" return [self._x_raw[i] + self._w_raw[i] for i in range(self.num_chars)] @property def _char_ymax(self): """Maximum y-coordinate (row) of each character.""" return [self._y_raw[i] + self._h_raw[i] for i in range(self.num_chars)]
[docs] def new_shape(self, shape, ratio=False): """Resolves (and computes) input shape to a consistent type. Args: shape (tuple or None): New shape of image (height, width), with potentially inconsistent types. ratio (bool): Return new size as ratio of original size. Returns: int or float: Absolute or relative height int or float: Absolute or relative width """ height = self.height width = self.width if shape and any(shape): assert all([not a or isinstance(a, (int, float)) for a in shape]), "Invalid shape {}".format(shape) if isinstance(shape[0], int): height = shape[0] elif isinstance(shape[0], float): height = height * shape[0] if isinstance(shape[1], int): width = shape[1] elif isinstance(shape[1], float): width = width * shape[1] # Compute to maintain aspect ratio if not shape[0]: height = height * (width / self.width) if not shape[1]: width = width * (height / self.height) if ratio: height = height / self.height width = width / self.width else: height = int(height) width = int(width) return height, width