# Source code for carpedm.data.util
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
#
#
# Portions of this module are based on or taken from the TensorFlow
# models "im2text" data pipeline, so here is their license.
#
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data utilities.

This module provides utility methods/classes used by other data modules.

Todo:
    * Tests
    * ``generate_features``
    * Refactor ``generate_features``
    * Fix ``class_mask`` for overlapping characters.
"""
import itertools
import os
from re import match
import numpy as np
import tensorflow as tf
from carpedm.data import ops
def image_path(data_dir, bib_id, image_id):
    """Build the path of one image inside the data directory.

    The result has the form ``<data_dir>/<bib_id>/images/<image_id>.jpg``
    (joined with the platform's path separator).

    Args:
        data_dir (str): Path to top-level data directory.
        bib_id (str): Bibliography ID.
        image_id (str): Image ID (file name without the extension).

    Returns:
        str: Path to the image file.
    """
    filename = image_id + '.jpg'
    parts = (data_dir, bib_id, 'images', filename)
    return os.path.join(*parts)
class BBox(object):
    """Bounding box helper class.

    The four limits are available as named attributes and as the tuple
    ``box``, ordered ``(xmin, xmax, ymin, ymax)``. Indexing and ``len``
    are forwarded to that tuple.
    """

    def __init__(self, xmin, xmax, ymin, ymax):
        """Initializer.

        Args:
            xmin: Minimum x-coordinate of the box.
            xmax: Maximum x-coordinate of the box.
            ymin: Minimum y-coordinate of the box.
            ymax: Maximum y-coordinate of the box.
        """
        limits = (xmin, xmax, ymin, ymax)
        self.xmin, self.xmax, self.ymin, self.ymax = limits
        # Snapshot taken at construction; it is not refreshed if the
        # named attributes are reassigned later.
        self.box = limits

    def __getitem__(self, item):
        """Return element(s) ``item`` of the ``box`` tuple."""
        box = self.box
        return box[item]

    def __len__(self):
        """Return the number of entries in ``box``."""
        box = self.box
        return len(box)
class Character(object):
    """Helper class for storing a single character."""

    def __init__(self, label, image_id, x, y, block_id, char_id, w, h):
        """Initializer.

        Argument order matches csv format.

        Args:
            label (str): Unicode-like label for the character, of the
                form ``U+`` followed by 4 or 5 hexadecimal digits.
            image_id (str): Identifier (e.g. filepath) for
                image from which the character comes.
            x (str or int): X-coordinate (column) of character's top-
                left corner, relative to left (col[0]) of parent image.
            y (str or int): Y-coordinate (row) of character's top-left
                corner, relative to top (row[0]) of parent image.
            block_id (str): ID for the character's block, of the form
                ``B`` followed by 4 digits.
            char_id (str): Unique ID for character (token) in an image,
                of the form ``C`` followed by 4 digits.
            w (str or int): Width (in pixels).
            h (str or int): Height (in pixels).

        Raises:
            AssertionError: If ``label``, ``block_id`` or ``char_id``
                does not match its expected format.
        """
        # (pattern, value, failure message), checked in this order.
        checks = (
            (r'^U\+[0-9A-Fa-f]{4,5}$', label, "Invalid label %s"),
            (r'^B[0-9]{4}$', block_id, "Invalid block ID %s"),
            (r'^C[0-9]{4}$', char_id, "Invalid character ID %s"),
        )
        for pattern, value, message in checks:
            assert match(pattern, value), message % value

        self.label = label
        self.image_id = image_id
        self.block_id = block_id
        self.id = char_id
        # Geometry may arrive as csv strings; normalise to int.
        self.x, self.y = int(x), int(y)
        self.w, self.h = int(w), int(h)
class ImageTFOps(object):
    """Helper class for decoding and resizing images.

    The TensorFlow session, placeholders and ops are class attributes,
    so they are created once when the class body executes and are
    shared by every instance.
    """

    _sess = tf.Session()

    # JPEG decoding: encoded string in, 3-channel image out.
    _encoded_jpeg = tf.placeholder(dtype=tf.string)
    _decode_jpeg = tf.image.decode_jpeg(_encoded_jpeg, channels=3)

    # Resizing: uint8 image plus a 2-element target size in, uint8
    # image out (the resize result is cast back to uint8).
    _image_orig = tf.placeholder(dtype=tf.uint8, shape=(None, None, 3))
    _shape = tf.placeholder(dtype=tf.int32, shape=(2,))
    _image_resize = tf.cast(tf.image.resize_images(_image_orig, size=_shape),
                            tf.uint8)

    def decode_jpeg(self, encoded_jpeg):
        """Decode JPEG data into an image array.

        Args:
            encoded_jpeg: Encoded JPEG data fed to the string
                placeholder.

        Returns:
            The decoded image, asserted to be rank 3 with 3 channels.
        """
        feed = {self._encoded_jpeg: encoded_jpeg}
        decoded = self._sess.run(self._decode_jpeg, feed_dict=feed)
        assert len(decoded.shape) == 3
        assert decoded.shape[2] == 3
        return decoded

    def resize(self, image, shape):
        """Resize ``image`` to ``shape``.

        Args:
            image: Image fed to the ``(None, None, 3)`` uint8
                placeholder.
            shape: Two-element target size passed as ``size`` to
                ``tf.image.resize_images``.

        Returns:
            The resized image, cast to uint8.
        """
        feed = {self._image_orig: image, self._shape: shape}
        resized = self._sess.run(self._image_resize, feed_dict=feed)
        return resized
class ImageMeta(object):
    """Class for storing and manipulating image metadata.

    An instance refers to one parent image file and accumulates the
    characters (label, block ID, character ID and raw bounding box)
    added to it. Unless ``full_image`` is set, the image extent is the
    tight bounding box around the stored characters.
    """

    # Shared helper used for JPEG decoding and resizing.
    _image_helper = ImageTFOps()

    def __init__(self, filepath, full_image=False, first_char=None):
        """Initializer

        Args:
            filepath (str): Path to parent image.
            full_image (bool): Use full parent image.
            first_char (Character or None): First character.
        """
        self.filepath = filepath
        self._full = full_image
        self._labels = []
        self._blocks = []
        self._ids = []
        self._x_raw = []
        self._y_raw = []
        self._w_raw = []
        self._h_raw = []
        # Size of the raw parent image; filled lazily on first access.
        self._w_full = None
        self._h_full = None
        # Shape of the most recently loaded image; set by ``load_image``.
        self._out_shape = (None, None)
        if first_char:
            self._append_char(first_char)

    def _append_char(self, char):
        """Record ``char``'s geometry, label and IDs without validation."""
        self._x_raw.append(char.x)
        self._y_raw.append(char.y)
        self._w_raw.append(char.w)
        self._h_raw.append(char.h)
        self._labels.append(char.label)
        self._blocks.append(char.block_id)
        self._ids.append(char.id)

    def _cache_full_size(self):
        """Load the parent image and remember its height and width.

        Note:
            ``_load_image`` returns None for undecodable files, in which
            case reading ``.shape`` here raises ``AttributeError``.
        """
        im = self._load_image()
        self._h_full = im.shape[0]
        self._w_full = im.shape[1]

    @property
    def full_w(self):
        """Width (in pixels) of full raw parent image.

        The parent image is loaded on first access and the size cached.

        Returns:
            int: The return value.
        """
        if self._w_full is None:
            self._cache_full_size()
        return self._w_full

    @property
    def full_h(self):
        """Height (in pixels) of full raw parent image.

        The parent image is loaded on first access and the size cached.

        Returns:
            int: The return value.
        """
        if self._h_full is None:
            self._cache_full_size()
        return self._h_full

    @property
    def xmin(self):
        """Image's minimum x-coordinate (column) in raw parent image.

        Returns:
            int: 0 for a full image or when no characters are stored,
            otherwise the smallest character x-coordinate.
        """
        if self._full or not self._x_raw:
            return 0
        return min(self._x_raw)

    @property
    def xmax(self):
        """Image's maximum x-coordinate (column) in raw parent image.

        Returns:
            int: The full image width for a full image or when no
            characters are stored, otherwise the largest character
            right edge.
        """
        char_xmax = self._char_xmax
        if self._full or not char_xmax:
            return self.full_w
        return max(char_xmax)

    @property
    def ymin(self):
        """Image's minimum y-coordinate (row) in raw parent image.

        Returns:
            int: 0 for a full image or when no characters are stored,
            otherwise the smallest character y-coordinate.
        """
        if self._full or not self._y_raw:
            return 0
        return min(self._y_raw)

    @property
    def ymax(self):
        """Image's maximum y-coordinate (row) in raw parent image.

        Returns:
            int: The full image height for a full image or when no
            characters are stored, otherwise the largest character
            bottom edge.
        """
        char_ymax = self._char_ymax
        if self._full or not char_ymax:
            return self.full_h
        return max(char_ymax)

    @property
    def width(self):
        """Width (in pixels) in full parent image original scale.

        Returns:
            int: The return value.
        """
        return self.xmax - self.xmin

    @property
    def height(self):
        """Height (in pixels) in full parent image original scale.

        Returns:
            int: The return value.
        """
        return self.ymax - self.ymin

    @property
    def num_chars(self):
        """Number of characters in the image.

        Returns:
            int: The return value.
        """
        return len(self._labels)

    @property
    def char_labels(self):
        """Character labels

        Returns:
            :obj:`list` of :obj:`str`: The return value.
        """
        return self._labels

    @property
    def char_bboxes(self):
        """Bounding boxes for characters.

        Returned bounding boxes are relative to
        (:meth:`xmin`, :meth:`ymin`) and scaled by the ratio between
        the last loaded image shape and the original extent, so the
        coordinates are generally floats.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.BBox`:
                The return values.
        """
        scale_h, scale_w = self.new_shape(self._out_shape, ratio=True)
        # Hoisted: each of these properties scans the character lists.
        x0, y0 = self.xmin, self.ymin
        char_xmax, char_ymax = self._char_xmax, self._char_ymax
        return [
            BBox(xmin=(self._x_raw[i] - x0) * scale_w,
                 xmax=(char_xmax[i] - x0) * scale_w,
                 ymin=(self._y_raw[i] - y0) * scale_h,
                 ymax=(char_ymax[i] - y0) * scale_h)
            for i in range(self.num_chars)
        ]

    @property
    def line_bboxes(self):
        """Bounding boxes for lines in the image,

        Consecutive character boxes are merged into one line box for as
        long as ``ops.in_line`` accepts the next character.

        Note: Currently only meaningful when using full page image.

        Returns:
            :obj:`list` of :obj:`BBox`: The return values. Empty when
            the full image is not used or no characters are stored.
        """
        result = []
        if not self._full:
            return result
        bboxes = self.char_bboxes
        if not bboxes:
            # Nothing to group; avoids indexing into an empty list.
            return result
        first = bboxes[0]
        xmin, xmax = [first.xmin], [first.xmax]
        ymin, ymax = [first.ymin], [first.ymax]
        for b in bboxes[1:]:
            if ops.in_line(xmin_line=xmin,
                           xmax_line=xmax,
                           ymin_line=min(ymin),
                           xmin_new=b.xmin,
                           xmax_new=b.xmax,
                           ymax_new=b.ymax):
                xmin.append(b.xmin)
                xmax.append(b.xmax)
                ymin.append(b.ymin)
                ymax.append(b.ymax)
            else:
                # Close the current line and start a new one at ``b``.
                result.append(BBox(min(xmin), max(xmax),
                                   min(ymin), max(ymax)))
                xmin, xmax = [b.xmin], [b.xmax]
                ymin, ymax = [b.ymin], [b.ymax]
        # Add last line
        result.append(BBox(min(xmin), max(xmax), min(ymin), max(ymax)))
        return result

    def _box_mask(self, bboxes, values):
        """Paint each box with its value on a zero mask.

        Box coordinates are rounded to integers because scaled boxes
        are floats and NumPy rejects float slice indices. Later boxes
        overwrite earlier ones where they overlap.

        Args:
            bboxes (list of BBox): Boxes to paint.
            values (iterable): One fill value per box.

        Returns:
            :obj:`numpy.ndarray`: float32 mask of shape (height, width, 1)
        """
        # ``_out_shape`` is only set by ``load_image``.
        mask = np.zeros(self._out_shape[:2])
        for b, value in zip(bboxes, values):
            y0, y1 = int(round(b.ymin)), int(round(b.ymax))
            x0, x1 = int(round(b.xmin)), int(round(b.xmax))
            mask[y0:y1, x0:x1] = value
        mask = np.expand_dims(mask, 2)
        return mask.astype(dtype=np.float32)

    @property
    def char_mask(self):
        """Generate pseudo-pixel-level character mask.

        Pixels within character bounding boxes are assigned to positive
        class (1), others assigned negative class (0).

        Returns:
            :obj:`numpy.ndarray`: Character mask of shape (height, width, 1)
        """
        bboxes = self.char_bboxes
        return self._box_mask(bboxes, [1] * len(bboxes))

    @property
    def line_mask(self):
        """Generate pseudo-pixel-level line mask.

        Pixels within line bounding boxes are assigned to positive
        class (1), others assigned negative class (0).

        Returns:
            :obj:`numpy.ndarray`: Line mask of shape (height, width, 1)
        """
        bboxes = self.line_bboxes
        return self._box_mask(bboxes, [1] * len(bboxes))

    def class_mask(self, vocab):
        """Generate a character class image mask.

        Note:
            Where characters overlap, the last character added is
            arbitrarily the one that will be represented in the mask.
            This should be fixed in a future version.

        Args:
            vocab (Vocabulary): The vocabulary for converting to ID.

        Returns:
            :obj:`numpy.ndarray`: Class mask of shape (height, width, 1)
        """
        class_ids = (vocab.char_to_id(label) for label in self.char_labels)
        return self._box_mask(self.char_bboxes, class_ids)

    @staticmethod
    def _add_bbox_features(target, prefix, bboxes, height, width):
        """Store normalized bbox sequences under ``prefix``/{xmin,...}."""
        xmin, ymin, xmax, ymax = ops.seq_norm_bbox_values(
            bboxes, height=height, width=width)
        target[prefix + '/xmin'] = xmin
        target[prefix + '/ymin'] = ymin
        target[prefix + '/xmax'] = xmax
        target[prefix + '/ymax'] = ymax

    def generate_features(self,
                          image_shape,
                          vocab,
                          chunk,
                          character,
                          line,
                          label,
                          bbox):
        """Generate feature dictionaries for the image.

        Args:
            image_shape (tuple or None): Shape (height, width) to which
                images are resized, or the size of each chunk if
                chunks == True.
            vocab (Vocabulary or None): Vocabulary for converting
                characters to IDs. Required ``if character and label``.
            chunk (bool): Instead of using the original image, return
                a list of image chunks and corresponding features
                extracted from the original image on a regular grid.
                The original image is padded to divide evenly by chunk
                shape.
            character (bool): Include character info (ID, bbox).
            line (bool): Include line info (bbox) in features.
                Only used for full images.
            label (bool): Include label IDs in features.
            bbox (str or None): If not None, include bbox in features
                as unit (e.g. 'pixel', 'ratio' [of image]))

        Returns:
            :obj:`list` of :obj:`dict`: Feature dictionaries. A single
            dictionary unless ``chunk`` is set, otherwise one per chunk.
        """
        if character and label:
            assert vocab, "Must provide vocab."

        shape = self.new_shape(image_shape)
        # When chunking, the image keeps its original size and ``shape``
        # is the size of each chunk instead.
        full_shape = None if chunk else shape
        im = self.load_image(full_shape)
        h, w, channels = im.shape
        features = {
            'image/data': im,
            'image/height': h,
            'image/width': w,
            'image/channels': channels,
        }

        use_lines = line and self._full
        # Raw box lists are kept in locals: the chunk branch needs them
        # and they are never stored in ``features``. They must be
        # computed after ``load_image`` so the scale is up to date.
        char_bboxes = (self.char_bboxes
                       if character and (bbox or chunk) else None)
        line_bboxes = (self.line_bboxes
                       if use_lines and (bbox or chunk) else None)

        if character:
            features['image/char/count'] = self.num_chars
            if label:
                features['image/seq/char/id'] = [
                    vocab.char_to_id(lab) for lab in self.char_labels]
            if bbox:
                self._add_bbox_features(
                    features, 'image/seq/char/bbox', char_bboxes, h, w)
        if use_lines and bbox:
            features['image/line/count'] = len(line_bboxes)
            self._add_bbox_features(
                features, 'image/seq/line/bbox', line_bboxes, h, w)

        if not chunk:
            return [features]

        chunk_h, chunk_w = shape
        # Pad only by what is missing to reach the next multiple; an
        # already divisible dimension gets no padding.
        pad_h = (-h) % chunk_h
        pad_w = (-w) % chunk_w
        padding = ((0, pad_h), (0, pad_w), (0, 0))
        img = np.pad(im, pad_width=padding, mode='mean')
        ys = np.arange(0, img.shape[0], chunk_h)
        xs = np.arange(0, img.shape[1], chunk_w)

        result = []
        # top left of each block
        for y1, x1 in itertools.product(ys, xs):
            y2, x2 = y1 + chunk_h, x1 + chunk_w
            region = {
                'image/data': img[y1:y2, x1:x2, :],
                'image/height': chunk_h,
                'image/width': chunk_w,
                'image/channels': img.shape[2],
            }
            # NOTE(review): boxes are handed on in full-image
            # coordinates but normalized by the chunk size; confirm
            # whether they should be shifted by (x1, y1) first.
            if character:
                char_ixs = ops.ixs_in_region(char_bboxes, y1, y2, x1, x2)
                if label:
                    region['image/seq/char/id'] = [
                        vocab.char_to_id(self.char_labels[i])
                        for i in char_ixs]
                region['image/char/count'] = len(char_ixs)
                if bbox:
                    self._add_bbox_features(
                        region, 'image/seq/char/bbox',
                        [char_bboxes[i] for i in char_ixs],
                        chunk_h, chunk_w)
            if use_lines:
                line_ixs = ops.ixs_in_region(line_bboxes, y1, y2, x1, x2)
                if bbox:
                    # Stored under the line keys so the character boxes
                    # written above are not overwritten.
                    self._add_bbox_features(
                        region, 'image/seq/line/bbox',
                        [line_bboxes[i] for i in line_ixs],
                        chunk_h, chunk_w)
            result.append(region)
        return result

    def load_image(self, shape):
        """Load image and resize to shape.

        The parent image is cropped to this image's extent before
        resizing, and the resulting shape is remembered for later bbox
        and mask computations.

        If ``shape`` is None or (None, None), original size is
        maintained.

        Args:
            shape (tuple or None): Output dimensions (height, width).

        Returns:
            :obj:`numpy.ndarray`: Resized image.
        """
        if shape:
            assert len(shape) == 2
        new_shape = self.new_shape(shape)
        image = self._load_image()
        image = image[self.ymin:self.ymax, self.xmin:self.xmax, :]
        image = self._image_helper.resize(image, new_shape)
        image = np.array(image, dtype=np.uint8)
        self._out_shape = image.shape
        return image

    def valid_char(self, char, same_line=False):
        """Check if char is a valid character to include in image.

        A character is valid when its ``image_id`` occurs in this
        image's file path and, if ``same_line`` is set and characters
        are already stored, ``ops.in_line`` accepts it.

        Args:
            char (Character): The character to validate.
            same_line (bool): Consider whether char is in the same line
                as those already in the image example.

        Returns:
            bool: True for valid, False otherwise.
        """
        valid = True
        if same_line and self._x_raw:
            if not ops.in_line(xmin_line=self._x_raw,
                               xmax_line=self._char_xmax,
                               ymin_line=min(self._y_raw),
                               xmin_new=char.x,
                               xmax_new=char.x + char.w,
                               ymax_new=char.y + char.h):
                valid = False
        if char.image_id not in self.filepath:
            # not in same image
            valid = False
        return valid

    def add_char(self, char):
        """Add a character to the image.

        Args:
            char (Character): The character to add.

        Raises:
            ValueError: If ``char`` does not belong to this image.
        """
        if not self.valid_char(char):
            # One formatted string: passing two strings to ValueError
            # would render the message as a tuple.
            raise ValueError(
                "Invalid image id '{}'. Must be within {}.".format(
                    char.image_id, self.filepath)
            )
        self._append_char(char)

    def combine_with(self, images):
        """Combine with other images (not implemented).

        Args:
            images (list of ImageMeta):

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    def _load_image(self):
        """Loads the raw parent image.

        Returns:
            The decoded image, or None (after printing a message) when
            the file does not contain valid JPEG data.
        """
        with tf.gfile.FastGFile(self.filepath, 'rb') as f:
            encoded_image = f.read()
        try:
            image = self._image_helper.decode_jpeg(encoded_image)
        except (tf.errors.InvalidArgumentError, AssertionError):
            print("Skipping file with invalid JPEG data: %s" % self.filepath)
            return None
        return image

    @property
    def _char_xmax(self):
        """Maximum x-coordinate (column) of each character."""
        return [x + w for x, w in zip(self._x_raw, self._w_raw)]

    @property
    def _char_ymax(self):
        """Maximum y-coordinate (row) of each character."""
        return [y + h for y, h in zip(self._y_raw, self._h_raw)]

    def new_shape(self, shape, ratio=False):
        """Resolves (and computes) input shape to a consistent type.

        For each of the first two entries of ``shape``: an int is an
        absolute size, a float is a factor applied to the original
        size, and a falsy entry is derived from the other dimension so
        the aspect ratio is maintained. If ``shape`` is None or has no
        truthy entry, the original size is used.

        Args:
            shape (tuple or None): New shape of image (height, width),
                with potentially inconsistent types.
            ratio (bool): Return new size as ratio of original size.

        Returns:
            int or float: Absolute or relative height
            int or float: Absolute or relative width
        """
        orig_h = self.height
        orig_w = self.width
        height, width = orig_h, orig_w
        if shape and any(shape):
            assert all(not a or isinstance(a, (int, float))
                       for a in shape), "Invalid shape {}".format(shape)
            if isinstance(shape[0], int):
                height = shape[0]
            elif isinstance(shape[0], float):
                height = height * shape[0]
            if isinstance(shape[1], int):
                width = shape[1]
            elif isinstance(shape[1], float):
                width = width * shape[1]
            # Compute to maintain aspect ratio. ``float`` keeps these
            # true divisions even where ``/`` truncates integers.
            if not shape[0]:
                height = height * (width / float(orig_w))
            if not shape[1]:
                width = width * (height / float(orig_h))
        if ratio:
            height = height / float(orig_h)
            width = width / float(orig_w)
        else:
            height = int(height)
            width = int(width)
        return height, width