Examples

Single Character Task

Below is an example Task definition for a single-character recognition task (plus a three-character sequence variant), along with the corresponding import in __init__.py that makes the tasks accessible through the registry.

For more details on Task definition and default properties, please refer to the Tasks documentation.

ocr.py

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.


"""Optical character recognition tasks.

TODO:
    * Modularize common loss functions, select by id
    * Modularize common regularization options, select by id
"""
import abc

import tensorflow as tf

from carpedm.data.lang import JapaneseUnicodes
from carpedm.tasks.generic import Task
from carpedm.util import registry
from carpedm.util.eval import confusion_matrix_metric


class OCRTask(Task):
    """Abstract class for OCR Tasks."""

    def __init__(self, **kwargs):
        super(OCRTask, self).__init__(**kwargs)

    @property
    def target(self):
        return 'image/seq/char/id'

    @property
    def blocks(self):
        return False

    @property
    def character(self):
        return True

    @property
    def line(self):
        return False

    @property
    def label(self):
        return True

    @property
    def bbox(self):
        return False

    @property
    @abc.abstractmethod
    def sparse_labels(self):
        return False

    def regularization(self, hparams):
        raise NotImplementedError

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        raise NotImplementedError

    def loss_fn(self, features, model_output, targets, is_training):
        raise NotImplementedError


@registry.register_task
class OCRSingleKana(OCRTask):
    """Single character recognition tasks."""

    @property
    def image_scope(self):
        return 'char'

    @property
    def character_set(self):
        return JapaneseUnicodes('kana')

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        tensors_to_log = {'loss': loss}

        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat(tower_preds, axis=0)
        predictions = {
            'classes': tf.argmax(all_logits, axis=1),
            'probabilities': tf.nn.softmax(all_logits)
        }

        stacked_labels = tf.squeeze(tf.concat(tower_targets, axis=0))

        accuracy = tf.metrics.accuracy(stacked_labels, predictions['classes'])
        metrics = {
            'accuracy': accuracy,
            'confusion': confusion_matrix_metric(
                stacked_labels, predictions['classes'], self.num_classes)
        }

        return tensors_to_log, predictions, metrics

    def loss_fn(self, features, model_output, targets, is_training):
        with tf.name_scope('batch_xentropy'):
            loss = tf.losses.sparse_softmax_cross_entropy(
                logits=model_output, labels=targets)
        return loss

    def regularization(self, hparams):
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            hparams.weight_decay,
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            name='weight_loss')
        return weight_loss

    @property
    def sparse_labels(self):
        return False


@registry.register_task
class OCRSeqKana3(OCRTask):
    """Recognition task for sequences of three kana characters."""

    def __init__(self, beam_width=100, **kwargs):
        self._beam_width = beam_width
        super(OCRSeqKana3, self).__init__(**kwargs)

    @property
    def character_set(self):
        return JapaneseUnicodes('kana')

    @property
    def image_scope(self):
        return 'seq'

    @property
    def sequence_length(self):
        return 3

    @property
    def sparse_labels(self):
        return True

    @property
    def target(self):
        return 'image/seq/char/id_sparse'

    def loss_fn(self, features, model_output, targets, is_training):
        return tf.nn.ctc_loss(labels=targets,
                              inputs=model_output['logits'],
                              sequence_length=model_output['seq_len'],
                              time_major=False)

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):

        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat([p['logits'] for p in tower_preds], axis=0)
        seq_lens = tf.concat([p['seq_len'] for p in tower_preds], axis=0)

        # TODO: fix when seqs are different lengths from multiple GPUs
        all_labels = tf.sparse_concat(0, tower_targets)
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            inputs=tf.transpose(all_logits, [1, 0, 2]),
            sequence_length=seq_lens,
            beam_width=self._beam_width)
        decoded = decoded[0]  # best path

        edit_distance = tf.edit_distance(decoded, tf.to_int64(all_labels),
                                         normalize=False)

        # Label error rate: total edit distance over the number of
        # ground-truth labels.
        num_labels = tf.cast(tf.size(all_labels), tf.float32)
        ler = tf.reduce_sum(edit_distance) / num_labels
        # Sequence error rate: fraction of sequences with any error.
        num_seqs = tf.cast(tf.size(edit_distance), tf.float32)
        num_wrong_seqs = tf.cast(tf.count_nonzero(edit_distance), tf.float32)
        ser = num_wrong_seqs / num_seqs

        metrics = {
            'ler': tf.metrics.mean(ler),
            'ser': tf.metrics.mean(ser)
        }

        tensors_to_log = {'loss': loss, 'ler': ler, 'ser': ser}

        # Map label ids back to characters for human-readable summaries.
        mapping_string = tf.constant(self._meta.vocab.types())
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            mapping_string, default_value='NULL')
        decoding = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(decoded)))
        gt = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(all_labels)))

        tf.summary.text('decoded', decoding)
        tf.summary.text('gt', gt)

        predictions = {
            # Logits are [batch, time, classes]; reduce over the class axis.
            'classes': tf.argmax(input=all_logits, axis=-1),
            'probabilities': tf.nn.softmax(all_logits),
            'decoded': decoding,
        }

        return tensors_to_log, predictions, metrics

    def regularization(self, hparams):
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            hparams.weight_decay,
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            name='weight_loss')
        return weight_loss
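
Note that OCRSeqKana3 pairs sparse labels (target 'image/seq/char/id_sparse') with tf.nn.ctc_loss, which here expects batch-major logits (time_major=False), an int32 sequence length per example, and labels as a SparseTensor. The following standalone shape sketch is independent of the package, with sizes invented purely for illustration:

import tensorflow as tf

batch, time, num_classes = 2, 10, 48  # e.g. 47 characters + 1 CTC blank
logits = tf.random_normal([batch, time, num_classes])  # batch-major
seq_len = tf.fill([batch], time)  # full time dimension for every example
# Two labels in the first sequence, one in the second.
labels = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                         values=[3, 7, 5],
                         dense_shape=[batch, 3])
# Returns one loss value per example, shape [batch].
loss = tf.nn.ctc_loss(labels=labels, inputs=logits,
                      sequence_length=seq_len, time_major=False)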

tasks.__init__.py

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

from carpedm.tasks import generic

# Defined tasks. Imports here force registration.
from carpedm.tasks.ocr import OCRSingleKana

Baseline Model

baseline.py

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

"""Baseline models."""

import tensorflow as tf

from carpedm.models.generic import TFModel
from carpedm import nn
from carpedm.util import registry


@registry.register_model
class SingleCharBaseline(TFModel):
    """A simple baseline CNN model."""

    def __init__(self, num_classes, *args, **kwargs):
        """Initializer.

        Overrides TFModel.

        Args:
            num_classes: Number of possible character classes.
            *args: Unused arguments.
            **kwargs: Unused arguments.

        """
        self._num_classes = num_classes
        self._cnn = nn.conv.CNN()

    @property
    def name(self):
        return "Baseline_" + self._cnn.name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        x = features['image/data']
        x = self._cnn.forward_pass(
            x, data_format, axes_order, is_training, False, reuse)
        x = tf.layers.flatten(x)
        tf.logging.info('image after flatten: %s', x.get_shape())

        x = tf.layers.dense(
            inputs=x, units=200, activation=tf.nn.relu, name='dense1')
        nn.util.activation_summary(x)
        x = tf.layers.dense(
            inputs=x, units=200, activation=tf.nn.relu, name='dense2')
        nn.util.activation_summary(x)
        logits = tf.layers.dense(
            inputs=x, units=self._num_classes, name='logits')
        return logits


@registry.register_model
class SequenceBaseline(TFModel):
    """A simple baseline CNN-LSTM model."""

    def __init__(self, num_classes, lstm_layers=2, lstm_units=100,
                 feature_extractor=nn.conv.CNN(), *args, **kwargs):
        """Initializer.

        Overrides TFModel.

        Args:
            num_classes (int): Number of possible character classes.
            lstm_layers (int): Number of LSTM layers.
            lstm_units (int): Number of units per LSTM cell.
            feature_extractor: Feature extraction network applied
                before the recurrent layers.
            *args: Unused arguments.
            **kwargs: Unused arguments.
        """
        self._num_classes = num_classes + 1  # Add CTC null label.
        self._layers = lstm_layers
        self._units = lstm_units
        self._feature_extractor = feature_extractor

    @property
    def name(self):
        return 'Baseline_seq_' + self._feature_extractor.name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        x = self._feature_extractor.forward_pass(
            features['image/data'], data_format, axes_order,
            is_training, False, reuse)
        if axes_order == [0, 3, 1, 2]:
            # Restore channels-last ordering before reshaping.
            x = tf.transpose(x, [0, 2, 3, 1])
        # Collapse the feature maps into a sequence: [batch, steps, features].
        x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
        x = nn.rnn.bi_lstm(x, n_layers=self._layers, n_units=self._units)
        # One sequence-length entry (the full time dimension) per batch
        # element, as required by the CTC loss.
        seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
                          [tf.to_int32(tf.shape(x)[0])])
        logits = tf.layers.dense(inputs=x, units=self._num_classes)

        return {'logits': logits, 'seq_len': seq_len}

    def initialize_pretrained(self, pretrained_dir):
        # Map convolutional variable scopes from a pre-trained
        # SingleCharBaseline checkpoint onto this model's scope so the
        # CNN front end can be warm-started.
        submodel = 'Baseline_' + self._feature_extractor.name

        variable_mapping = dict()

        for i in range(5):
            variable_mapping[submodel + '/conv{}/'.format(i)] \
                = self.name + '/conv{}/'.format(i)

        return variable_mapping
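
For the default feature extractor, the returned mapping simply renames the five convolutional scopes of the pre-trained single-character model into this model's scope. Assuming nn.conv.CNN names itself 'CNN', the dictionary would look like:

{'Baseline_CNN/conv0/': 'Baseline_seq_CNN/conv0/',
 'Baseline_CNN/conv1/': 'Baseline_seq_CNN/conv1/',
 'Baseline_CNN/conv2/': 'Baseline_seq_CNN/conv2/',
 'Baseline_CNN/conv3/': 'Baseline_seq_CNN/conv3/',
 'Baseline_CNN/conv4/': 'Baseline_seq_CNN/conv4/'}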

models.__init__.py

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

from carpedm.models import generic

# Defined models. Imports here force registration.
from carpedm.models.baseline import SingleCharBaseline

Using Tasks and Models

Below is a minimal main.py example for getting started with training a model through the Task interface. For an in-depth walkthrough, please refer to the Training a Model guide.

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

"""Minimal main module.

If this file is changed, please also change the ``:lines:`` option in
the following files where this code is referenced with the
``literalinclude`` directive.

    * ../guides/usage.rst

"""
import os
import re

import tensorflow as tf

import carpedm as dm
from carpedm.util import registry


tf.logging.set_verbosity(tf.logging.INFO)

# Task definition
args = {'data_dir': dm.data.sample,
        'task_dir': '/tmp/carpedm_tasks',
        'shape_store': None,
        'shape_in': (64, 64)}
task = registry.task('ocr_single_kana')(**args)

# Training Hyperparameters
num_epochs = 30
training_hparams = {'train_batch_size': 32,
                    'eval_batch_size': 1,
                    'data_format': 'channels_last',
                    'optimizer': 'sgd',
                    'learning_rate': 1e-3,
                    'momentum': 0.96,
                    'weight_decay': 2e-4,
                    'gradient_clipping': None,
                    'lr_decay_steps': None,
                    'init_dir': None,  # for pre-trained models
                    'sync': False}

# Model hyperparameters and definition
model_hparams = {}
model = registry.model('single_char_baseline')(num_classes=task.num_classes,
                                               **model_hparams)

# Unique job_id
experiment_id = 'example'
# e.g. shape_in (64, 64) -> '64_64'
shape = re.sub(r'([,])', '_', re.sub(r'([() ])', '', str(args['shape_in'])))
job_id = os.path.join(experiment_id, shape, model.name)
task.job_id = job_id  # Used to check for first model initialization.
job_dir = os.path.join(task.task_log_dir, job_id)

# TensorFlow Configuration
sess_config = tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False,
    intra_op_parallelism_threads=0,
    gpu_options=tf.GPUOptions(force_gpu_compatible=True))
config = tf.estimator.RunConfig(session_config=sess_config,
                                model_dir=job_dir,
                                save_summary_steps=10)
hparams = tf.contrib.training.HParams(is_chief=config.is_chief,
                                      **training_hparams)

# Input and model functions
train_input_fn = task.input_fn(hparams.train_batch_size,
                               subset='train',
                               num_shards=1,
                               overwrite=False)
eval_input_fn = task.input_fn(hparams.eval_batch_size,
                              subset='dev',
                              num_shards=1,
                              overwrite=False)
model_fn = task.model_fn(model, num_gpus=0, variable_strategy='CPU',
                         num_workers=config.num_worker_replicas or 1)

# Number of training steps
train_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'train')
eval_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'dev')

if eval_examples % hparams.eval_batch_size != 0:
    raise ValueError(('validation set size (%d) must be multiple of '
                      'eval_batch_size (%d)') % (eval_examples,
                                                 hparams.eval_batch_size))

eval_steps = eval_examples // hparams.eval_batch_size
# At least one training step per epoch, even if the training set is
# smaller than a batch.
train_steps = num_epochs * ((train_examples // hparams.train_batch_size) or 1)

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=train_steps)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)

# Estimator definition and training
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config, params=hparams)
tf.estimator.train_and_evaluate(estimator, train_spec=train_spec, eval_spec=eval_spec)
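
Checkpoints and summaries are written under job_dir (set via RunConfig(model_dir=job_dir)), so re-running the script resumes from the latest checkpoint, and training can be monitored by pointing TensorBoard at that directory.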