
Single Character Task

Below is an example Task definition for a single character recognition task and the corresponding import in __init__.py for accessing the task through the registry.

For more details on Task definition and default properties, please refer to the Tasks documentation.


# Copyright (C) 2018 Neal Digre.
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

"""Optical character recognition tasks.

TODO:
    * Modularize common loss functions, select by id
    * Modularize common regularization options, select by id
"""
import abc

import tensorflow as tf

from carpedm.data.lang import JapaneseUnicodes
from carpedm.tasks.generic import Task
from carpedm.util import registry
from carpedm.util.eval import confusion_matrix_metric

class OCRTask(Task):
    """Abstract class for OCR Tasks.

    Supplies default values for the task-configuration accessors and
    declares the three hooks that concrete OCR tasks override:
    ``regularization``, ``results`` and ``loss_fn``.

    NOTE(review): no decorators are visible on these methods. If ``Task``
    reads the accessors as attributes they need ``@property`` -- confirm
    against ``carpedm.tasks.generic.Task`` before relying on this class.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to ``Task.__init__``."""
        super(OCRTask, self).__init__(**kwargs)

    # Configuration accessors. Their exact meaning is defined by ``Task``
    # (not visible here); subclasses override the ones that differ.

    def target(self):
        """Return the default target identifier string."""
        return 'image/seq/char/id'

    def blocks(self):
        """Default: ``False``."""
        return False

    def character(self):
        """Default: ``True``."""
        return True

    def line(self):
        """Default: ``False``."""
        return False

    def label(self):
        """Default: ``True``."""
        return True

    def bbox(self):
        """Default: ``False``."""
        return False

    def sparse_labels(self):
        """Default: ``False``."""
        return False

    # Hooks that every concrete task must implement.

    def regularization(self, hparams):
        """Return the regularization term for the given hyperparameters.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Return ``(tensors_to_log, predictions, metrics)``.

        NOTE(review): the last parameter was missing from the signature
        as found; ``is_training`` mirrors ``loss_fn`` -- confirm against
        the caller in ``Task``.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

    def loss_fn(self, features, model_output, targets, is_training):
        """Return the training loss for one batch of model output.

        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError

class OCRSingleKana(OCRTask):
    """Single character recognition tasks.

    NOTE(review): ``registry.task('ocr_single_kana')`` is used to look
    this class up, but no registration decorator is visible here --
    confirm the class is registered with ``carpedm.util.registry``.
    """

    def image_scope(self):
        """Return ``'char'``."""
        return 'char'

    def character_set(self):
        """Return the kana character set."""
        return JapaneseUnicodes('kana')

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Collect logging tensors, predictions and evaluation metrics.

        Args:
            loss: Loss tensor; logged under the key ``'loss'``.
            tower_features: Per-tower feature dicts. Only the first
                tower's ``'image/data'`` entry is used, for an image
                summary.
            tower_preds: Per-tower logits, concatenated along axis 0.
            tower_targets: Per-tower labels, concatenated along axis 0
                and squeezed.
            is_training: Unused here.
                NOTE(review): parameter name restored from ``loss_fn``;
                confirm against the caller.

        Returns:
            Tuple ``(tensors_to_log, predictions, metrics)`` of dicts.
        """
        tensors_to_log = {'loss': loss}

        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat(list(tower_preds), axis=0)
        predictions = {
            'classes': tf.argmax(all_logits, axis=1),
            'probabilities': tf.nn.softmax(all_logits),
        }

        stacked_labels = tf.squeeze(tf.concat(tower_targets, axis=0))

        accuracy = tf.metrics.accuracy(stacked_labels, predictions['classes'])
        metrics = {
            'accuracy': accuracy,
            'confusion': confusion_matrix_metric(
                stacked_labels, predictions['classes'], self.num_classes),
        }

        return tensors_to_log, predictions, metrics

    def loss_fn(self, features, model_output, targets, is_training):
        """Return sparse softmax cross-entropy of logits against labels.

        Args:
            features: Unused here.
            model_output: Logits tensor.
            targets: Integer class labels.
            is_training: Unused here.
        """
        with tf.name_scope('batch_xentropy'):
            loss = tf.losses.sparse_softmax_cross_entropy(
                logits=model_output, labels=targets)
        return loss

    def regularization(self, hparams):
        """Return the L2 penalty summed over all trainable variables.

        Args:
            hparams: Hyperparameters; ``weight_decay`` scales the penalty.
                NOTE(review): the scale factor was missing from the call
                as found; ``hparams.weight_decay`` is assumed -- confirm.
        """
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            hparams.weight_decay)
        return weight_loss

    def sparse_labels(self):
        """Return ``False``; same value as the ``OCRTask`` default."""
        return False

class OCRSeqKana3(OCRTask):
    """Recognition task for kana sequences of length three.

    Uses sparse targets, a CTC loss, and CTC beam-search decoding when
    computing results.
    """

    def __init__(self, beam_width=100, **kwargs):
        """Initializer.

        Args:
            beam_width (int): Beam width used for decoding in ``results``.
            **kwargs: Forwarded unchanged to ``OCRTask.__init__``.
        """
        self._beam_width = beam_width
        super(OCRSeqKana3, self).__init__(**kwargs)

    def character_set(self):
        """Return the kana character set."""
        return JapaneseUnicodes('kana')

    def image_scope(self):
        """Return ``'seq'``."""
        return 'seq'

    def sequence_length(self):
        """Return ``3``."""
        return 3

    def sparse_labels(self):
        """Return ``True``."""
        return True

    def target(self):
        """Return the sparse target identifier string."""
        return 'image/seq/char/id_sparse'

    def loss_fn(self, features, model_output, targets, is_training):
        """Return the CTC loss of the model output against sparse labels.

        Args:
            features: Unused here.
            model_output: Dict with ``'logits'`` and ``'seq_len'`` entries.
            targets: Sparse label tensor.
            is_training: Unused here.

        NOTE(review): every argument after ``labels`` was missing from
        the call as found. They are restored from the keys ``results``
        reads off the model output -- confirm.
        """
        # ``results`` transposes the logits to time-major before beam
        # search, so they are batch-major here.
        return tf.nn.ctc_loss(labels=targets,
                              inputs=model_output['logits'],
                              sequence_length=model_output['seq_len'],
                              time_major=False)

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Decode predictions and compute label/sequence error rates.

        Args:
            loss: Loss tensor; logged under the key ``'loss'``.
            tower_features: Per-tower feature dicts. Only the first
                tower's ``'image/data'`` entry is used, for an image
                summary.
            tower_preds: Per-tower dicts with ``'logits'`` and
                ``'seq_len'`` entries.
            tower_targets: Per-tower sparse label tensors.
            is_training: Unused here.
                NOTE(review): parameter name restored from ``loss_fn``;
                confirm against the caller.

        Returns:
            Tuple ``(tensors_to_log, predictions, metrics)`` of dicts.
        """
        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat([p['logits'] for p in tower_preds], axis=0)
        seq_lens = tf.concat([p['seq_len'] for p in tower_preds], axis=0)

        # TODO: fix when seqs are different lengths from multiple GPUs
        all_labels = tf.sparse_concat(0, list(tower_targets))
        # Beam search expects time-major input, hence the transpose.
        # NOTE(review): ``sequence_length`` and ``beam_width`` were
        # missing from the call as found; restored from the otherwise
        # unused ``seq_lens`` and ``self._beam_width`` -- confirm.
        decoded, _ = tf.nn.ctc_beam_search_decoder(
            inputs=tf.transpose(all_logits, [1, 0, 2]),
            sequence_length=seq_lens,
            beam_width=self._beam_width)
        decoded = decoded[0]  # best path

        # Raw (unnormalized) distances, because the rates below divide
        # by explicit totals.
        # NOTE(review): ``normalize=False`` is restored -- confirm.
        edit_distance = tf.edit_distance(decoded, tf.to_int64(all_labels),
                                         normalize=False)

        # Label error rate: total edits over the size of the label tensor.
        label_count = tf.cast(tf.size(all_labels), tf.float32)
        ler = tf.reduce_sum(edit_distance) / label_count
        # Sequence error rate: fraction of sequences with any edit.
        seq_count = tf.cast(tf.size(edit_distance), tf.float32)
        num_wrong_seqs = tf.cast(tf.count_nonzero(edit_distance), tf.float32)
        ser = num_wrong_seqs / seq_count

        metrics = {
            'ler': tf.metrics.mean(ler),
            'ser': tf.metrics.mean(ser),
        }

        tensors_to_log = {'loss': loss, 'ler': ler, 'ser': ser}

        # Map class ids back to strings for the text summaries.
        mapping_string = tf.constant(self._meta.vocab.types())
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            mapping_string, default_value='NULL')
        decoding = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(decoded)))
        gt = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(all_labels)))

        tf.summary.text('decoded', decoding)
        tf.summary.text('gt', gt)

        predictions = {
            # The logits are rank 3 and batch-major (see the transpose
            # above), so axis 1 is time; take the argmax over classes,
            # the same axis ``tf.nn.softmax`` uses by default.
            'classes': tf.argmax(input=all_logits, axis=-1),
            'probabilities': tf.nn.softmax(all_logits),
            'decoded': decoding,
        }

        return tensors_to_log, predictions, metrics

    def regularization(self, hparams):
        """Return the L2 penalty summed over all trainable variables.

        Args:
            hparams: Hyperparameters; ``weight_decay`` scales the penalty.
                NOTE(review): the scale factor was missing from the call
                as found; ``hparams.weight_decay`` is assumed -- confirm.
        """
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            hparams.weight_decay)
        return weight_loss


from carpedm.tasks import generic

# Defined tasks. Imports here force registration.
from carpedm.tasks.ocr import OCRSingleKana

Baseline Model


"""Baseline models."""

import tensorflow as tf

from carpedm.models.generic import TFModel
from carpedm import nn
from carpedm.util import registry

class SingleCharBaseline(TFModel):
    """A simple baseline CNN model.

    NOTE(review): ``registry.model('single_char_baseline')`` is used to
    look this class up, but no registration decorator is visible here --
    confirm the class is registered with ``carpedm.util.registry``.
    """

    def __init__(self, num_classes, *args, **kwargs):
        """Initializer.

        Overrides TFModel.

        Args:
            num_classes: Number of possible character classes.
            *args: Unused arguments.
            **kwargs: Unused arguments.
        """
        self._num_classes = num_classes
        self._cnn = nn.conv.CNN()

    @property
    def name(self):
        """Model name: ``"Baseline_"`` plus the CNN's name.

        Exposed as a property because callers use ``model.name`` as a
        string (for example as a path component).
        """
        return "Baseline_" + self._cnn.name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        """Compute class logits from the input image.

        The image goes through the CNN, is flattened, and then passes
        through two 200-unit ReLU dense layers and a linear layer with
        ``num_classes`` outputs.

        Args:
            features: Dict holding the input under ``'image/data'``.
            data_format: Passed through to the CNN.
            axes_order: Passed through to the CNN.
            is_training: Passed through to the CNN.
            reuse: Passed through to the CNN.

        Returns:
            The logits tensor.
        """
        x = features['image/data']
        # The positional ``False`` is an option of ``CNN.forward_pass``
        # whose meaning is not visible here -- TODO confirm.
        x = self._cnn.forward_pass(
            x, data_format, axes_order, is_training, False, reuse)
        x = tf.layers.flatten(x)
        tf.logging.info('image after flatten: %s', x.get_shape())

        x = tf.layers.dense(
            inputs=x, units=200, activation=tf.nn.relu, name='dense1')
        x = tf.layers.dense(
            inputs=x, units=200, activation=tf.nn.relu, name='dense2')
        logits = tf.layers.dense(
            inputs=x, units=self._num_classes, name='logits')
        return logits

class SequenceBaseline(TFModel):
    """A simple baseline CNN-LSTM model."""

    def __init__(self, num_classes, lstm_layers=2, lstm_units=100,
                 feature_extractor=None, *args, **kwargs):
        """Initializer.

        Overrides TFModel.

        Args:
            num_classes (int): Number of possible character classes.
            lstm_layers (int): Number of LSTM layers.
            lstm_units (int): Number of units in LSTM cell
            feature_extractor: Convolutional feature extractor. Defaults
                to a new ``nn.conv.CNN()`` per model, so that instances
                do not share one object created at definition time.
            *args: Unused arguments.
            **kwargs: Unused arguments.
        """
        self._num_classes = num_classes + 1  # Add CTC null label.
        self._layers = lstm_layers
        self._units = lstm_units
        if feature_extractor is None:
            feature_extractor = nn.conv.CNN()
        self._feature_extractor = feature_extractor

    @property
    def name(self):
        """Model name: ``'Baseline_seq_'`` plus the extractor's name.

        Exposed as a property because ``initialize_pretrained``
        concatenates ``self.name`` with other strings.
        """
        return 'Baseline_seq_' + self._feature_extractor.name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        """Compute per-step logits and sequence lengths.

        Args:
            features: Dict holding the input under ``'image/data'``.
            data_format: Passed through to the feature extractor.
            axes_order: Passed through to the feature extractor. When it
                equals ``[0, 3, 1, 2]`` the extracted features are
                transposed with ``[0, 2, 3, 1]`` before reshaping.
            is_training: Passed through to the feature extractor.
            reuse: Passed through to the feature extractor.

        Returns:
            Dict with ``'logits'`` and ``'seq_len'`` entries.
        """
        x = self._feature_extractor.forward_pass(
            features['image/data'], data_format, axes_order,
            is_training, False, reuse)
        if axes_order == [0, 3, 1, 2]:
            x = tf.transpose(x, [0, 2, 3, 1])
        # Merge the last two axes so each step along axis 1 is a vector.
        x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
        x = nn.rnn.bi_lstm(x, n_layers=self._layers, n_units=self._units)

        # Every example gets the same length: the size of axis 1,
        # repeated once per batch element.
        # NOTE(review): the ``multiples`` argument of ``tf.tile`` was
        # missing as found; the batch size is assumed -- confirm.
        dyn_shape = tf.shape(x)
        n_steps = tf.to_int32(dyn_shape[1])
        batch_size = tf.to_int32(dyn_shape[0])
        seq_len = tf.tile(tf.expand_dims(n_steps, 0), [batch_size])
        logits = tf.layers.dense(inputs=x, units=self._num_classes)

        return {'logits': logits, 'seq_len': seq_len}

    def initialize_pretrained(self, pretrained_dir):
        """Map pretrained conv-layer scopes onto this model's scopes.

        Args:
            pretrained_dir: Not used by this implementation.

        Returns:
            Dict mapping ``'Baseline_<extractor>/conv<i>/'`` to
            ``'<self.name>/conv<i>/'`` for ``i`` in ``0..4``.
        """
        submodel = 'Baseline_' + self._feature_extractor.name

        return {
            submodel + '/conv{}/'.format(i): self.name + '/conv{}/'.format(i)
            for i in range(5)
        }


from carpedm.models import generic

# Defined models. Imports here force registration.
from carpedm.models.baseline import SingleCharBaseline

Using Tasks and Models

Below is a minimal main.py example for getting started training a model using the Task interface. For an in-depth description, please refer to the guide Training a Model.

"""Minimal main module.

If this file is changed, please also change the ``:lines:`` option in
the following files where this code is referenced with the
``literalinclude`` directive.

    * ../guides/usage.rst
"""

import os
import re

import tensorflow as tf

import carpedm as dm
from carpedm.util import registry


# Task definition
# Keyword arguments forwarded to the task class looked up in the registry.
args = {'data_dir': dm.data.sample,
        'task_dir': '/tmp/carpedm_tasks',
        'shape_store': None,
        'shape_in': (64, 64)}
task = registry.task('ocr_single_kana')(**args)

# Training Hyperparameters
num_epochs = 30
training_hparams = {'train_batch_size': 32,
                    'eval_batch_size': 1,
                    'data_format': 'channels_last',
                    'optimizer': 'sgd',
                    'learning_rate': 1e-3,
                    'momentum': 0.96,
                    'weight_decay': 2e-4,
                    'gradient_clipping': None,
                    'lr_decay_steps': None,
                    'init_dir': None,  # for pre-trained models
                    'sync': False}

# Model hyperparameters and definition
model_hparams = {}
model = registry.model('single_char_baseline')(num_classes=task.num_classes, **model_hparams)

# Unique job_id
experiment_id = 'example'
# Turn the input shape into a path-safe token: strip parentheses and
# spaces, then replace commas with underscores ('(64, 64)' -> '64_64').
shape = re.sub(r'([,])', '_', re.sub(r'([() ])', '', str(args['shape_in'])))
job_id = os.path.join(experiment_id, shape, model.name)
task.job_id = job_id  # Used to check for first model initialization.
job_dir = os.path.join(task.task_log_dir, job_id)

# TensorFlow Configuration
# NOTE(review): the three calls below are cut off after their first line;
# their remaining arguments and closing parentheses are missing from this
# listing and must be restored from the original main.py.
sess_config = tf.ConfigProto(
config = tf.estimator.RunConfig(session_config=sess_config,
hparams = tf.contrib.training.HParams(is_chief=config.is_chief,

# Input and model functions
# NOTE(review): both input_fn calls are also cut off after the batch size;
# the arguments that tell the training input from the evaluation input
# are not shown here.
train_input_fn = task.input_fn(hparams.train_batch_size,
eval_input_fn = task.input_fn(hparams.eval_batch_size,
model_fn = task.model_fn(model, num_gpus=0, variable_strategy='CPU',
                         num_workers=config.num_worker_replicas or 1)

# Number of training steps
train_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'train')
eval_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'dev')

# eval_steps below uses floor division, so reject a validation set that
# does not divide evenly into evaluation batches.
# NOTE(review): the format tuple is cut off after ``eval_examples,``.
if eval_examples % hparams.eval_batch_size != 0:
    raise ValueError(('validation set size (%d) must be multiple of '
                      'eval_batch_size (%d)') % (eval_examples,

eval_steps = eval_examples // hparams.eval_batch_size
# ``or 1`` guarantees at least one step per epoch when the training set is
# smaller than one batch.
train_steps = num_epochs * ((train_examples // hparams.train_batch_size) or 1)

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=train_steps)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)

# Estimator definition and training
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config, params=hparams)
tf.estimator.train_and_evaluate(estimator, train_spec=train_spec, eval_spec=eval_spec)