Examples¶
Single Character Task¶
Below is an example Task definition for a single character recognition task,
together with the corresponding import in __init__.py that makes the task
accessible through the registry.
For more details on Task definition and default properties, please refer to the Tasks documentation.
ocr.py¶
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
"""Optical character recognition tasks.
TODO:
* Modularize common loss functions, select by id
* Modularize common regularization options, select by id
"""
import abc
import tensorflow as tf
from carpedm.data.lang import JapaneseUnicodes
from carpedm.tasks.generic import Task
from carpedm.util import registry
from carpedm.util.eval import confusion_matrix_metric
class OCRTask(Task):
    """Common base for optical character recognition tasks.

    Fixes the data-selection flags shared by the OCR tasks in this
    module and leaves loss, regularization, and result reporting to
    concrete subclasses.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to ``Task``."""
        super(OCRTask, self).__init__(**kwargs)

    # -- Fixed task properties -------------------------------------------
    # NOTE(review): the exact meaning of each flag is defined by ``Task``,
    # which is not visible here; only the returned values are certain.

    @property
    def target(self):
        """Key of the feature used as the training target."""
        return 'image/seq/char/id'

    @property
    def character(self):
        """Always ``True`` for OCR tasks."""
        return True

    @property
    def label(self):
        """Always ``True`` for OCR tasks."""
        return True

    @property
    def blocks(self):
        """Always ``False`` for OCR tasks."""
        return False

    @property
    def line(self):
        """Always ``False`` for OCR tasks."""
        return False

    @property
    def bbox(self):
        """Always ``False`` for OCR tasks."""
        return False

    # -- Hooks for subclasses --------------------------------------------

    @property
    @abc.abstractmethod
    def sparse_labels(self):
        """Whether targets are sparse; subclasses are expected to override.

        Marked abstract, but enforcement depends on the metaclass of
        ``Task``, which is not visible here.
        """
        return False

    def loss_fn(self, features, model_output, targets, is_training):
        """Compute the training loss; must be provided by subclasses."""
        raise NotImplementedError

    def regularization(self, hparams):
        """Compute the regularization term; must be provided by subclasses."""
        raise NotImplementedError

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Build logging tensors, predictions, and metrics.

        Must be provided by subclasses.
        """
        raise NotImplementedError
@registry.register_task
class OCRSingleKana(OCRTask):
    """Single kana character recognition task.

    Images are scoped to individual characters and labels are dense
    (non-sparse) class ids drawn from the kana character set.
    """

    @property
    def image_scope(self):
        """Images are cropped at the single-character level."""
        return 'char'

    @property
    def character_set(self):
        """Character set used for this task: kana."""
        return JapaneseUnicodes('kana')

    @property
    def sparse_labels(self):
        """Targets are dense class ids."""
        return False

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Assemble logging tensors, predictions, and evaluation metrics.

        Args:
            loss: Loss tensor, logged under the key ``'loss'``.
            tower_features: Per-tower feature dicts; the ``'image/data'``
                entry of the first tower is written as an image summary.
            tower_preds: Per-tower logits tensors, concatenated on axis 0.
            tower_targets: Per-tower label tensors, concatenated on axis 0.
            is_training: Unused here.

        Returns:
            Tuple ``(tensors_to_log, predictions, metrics)`` where
            ``predictions`` holds ``'classes'`` and ``'probabilities'``
            and ``metrics`` holds ``'accuracy'`` and ``'confusion'``.
        """
        tensors_to_log = {'loss': loss}
        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat(list(tower_preds), axis=0)
        predictions = {
            'classes': tf.argmax(all_logits, axis=1),
            'probabilities': tf.nn.softmax(all_logits),
        }

        # Flatten to one label per example. A bare tf.squeeze() drops
        # *every* size-1 dimension, so with a single example (e.g. an
        # evaluation batch size of 1) the batch dimension vanished too and
        # the labels became a scalar that no longer lined up with
        # predictions['classes'].
        stacked_labels = tf.reshape(tf.concat(tower_targets, axis=0), [-1])

        accuracy = tf.metrics.accuracy(stacked_labels, predictions['classes'])
        metrics = {
            'accuracy': accuracy,
            # self.num_classes is not defined in this class; presumably
            # it is provided by the Task base class.
            'confusion': confusion_matrix_metric(
                stacked_labels, predictions['classes'], self.num_classes),
        }
        return tensors_to_log, predictions, metrics

    def loss_fn(self, features, model_output, targets, is_training):
        """Return the sparse softmax cross-entropy of logits vs. targets.

        Args:
            features: Unused here.
            model_output: Logits tensor produced by the model.
            targets: Integer class-id labels.
            is_training: Unused here.
        """
        with tf.name_scope('batch_xentropy'):
            loss = tf.losses.sparse_softmax_cross_entropy(
                logits=model_output, labels=targets)
        return loss

    def regularization(self, hparams):
        """Return the L2 weight-decay penalty over all trainable variables.

        Args:
            hparams: Hyperparameters; ``hparams.weight_decay`` scales the
                summed L2 loss.
        """
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            hparams.weight_decay,
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            name='weight_loss')
        return weight_loss
@registry.register_task
class OCRSeqKana3(OCRTask):
    """Recognition task for kana sequences of length 3, trained with CTC.

    Targets are sparse label sequences; predictions are decoded with a
    CTC beam search.
    """

    def __init__(self, beam_width=100, **kwargs):
        """Initializer.

        Args:
            beam_width: Beam width passed to the CTC beam search decoder.
            **kwargs: Forwarded to ``OCRTask``.
        """
        self._beam_width = beam_width
        super(OCRSeqKana3, self).__init__(**kwargs)

    @property
    def character_set(self):
        """Character set used for this task: kana."""
        return JapaneseUnicodes('kana')

    @property
    def image_scope(self):
        """Images are cropped at the sequence level."""
        return 'seq'

    @property
    def sequence_length(self):
        """Number of characters per sequence."""
        return 3

    @property
    def sparse_labels(self):
        """Targets are sparse, as required by the CTC loss."""
        return True

    @property
    def target(self):
        """Key of the sparse label feature used as the training target."""
        return 'image/seq/char/id_sparse'

    def loss_fn(self, features, model_output, targets, is_training):
        """Return the CTC loss for batch-major logits.

        Args:
            features: Unused here.
            model_output: Dict with ``'logits'`` (batch-major, since
                ``time_major=False``) and ``'seq_len'``.
            targets: Sparse label sequences.
            is_training: Unused here.
        """
        return tf.nn.ctc_loss(labels=targets,
                              inputs=model_output['logits'],
                              sequence_length=model_output['seq_len'],
                              time_major=False)

    def results(self, loss, tower_features, tower_preds, tower_targets,
                is_training):
        """Decode predictions and build logging tensors and metrics.

        Args:
            loss: Loss tensor, logged under the key ``'loss'``.
            tower_features: Per-tower feature dicts; the ``'image/data'``
                entry of the first tower is written as an image summary.
            tower_preds: Per-tower dicts with ``'logits'`` and
                ``'seq_len'``.
            tower_targets: Per-tower sparse label tensors.
            is_training: Unused here.

        Returns:
            Tuple ``(tensors_to_log, predictions, metrics)``. Metrics are
            the label error rate (``'ler'``) and the sequence error rate
            (``'ser'``).
        """
        tf.summary.image("sample_input", tower_features[0]['image/data'])

        all_logits = tf.concat([p['logits'] for p in tower_preds], axis=0)
        seq_lens = tf.concat([p['seq_len'] for p in tower_preds], axis=0)

        # TODO: fix when seqs are different lengths from multiple GPUs
        all_labels = tf.sparse_concat(0, list(tower_targets))

        # The decoder expects time-major input, hence the transpose of
        # the batch-major logits.
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            inputs=tf.transpose(all_logits, [1, 0, 2]),
            sequence_length=seq_lens,
            beam_width=self._beam_width)
        decoded = decoded[0]  # best path

        edit_distance = tf.edit_distance(decoded, tf.to_int64(all_labels),
                                         normalize=False)

        # Label error rate: total edit distance over the label count.
        Z = tf.cast(tf.size(all_labels), tf.float32)
        ler = tf.reduce_sum(edit_distance) / Z

        # Sequence error rate: fraction of sequences with any error.
        S = tf.cast(tf.size(edit_distance), tf.float32)
        num_wrong_seqs = tf.cast(tf.count_nonzero(edit_distance), tf.float32)
        ser = num_wrong_seqs / S

        metrics = {
            'ler': tf.metrics.mean(ler),
            'ser': tf.metrics.mean(ser),
        }
        tensors_to_log = {'loss': loss, 'ler': ler, 'ser': ser}

        # Map class ids back to characters for the text summaries.
        mapping_string = tf.constant(self._meta.vocab.types())
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            mapping_string, default_value='NULL')
        decoding = table.lookup(
            tf.to_int64(tf.sparse_tensor_to_dense(decoded)))
        gt = table.lookup(
            tf.to_int64(tf.sparse_tensor_to_dense(all_labels)))
        tf.summary.text('decoded', decoding)
        tf.summary.text('gt', gt)

        predictions = {
            # Logits are batch-major [batch, time, classes], so the class
            # axis is the last one. axis=1 took the argmax over time
            # steps and disagreed with the softmax below, which
            # normalizes over the last axis.
            'classes': tf.argmax(input=all_logits, axis=-1),
            'probabilities': tf.nn.softmax(all_logits),
            'decoded': decoding,
        }
        return tensors_to_log, predictions, metrics

    def regularization(self, hparams):
        """Return the L2 weight-decay penalty over all trainable variables.

        Args:
            hparams: Hyperparameters; ``hparams.weight_decay`` scales the
                summed L2 loss.
        """
        model_params = tf.trainable_variables()
        weight_loss = tf.multiply(
            hparams.weight_decay,
            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
            name='weight_loss')
        return weight_loss
tasks.__init__.py¶
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
from carpedm.tasks import generic
# Defined tasks. Imports here force registration.
from carpedm.tasks.ocr import OCRSingleKana
Baseline Model¶
baseline.py¶
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
"""Baseline models."""
import tensorflow as tf
from carpedm.models.generic import TFModel
from carpedm import nn
from carpedm.util import registry
@registry.register_model
class SingleCharBaseline(TFModel):
    """Baseline classifier: a CNN followed by two dense layers."""

    def __init__(self, num_classes, *args, **kwargs):
        """Set up the model.

        Overrides TFModel.

        Args:
            num_classes: Number of possible character classes.
            *args: Unused arguments.
            **kwargs: Unused arguments.
        """
        self._num_classes = num_classes
        self._cnn = nn.conv.CNN()

    @property
    def name(self):
        """Model name: ``"Baseline_"`` plus the CNN's own name."""
        cnn_name = self._cnn.name
        return "Baseline_" + cnn_name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        """Compute class logits from the input images.

        Args:
            features: Feature dict; only ``'image/data'`` is read.
            data_format: Passed through to the CNN.
            axes_order: Passed through to the CNN.
            is_training: Passed through to the CNN.
            reuse: Passed through to the CNN.

        Returns:
            Logits tensor with ``num_classes`` units.
        """
        # The fifth positional argument (False) is defined by
        # CNN.forward_pass, which is not visible here.
        conv_out = self._cnn.forward_pass(
            features['image/data'], data_format, axes_order, is_training,
            False, reuse)

        hidden = tf.layers.flatten(conv_out)
        tf.logging.info('image after flatten: %s', hidden.get_shape())

        # Two identical ReLU layers of 200 units, each with a summary.
        for layer_name in ('dense1', 'dense2'):
            hidden = tf.layers.dense(
                inputs=hidden, units=200, activation=tf.nn.relu,
                name=layer_name)
            nn.util.activation_summary(hidden)

        return tf.layers.dense(
            inputs=hidden, units=self._num_classes, name='logits')
@registry.register_model
class SequenceBaseline(TFModel):
    """A simple baseline CNN-LSTM model."""

    def __init__(self, num_classes, lstm_layers=2, lstm_units=100,
                 feature_extractor=None, *args, **kwargs):
        """Initializer.

        Overrides TFModel.

        Args:
            num_classes (int): Number of possible character classes.
            lstm_layers (int): Number of LSTM layers.
            lstm_units (int): Number of units in LSTM cell.
            feature_extractor: Object exposing ``forward_pass`` and
                ``name`` that turns images into feature maps. Defaults
                to a fresh ``nn.conv.CNN()`` per model instance.
            *args: Unused arguments.
            **kwargs: Unused arguments.
        """
        # Build the default here, not in the signature: a default
        # argument is evaluated once at definition time, so every model
        # would otherwise share the same CNN object.
        if feature_extractor is None:
            feature_extractor = nn.conv.CNN()
        self._num_classes = num_classes + 1  # Add CTC null label.
        self._layers = lstm_layers
        self._units = lstm_units
        self._feature_extractor = feature_extractor

    @property
    def name(self):
        """Model name: ``'Baseline_seq_'`` plus the extractor's name."""
        return 'Baseline_seq_' + self._feature_extractor.name

    def _forward_pass(self, features, data_format, axes_order,
                      is_training, reuse):
        """Compute per-step logits and sequence lengths.

        Args:
            features: Feature dict; only ``'image/data'`` is read.
            data_format: Passed through to the feature extractor.
            axes_order: Passed through to the feature extractor; when it
                equals ``[0, 3, 1, 2]`` the output is transposed with
                ``[0, 2, 3, 1]`` before reshaping.
            is_training: Passed through to the feature extractor.
            reuse: Passed through to the feature extractor.

        Returns:
            Dict with ``'logits'`` and ``'seq_len'``; ``seq_len`` repeats
            the size of axis 1 once per batch element.
        """
        x = self._feature_extractor.forward_pass(
            features['image/data'], data_format, axes_order,
            is_training, False, reuse)
        if axes_order == [0, 3, 1, 2]:
            x = tf.transpose(x, [0, 2, 3, 1])
        # Merge the last two axes so axis 1 becomes the sequence axis
        # fed to the LSTM.
        x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
        x = nn.rnn.bi_lstm(x, n_layers=self._layers, n_units=self._units)
        # Every example in the batch has the same length: the full
        # size of axis 1.
        seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
                          [tf.to_int32(tf.shape(x)[0])])
        logits = tf.layers.dense(inputs=x, units=self._num_classes)
        return {'logits': logits, 'seq_len': seq_len}

    def initialize_pretrained(self, pretrained_dir):
        """Map pretrained convolution scopes onto this model's scopes.

        Args:
            pretrained_dir: Unused here.

        Returns:
            Dict mapping ``'Baseline_<extractor>/conv<i>/'`` to
            ``'<this model name>/conv<i>/'`` for ``i`` in 0..4.
        """
        submodel = 'Baseline_' + self._feature_extractor.name
        # NOTE(review): assumes the extractor has five conv scopes
        # named conv0..conv4 -- confirm against nn.conv.CNN.
        return {
            submodel + '/conv{}/'.format(i): self.name + '/conv{}/'.format(i)
            for i in range(5)
        }
models.__init__.py¶
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
from carpedm.models import generic
# Defined models. Imports here force registration.
from carpedm.models.baseline import SingleCharBaseline
Using Tasks and Models¶
Below is a minimal main.py
example for getting started with training a model using the Task interface.
For an in-depth description, please refer to the guide Training a Model.
#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
"""Minimal main module.
If this file is changed, please also change the ``:lines:`` option in
the following files where this code is referenced with the
``literalinclude`` directive.
* ../guides/usage.rst
"""
import os
import re
import tensorflow as tf
import carpedm as dm
from carpedm.util import registry
tf.logging.set_verbosity(tf.logging.INFO)

# ---- Task definition ----------------------------------------------------
args = dict(data_dir=dm.data.sample,
            task_dir='/tmp/carpedm_tasks',
            shape_store=None,
            shape_in=(64, 64))
task_cls = registry.task('ocr_single_kana')
task = task_cls(**args)

# ---- Training hyperparameters -------------------------------------------
num_epochs = 30
training_hparams = dict(train_batch_size=32,
                        eval_batch_size=1,
                        data_format='channels_last',
                        optimizer='sgd',
                        learning_rate=1e-3,
                        momentum=0.96,
                        weight_decay=2e-4,
                        gradient_clipping=None,
                        lr_decay_steps=None,
                        init_dir=None,  # for pre-trained models
                        sync=False)

# ---- Model hyperparameters and definition -------------------------------
model_hparams = {}
model_cls = registry.model('single_char_baseline')
model = model_cls(num_classes=task.num_classes, **model_hparams)

# ---- Unique job id ------------------------------------------------------
experiment_id = 'example'
# Turn the input shape, e.g. "(64, 64)", into a path-friendly "64_64":
# strip parentheses and spaces first, then swap commas for underscores.
shape_no_brackets = re.sub(r'([() ])', '', str(args['shape_in']))
shape = re.sub(r'([,])', '_', shape_no_brackets)
job_id = os.path.join(experiment_id, shape, model.name)
task.job_id = job_id  # Used to check for first model initialization.
job_dir = os.path.join(task.task_log_dir, job_id)

# ---- TensorFlow configuration -------------------------------------------
gpu_options = tf.GPUOptions(force_gpu_compatible=True)
sess_config = tf.ConfigProto(allow_soft_placement=True,
                             log_device_placement=False,
                             intra_op_parallelism_threads=0,
                             gpu_options=gpu_options)
config = tf.estimator.RunConfig(session_config=sess_config,
                                model_dir=job_dir,
                                save_summary_steps=10)
hparams = tf.contrib.training.HParams(is_chief=config.is_chief,
                                      **training_hparams)

# ---- Input and model functions ------------------------------------------
train_input_fn = task.input_fn(
    hparams.train_batch_size, subset='train', num_shards=1, overwrite=False)
eval_input_fn = task.input_fn(
    hparams.eval_batch_size, subset='dev', num_shards=1, overwrite=False)
num_workers = config.num_worker_replicas or 1
model_fn = task.model_fn(model, num_gpus=0, variable_strategy='CPU',
                         num_workers=num_workers)

# ---- Number of training / evaluation steps ------------------------------
train_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'train')
eval_examples = dm.data.num_examples_per_epoch(task.task_data_dir, 'dev')

# The dev set must divide evenly into evaluation batches.
if eval_examples % hparams.eval_batch_size:
    raise ValueError(('validation set size (%d) must be multiple of '
                      'eval_batch_size (%d)') % (eval_examples,
                                                 hparams.eval_batch_size))
eval_steps = eval_examples // hparams.eval_batch_size

# Run at least one step per epoch, even for a tiny training set.
steps_per_epoch = (train_examples // hparams.train_batch_size) or 1
train_steps = num_epochs * steps_per_epoch

train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                    max_steps=train_steps)
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)

# ---- Estimator definition and training ----------------------------------
estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   config=config,
                                   params=hparams)
tf.estimator.train_and_evaluate(estimator,
                                train_spec=train_spec,
                                eval_spec=eval_spec)