Source code for carpedm.util.train

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.
#
#
# Portions of this module are taken or adapted from the TensorFlow
# CIFAR-10 estimator tutorial, so here is their license.
#
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""Training utilities.

This module provides utilities for training machine learning models.
It uses or makes slight modifications to code from the
`TensorFlow CIFAR-10 estimator tutorial`_.

..  _TensorFlow CIFAR-10 estimator tutorial:
    https://github.com/tensorflow/models/tree/master/tutorials/image

"""
import itertools

import tensorflow as tf
from tensorflow.python.platform import tf_logging as logging
from tensorflow.core.framework import node_def_pb2
from tensorflow.python.framework import device as pydev
from tensorflow.python.training import basic_session_run_hooks
from tensorflow.python.training import session_run_hook
from tensorflow.python.training import training_util
from tensorflow.python.training import device_setter


def config_optimizer(params):
    """Configure the optimizer used for training.

    Sets the learning rate schedule and optimization algorithm.

    Args:
        params (tf.contrib.training.HParams): Hyperparameters.

    Returns:
        tf.train.Optimizer

    """
    # Learning rate schedule
    if params.lr_decay_steps:
        learning_rate = tf.train.exponential_decay(
            params.learning_rate,
            tf.train.get_global_step(),
            params.lr_decay_steps,
            params.lr_decay_rate,
            staircase=True
        )
        tf.summary.scalar("learning_rate", learning_rate)
    # elif params.staged_lr:
    #     # tf.train.piecewise_constant(tf.train.get_global_step(),
    #     #                             boundaries=None,
    #     #                             staged_lr=params.staged_lr)
    #     raise NotImplementedError
    else:
        learning_rate = params.learning_rate

    # Optimizer
    if params.optimizer == 'sgd' and params.momentum:
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=params.momentum)
    elif params.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif params.optimizer == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
    elif params.optimizer == 'rmsprop' and params.momentum:
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                              momentum=params.momentum)
    elif params.optimizer == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
    else:
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate)
    return optimizer
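

# A minimal, hypothetical usage sketch for ``config_optimizer``. The function
# name and hyperparameter values below are illustrative only, not part of the
# carpedm API; TensorFlow 1.x with ``tf.contrib`` is assumed.
def _example_config_optimizer():
    """Hypothetical sketch: build an optimizer from hyperparameters."""
    hparams = tf.contrib.training.HParams(
        learning_rate=0.1,
        lr_decay_steps=1000,   # decay every 1000 steps ...
        lr_decay_rate=0.96,    # ... by a factor of 0.96 (staircase)
        optimizer='sgd',
        momentum=0.9)          # 'sgd' + momentum selects MomentumOptimizer
    with tf.Graph().as_default():
        # The decay schedule reads the global step, so create one first.
        tf.train.get_or_create_global_step()
        return config_optimizer(hparams)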


# ========= BEGIN Adapted from TensorFlow CIFAR 10 tutorial ========= #

class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
    """Hook to print out examples per second.

    Total time is tracked and then divided by the total number of steps
    to get the average step time, and then batch_size is used to
    determine the running average of examples per second. The examples
    per second for the most recent interval is also logged.

    """

    def __init__(self, batch_size, every_n_steps=10, every_n_secs=None):
        """Initializer for ExamplesPerSecondHook.

        Args:
            batch_size: Total batch size used to calculate
                examples/second from global time.
            every_n_steps: Log stats every n steps.
            every_n_secs: Log stats every n seconds.
        """
        if (every_n_steps is None) == (every_n_secs is None):
            raise ValueError('exactly one of every_n_steps'
                             ' and every_n_secs should be provided.')
        self._timer = basic_session_run_hooks.SecondOrStepTimer(
            every_steps=every_n_steps, every_secs=every_n_secs)

        self._step_train_time = 0
        self._total_steps = 0
        self._batch_size = batch_size

    def begin(self):
        self._global_step_tensor = training_util.get_global_step()
        if self._global_step_tensor is None:
            raise RuntimeError(
                'Global step should be created to use StepCounterHook.')

    def before_run(self, run_context):  # pylint: disable=unused-argument
        return basic_session_run_hooks.SessionRunArgs(
            self._global_step_tensor)

    def after_run(self, run_context, run_values):
        _ = run_context

        global_step = run_values.results
        if self._timer.should_trigger_for_step(global_step):
            elapsed_time, elapsed_steps = \
                self._timer.update_last_triggered_step(global_step)
            if elapsed_time is not None:
                steps_per_sec = elapsed_steps / elapsed_time
                self._step_train_time += elapsed_time
                self._total_steps += elapsed_steps

                average_examples_per_sec = self._batch_size * (
                    self._total_steps / self._step_train_time)
                current_examples_per_sec = steps_per_sec * self._batch_size
                # Average examples/sec followed by current examples/sec
                logging.info('%s: %g (%g), step = %g',
                             'Average examples/sec',
                             average_examples_per_sec,
                             current_examples_per_sec,
                             self._total_steps)


def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
    if ps_ops is None:
        ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

    if ps_strategy is None:
        ps_strategy = device_setter._RoundRobinStrategy(num_devices)
    if not callable(ps_strategy):
        raise TypeError("ps_strategy must be callable")

    def _local_device_chooser(op):
        current_device = pydev.DeviceSpec.from_string(op.device or "")

        node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
        if node_def.op in ps_ops:
            ps_device_spec = pydev.DeviceSpec.from_string(
                '/{}:{}'.format(ps_device_type, ps_strategy(op)))

            ps_device_spec.merge_from(current_device)
            return ps_device_spec.to_string()
        else:
            worker_device_spec = pydev.DeviceSpec.from_string(
                worker_device or "")
            worker_device_spec.merge_from(current_device)
            return worker_device_spec.to_string()

    return _local_device_chooser


def group_train_op(optimizer, gradvars, update_ops):
    train_op = [
        optimizer.apply_gradients(
            gradvars, global_step=tf.train.get_global_step())
    ]
    train_op.extend(update_ops)
    train_op = tf.group(*train_op)
    return train_op


def make_hooks(tensors_to_log, optimizer, num_workers, params):
    examples_sec_hook = ExamplesPerSecondHook(
        params.train_batch_size, every_n_steps=100)
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=100)
    train_hooks = [logging_hook, examples_sec_hook]
    if params.sync:
        optimizer = tf.train.SyncReplicasOptimizer(
            optimizer, replicas_to_aggregate=num_workers)
        sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
        train_hooks.append(sync_replicas_hook)
    return train_hooks


def compute_global_grads_loss(tower_gradvars, tower_losses):
    gradvars = []
    with tf.name_scope('gradient_averaging'):
        all_grads = {}
        for grad, var in itertools.chain(*tower_gradvars):
            if grad is not None:
                all_grads.setdefault(var, []).append(grad)
        for var, grads in all_grads.items():
            # Average gradients on the same device as the variables
            # to which they apply.
            with tf.device(var.device):
                if len(grads) == 1:
                    avg_grad = grads[0]
                else:
                    avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
            gradvars.append((avg_grad, var))

    loss = tf.reduce_mean(tower_losses, name='loss')
    return gradvars, loss
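

# The helpers above are intended to be composed inside a multi-tower
# ``model_fn``. The function below is a hypothetical sketch of that wiring
# (its name and arguments are illustrative, not part of carpedm): it assumes
# per-tower losses and ``(gradient, variable)`` lists have already been
# computed, e.g. on devices chosen by ``local_device_setter``, and that
# ``params`` also defines ``train_batch_size``, ``sync``, and ``is_chief``.
def _example_build_train_op(tower_losses, tower_gradvars, params,
                            num_workers=1):
    """Hypothetical sketch combining the training utilities in this module."""
    # A global step must exist for the decay schedule and apply_gradients.
    tf.train.get_or_create_global_step()

    # Average per-tower gradients (and losses) into a single global set.
    gradvars, loss = compute_global_grads_loss(tower_gradvars, tower_losses)

    # Build the optimizer from the hyperparameters (schedule + algorithm).
    optimizer = config_optimizer(params)

    # Logging hooks plus, when ``params.sync`` is set, a sync-replicas hook.
    train_hooks = make_hooks({'loss': loss}, optimizer, num_workers, params)

    # Apply the averaged gradients together with any pending update ops
    # (e.g. batch-normalization moving-average updates).
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = group_train_op(optimizer, gradvars, update_ops)

    return train_op, train_hooks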