# machine A with 4 GPU
# Starts three tasks of the cluster described by --ps_hosts / --worker_hosts:
# worker task 0, worker task 1 and the single parameter-server task 0.
# Presumably this machine is 10.10.102.28 (it owns the ps address and the
# first two worker addresses) and --task_id indexes into the matching host
# list — TODO confirm.
# Each command is presumably long-running, so start each one in its own
# shell/session rather than back-to-back from a single script — TODO confirm.
#
# NOTE(review): the restore failure in the log further down
# ("Assign requires shapes of both tensors to match. lhs shape= [4] rhs
# shape= [2]" on local_steps) looks like a checkpoint written with a different
# number of workers being restored. No checkpoint directory is passed here, so
# the trainer's default is used; consider clearing it or pointing the job at a
# fresh one (presumably via --train_dir — verify the flag name).

# Worker task 0.
~/models/inception/bazel-bin/inception/imagenet_distributed_train \
--batch_size=32 \
--data_dir=/data1/imagenet1k \
--job_name='worker' \
--task_id=0 \
--ps_hosts='10.10.102.28:2220' \
--worker_hosts='10.10.102.28:2221,10.10.102.28:2222,10.10.102.29:2221,10.10.102.29:2222'

# Worker task 1.
~/models/inception/bazel-bin/inception/imagenet_distributed_train \
--batch_size=32 \
--data_dir=/data1/imagenet1k \
--job_name='worker' \
--task_id=1 \
--ps_hosts='10.10.102.28:2220' \
--worker_hosts='10.10.102.28:2221,10.10.102.28:2222,10.10.102.29:2221,10.10.102.29:2222'

# Parameter-server task 0. It takes no --batch_size / --data_dir.
# The flag is spelled --task_id (double dash) like every other flag here; the
# previous single-dash form could be dropped as an unknown argument by the
# flag parser.
~/models/inception/bazel-bin/inception/imagenet_distributed_train \
--job_name='ps' \
--task_id=0 \
--ps_hosts='10.10.102.28:2220' \
--worker_hosts='10.10.102.28:2221,10.10.102.28:2222,10.10.102.29:2221,10.10.102.29:2222'
# machine B with 4 GPU
# Starts worker tasks 2 and 3, one after the other, with identical settings
# apart from --task_id. Presumably this machine is 10.10.102.29 (the last two
# entries of the worker list) — TODO confirm.
train_bin=~/models/inception/bazel-bin/inception/imagenet_distributed_train
ps_hosts='10.10.102.28:2220'
worker_hosts='10.10.102.28:2221,10.10.102.28:2222,10.10.102.29:2221,10.10.102.29:2222'

for task_id in 2 3; do
  "$train_bin" \
    --batch_size=32 \
    --data_dir=/data1/imagenet1k \
    --job_name='worker' \
    --task_id="$task_id" \
    --ps_hosts="$ps_hosts" \
    --worker_hosts="$worker_hosts"
done
INFO:tensorflow:Waiting for model to be ready: Attempting to use uninitialized value mixed_35x35x256a/branch3x3dbl/Conv/weights/ExponentialMovingAverage
[[Node: mixed_35x35x256a/branch3x3dbl/Conv/weights/ExponentialMovingAverage/read = Identity[T=DT_FLOAT, _class=["loc:@mixed_35x35x256a/branch3x3dbl/Conv/weights"], _device="/job:ps/replica:0/task:0/cpu:0"](mixed_35x35x256a/branch3x3dbl/Conv/weights/ExponentialMovingAverage)]]
Caused by op u'mixed_35x35x256a/branch3x3dbl/Conv/weights/ExponentialMovingAverage/read', defined at:
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 65, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv))
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 61, in main
inception_distributed_train.train(server.target, dataset, cluster_spec)
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/inception_distributed_train.py", line 220, in train
apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 427, in apply_gradients
self._variables_to_average)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/moving_averages.py", line 282, in apply
colocate_with_primary=True)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 86, in create_slot
return _create_slot_var(primary, val, scope)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 50, in _create_slot_var
slot = variables.Variable(val, name=scope, trainable=False)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 206, in __init__
dtype=dtype)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variables.py", line 275, in _init_from_args
self._snapshot = array_ops.identity(self._variable, name="read")
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 609, in identity
return _op_def_lib.apply_op("Identity", input=input, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
Traceback (most recent call last):
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 65, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv))
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 61, in main
inception_distributed_train.train(server.target, dataset, cluster_spec)
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/inception_distributed_train.py", line 260, in train
sess = sv.prepare_or_wait_for_session(target, config=sess_config)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/supervisor.py", line 674, in prepare_or_wait_for_session
config=config, init_feed_dict=self._init_feed_dict)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 158, in prepare_session
max_wait_secs=max_wait_secs, config=config)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/session_manager.py", line 214, in recover_session
saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1090, in restore
{self.saver_def.filename_tensor_name: save_path})
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 564, in _run
feed_dict_string, options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 637, in _do_run
target_list, options, run_metadata)
File "/usr/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 659, in _do_call
e.code)
tensorflow.python.framework.errors.InvalidArgumentError: Assign requires shapes of both tensors to match. lhs shape= [4] rhs shape= [2]
[[Node: save/Assign_103 = Assign[T=DT_INT64, _class=["loc:@local_steps"], use_locking=true, validate_shape=true, _device="/job:ps/replica:0/task:0/cpu:0"](local_steps, save/restore_slice_103)]]
[[Node: save/restore_all/NoOp_S4 = _Recv[client_terminated=false, recv_device="/job:worker/replica:0/task:0/gpu:0", send_device="/job:ps/replica:0/task:0/cpu:0", send_device_incarnation=1831303354831316628, tensor_name="edge_1174_save/restore_all/NoOp", tensor_type=DT_FLOAT, _device="/job:worker/replica:0/task:0/gpu:0"]()]]
Caused by op u'save/Assign_103', defined at:
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 65, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 30, in run
sys.exit(main(sys.argv))
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/imagenet_distributed_train.py", line 61, in main
inception_distributed_train.train(server.target, dataset, cluster_spec)
File "/home/models/inception/bazel-bin/inception/imagenet_distributed_train.runfiles/__main__/inception/inception_distributed_train.py", line 233, in train
saver = tf.train.Saver()
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 832, in __init__
restore_sequentially=restore_sequentially)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 502, in build
filename_tensor, vars_to_save, restore_sequentially, reshape)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 268, in _AddRestoreOps
validate_shape=validate_shape))
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/gen_state_ops.py", line 40, in assign
use_locking=use_locking, name=name)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()