Skip to content

Commit 0545159

Browse files
authored
[DIST] Set data_sync_drop_remainder as true by default. (#143)
1. Set data_sync_drop_remainder to true by default.

Signed-off-by: langshi.cls <langshi.cls@alibaba-inc.com>
1 parent d65f685 commit 0545159

File tree

6 files changed

+30
-22
lines changed

6 files changed

+30
-22
lines changed

docs/tutorial/ranking/criteo/train.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,17 @@ def train(self, filenames):
112112
self._args.top_mlp_dims)
113113
loss = self.compute_loss(logits, labels)
114114
step = tf.train.get_or_create_global_step()
115-
train_op = sgd_decay_optimize(
116-
loss,
117-
lr_initial_value=self._args.lr_initial_value,
118-
lr_warmup_steps=self._args.lr_warmup_steps,
119-
lr_decay_start_step=self._args.lr_decay_start_step,
120-
lr_decay_steps=self._args.lr_decay_steps)
121-
return step, loss, train_op
115+
train_auc, train_auc_update_op = hb.metrics.auc(
116+
labels=labels,
117+
predictions=logits, name='train_auc')
118+
with tf.control_dependencies([train_auc_update_op]):
119+
train_op = sgd_decay_optimize(
120+
loss,
121+
lr_initial_value=self._args.lr_initial_value,
122+
lr_warmup_steps=self._args.lr_warmup_steps,
123+
lr_decay_start_step=self._args.lr_decay_start_step,
124+
lr_decay_steps=self._args.lr_decay_steps)
125+
return step, loss, train_op, train_auc
122126

123127
def evaluate(self, filenames):
124128
r'''Evaluate model.
@@ -160,7 +164,7 @@ def main(args):
160164
train_filenames = args.filenames
161165
eval_filenames = args.filenames
162166
model = RankingModel(args)
163-
step, loss, train_op = model.train(train_filenames)
167+
step, loss, train_op, train_auc = model.train(train_filenames)
164168

165169
hooks = []
166170
if args.eval_every_n_iter is not None:
@@ -171,7 +175,7 @@ def main(args):
171175
if args.log_every_n_iter is not None:
172176
hooks.append(
173177
tf.train.LoggingTensorHook(
174-
{'step': step, 'loss': loss},
178+
{'step': step, 'loss': loss, 'train_auc': train_auc},
175179
every_n_iter=args.log_every_n_iter))
176180
if args.train_max_steps is not None:
177181
hooks.append(tf.train.StopAtStepHook(args.train_max_steps))
@@ -236,5 +240,5 @@ def main(args):
236240
disable_imputation=parsed.disable_imputation,
237241
disable_transform=True,
238242
override_embedding_size=parsed.embedding_dim)
239-
with hb.scope():
243+
with hb.scope(data_sync_drop_remainder=False):
240244
main(parsed)

docs/tutorial/ranking/taobao/train.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,11 @@ def train(self, filenames):
113113
loss = self.compute_loss(logits, labels)
114114
step = tf.train.get_or_create_global_step()
115115
opt = tf.train.AdagradOptimizer(learning_rate=self._args.lr)
116-
train_op = opt.minimize(loss, global_step=step)
117-
return step, loss, train_op
116+
train_auc, train_auc_update_op = hb.metrics.auc(
117+
labels=labels, predictions=logits, name='train_auc')
118+
with tf.control_dependencies([train_auc_update_op]):
119+
train_op = opt.minimize(loss, global_step=step)
120+
return step, loss, train_op, train_auc
118121

119122
def evaluate(self, filenames):
120123
r'''Evaluate model.
@@ -148,7 +151,7 @@ def main(args):
148151
train_filenames = args.filenames
149152
eval_filenames = args.filenames
150153
model = RankingModel(args)
151-
step, loss, train_op = model.train(train_filenames)
154+
step, loss, train_op, train_auc = model.train(train_filenames)
152155

153156
hooks = []
154157
if args.eval_every_n_iter is not None:
@@ -159,7 +162,7 @@ def main(args):
159162
if args.log_every_n_iter is not None:
160163
hooks.append(
161164
tf.train.LoggingTensorHook(
162-
{'step': step, 'loss': loss},
165+
{'step': step, 'loss': loss, 'train_auc': train_auc},
163166
every_n_iter=args.log_every_n_iter))
164167
if args.train_max_steps is not None:
165168
hooks.append(tf.train.StopAtStepHook(args.train_max_steps))

hybridbackend/tensorflow/data/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,4 @@
4242
_ = (
4343
_ctx.get().options
4444
.register('data_batch_count', 1)
45-
.register('data_sync_drop_remainder', False))
45+
.register('data_sync_drop_remainder', True))

hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ def _test_distributed(rank):
6464
batch_size = 10
6565

6666
with tf.Graph().as_default():
67-
with hb.scope(mode=tf.estimator.ModeKeys.TRAIN):
67+
with hb.scope(
68+
data_sync_drop_remainder=False, mode=tf.estimator.ModeKeys.TRAIN):
6869
with tf.device('/cpu:0'):
6970
ds = tf.data.Dataset.range(100 + rank * 50)
7071
ds = ds.batch(batch_size=batch_size)

hybridbackend/tensorflow/estimator/estimator.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,9 @@ def __init__(self, model_fn, **kwargs):
183183
'''
184184
kwargs['config'] = RunConfig.build(prototype=kwargs.pop('config', None))
185185
model_dir = kwargs.get('model_dir', None)
186-
self._train_drop_remainder = kwargs.pop('train_drop_remainder', None)
187-
self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', None)
188-
self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', None)
186+
self._train_drop_remainder = kwargs.pop('train_drop_remainder', True)
187+
self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', True)
188+
self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', True)
189189

190190
super().__init__(
191191
wraps_model_fn(model_fn, model_dir, kwargs['config']),

hybridbackend/tensorflow/keras/model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -473,9 +473,9 @@ class HybridBackendKerasModel(cls, HybridBackendKerasModelBase):
473473
'''
474474
def __init__(self, *args, **kwargs):
475475
self._device_fn = device_function
476-
self._train_drop_remainder = kwargs.pop('train_drop_remainder', None)
477-
self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', None)
478-
self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', None)
476+
self._train_drop_remainder = kwargs.pop('train_drop_remainder', True)
477+
self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', True)
478+
self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', True)
479479
self._load_weights_dir = None
480480
self._load_weights_scope = None
481481
self._load_weights_skip_mismatched = True

0 commit comments

Comments (0)