Skip to content

Commit 2eff354

Browse files
committed
Apply Janghaeng's patchfile manually
- TF timeline profile data can be done by setting timeline_prof in [training] section in conf.yml - Changed conf.yml to run for a short amount of time for functionality test on Intel GPUs - Decoupled output data directory from input data directory (KGF: modified to use status quo directory hierarchy, but should add separate conf.yaml option for input and output directories, not rely on username subdir)
1 parent 94b8775 commit 2eff354

File tree

5 files changed

+53
-6
lines changed

5 files changed

+53
-6
lines changed

examples/conf.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ training:
135135
use_process_generator: False
136136
num_batches_minimum: 20 # minimum number of batches per epoch
137137
ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
138+
timeline_prof: False
139+
step_limit: 50
140+
no_validation: True
138141
callbacks:
139142
list: ['earlystop']
140143
metrics: ['val_loss','val_roc','train_loss']

examples/mpi_learn.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@
111111
shot_list_test=shot_list_test)
112112
g.flush_all_inorder()
113113

114+
if conf['training']['no_validation'] and conf['training']['step_limit'] > 0:
115+
sys.stdout.flush()
116+
g.print_unique('SHORT TRAINING ONLY. conf.yaml (step_limit) finished without VALIDATION')
117+
quit()
118+
114119
#####################################################
115120
# TESTING #
116121
#####################################################

plasma/conf_parser.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# from data.signals import (
77
# all_signals, fully_defined_signals_1D,
88
# jet, d3d) # nstx
9+
import os
910
import getpass
1011
import yaml
1112

@@ -23,10 +24,14 @@ def parameters(input_file):
2324
with open(input_file, 'r') as yaml_file:
2425
params = yaml.load(yaml_file, Loader=yaml.SafeLoader)
2526
params['user_name'] = getpass.getuser()
26-
output_path = params['fs_path'] + "/" + params['user_name']
27-
base_path = output_path
27+
base_path = params['fs_path']
28+
output_path = os.path.join(base_path, params['user_name'])
29+
# TODO(KGF): allow for completely independent save/output_path vs. base_path
30+
# configured in conf.yaml. don't assume username subdirectory, or pwd
31+
# save_path = os.environ.get("PWD")
2832

2933
params['paths']['base_path'] = base_path
34+
params['paths']['output_path'] = output_path
3035
if isinstance(params['paths']['signal_prepath'], list):
3136
g.print_unique('Reading from multiple data folders!')
3237
params['paths']['signal_prepath'] = [
@@ -36,7 +41,6 @@ def parameters(input_file):
3641
base_path + params['paths']['signal_prepath'])
3742
params['paths']['shot_list_dir'] = (
3843
base_path + params['paths']['shot_list_dir'])
39-
params['paths']['output_path'] = output_path
4044
# See notes in data/signals.py for details on signal tols relative to
4145
# t_disrupt. The following 2x dataset definitions permit progressively
4246
# worse signal quality when preprocessing the shots and omitting some

plasma/models/loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -758,8 +758,8 @@ def load_as_X_y(self, shot, prediction_mode=False):
758758

759759
def get_mock_data(self):
760760
signals = np.linspace(0, 4*np.pi, 10000)
761-
rand_idx = np.randint(6000)
762-
lgth = np.randint(1000, 3000)
761+
rand_idx = np.random.randint(6000)
762+
lgth = np.random.randint(1000, 3000)
763763
signals = signals[rand_idx:rand_idx+lgth]
764764
# ttd[-100:] = 1
765765
signals = np.vstack([signals]*8)

plasma/models/mpi_runner.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import datetime
3333
import numpy as np
3434

35+
from tensorflow.python.client import timeline
3536
from functools import partial
3637
from copy import deepcopy
3738
# import socket
@@ -277,7 +278,17 @@ def compile(self, optimizer, clipnorm, loss='mse'):
277278
else:
278279
print("Optimizer not implemented yet")
279280
exit(1)
280-
self.model.compile(optimizer=optimizer_class, loss=loss)
281+
282+
# Timeline profiler
283+
if (self.conf is not None
284+
and conf['training']['timeline_prof']):
285+
self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
286+
self.run_metadata= tf.RunMetadata()
287+
self.model.compile(optimizer=optimizer_class, loss=loss,
288+
options=self.run_options, run_metadata=self.run_metadata)
289+
else:
290+
self.model.compile(optimizer=optimizer_class, loss=loss)
291+
281292
self.ensure_equal_weights()
282293

283294
def ensure_equal_weights(self):
@@ -516,6 +527,15 @@ def train_epoch(self):
516527
loss_averager = Averager()
517528
t_start = time.time()
518529

530+
timeline_prof = False
531+
if (self.conf is not None
532+
and conf['training']['timeline_prof']):
533+
timeline_prof = True
534+
step_limit = 0
535+
if (self.conf is not None
536+
and conf['training']['step_limit'] > 0):
537+
step_limit = conf['training']['step_limit']
538+
519539
batch_iterator_func = self.batch_iterator_func
520540
num_total = 1
521541
ave_loss = -1
@@ -526,6 +546,9 @@ def train_epoch(self):
526546

527547
while ((self.num_so_far - self.epoch * num_total) < num_total
528548
or step < self.num_batches_minimum):
549+
if step_limit > 0 and step > step_limit:
550+
print('reached step limit')
551+
break
529552
try:
530553
(batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
531554
num_total, is_warmup_period) = next(batch_iterator_func)
@@ -592,6 +615,15 @@ def train_epoch(self):
592615
+ 'walltime: {:.4f} | '.format(
593616
time.time() - self.start_time))
594617
g.write_unique(write_str + write_str_0)
618+
619+
if timeline_prof:
620+
# dump profile
621+
tl = timeline.Timeline(self.run_metadata.step_stats)
622+
ctf = tl.generate_chrome_trace_format()
623+
# dump file per iteration
624+
with open('timeline_%s.json' % step, 'w') as f:
625+
f.write(ctf)
626+
595627
step += 1
596628
else:
597629
g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
@@ -925,6 +957,9 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
925957
if g.task_index == 0:
926958
specific_builder.save_model_weights(train_model, int(round(e)))
927959

960+
if conf['training']['no_validation']:
961+
break
962+
928963
epoch_logs = {}
929964
g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
930965
e, num_epochs))

0 commit comments

Comments
 (0)