Skip to content

Commit 2eff354

Browse files
committed
Apply Janghaeng's patchfile manually
- TF timeline profile data can be done by setting timeline_prof in [training] section in conf.yml - Changed conf.yml to run for a short amount of time for functionality test on Intel GPUs - Decoupled output data directory from input data directory (KGF: modified to use status quo directory hierarchy, but should add separate conf.yaml option for input and output directories, not rely on username subdir)
1 parent 94b8775 commit 2eff354

File tree

5 files changed

+53
-6
lines changed

5 files changed

+53
-6
lines changed

examples/conf.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,9 @@ training:
135135
use_process_generator: False
136136
num_batches_minimum: 20 # minimum number of batches per epoch
137137
ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
138+
timeline_prof: False
139+
step_limit: 50
140+
no_validation: True
138141
callbacks:
139142
list: ['earlystop']
140143
metrics: ['val_loss','val_roc','train_loss']

examples/mpi_learn.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@
111111
shot_list_test=shot_list_test)
112112
g.flush_all_inorder()
113113

114+
if conf['training']['no_validation'] and conf['training']['step_limit'] > 0:
115+
sys.stdout.flush()
116+
g.print_unique('SHORT TRAINING ONLY. conf.yaml (step_limit) finished without VALIDATION')
117+
quit()
118+
114119
#####################################################
115120
# TESTING #
116121
#####################################################

plasma/conf_parser.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
# from data.signals import (
77
# all_signals, fully_defined_signals_1D,
88
# jet, d3d) # nstx
9+
import os
910
import getpass
1011
import yaml
1112

@@ -23,10 +24,14 @@ def parameters(input_file):
2324
with open(input_file, 'r') as yaml_file:
2425
params = yaml.load(yaml_file, Loader=yaml.SafeLoader)
2526
params['user_name'] = getpass.getuser()
26-
output_path = params['fs_path'] + "/" + params['user_name']
27-
base_path = output_path
27+
base_path = params['fs_path']
28+
output_path = os.path.join(base_path, params['user_name'])
29+
# TODO(KGF): allow for completely independent save/output_path vs. base_path
30+
# configured in conf.yaml. don't assume username subdirectory, or pwd
31+
# save_path = os.environ.get("PWD")
2832

2933
params['paths']['base_path'] = base_path
34+
params['paths']['output_path'] = output_path
3035
if isinstance(params['paths']['signal_prepath'], list):
3136
g.print_unique('Reading from multiple data folders!')
3237
params['paths']['signal_prepath'] = [
@@ -36,7 +41,6 @@ def parameters(input_file):
3641
base_path + params['paths']['signal_prepath'])
3742
params['paths']['shot_list_dir'] = (
3843
base_path + params['paths']['shot_list_dir'])
39-
params['paths']['output_path'] = output_path
4044
# See notes in data/signals.py for details on signal tols relative to
4145
# t_disrupt. The following 2x dataset definitions permit progressively
4246
# worse signal quality when preprocessing the shots and omitting some

plasma/models/loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -758,8 +758,8 @@ def load_as_X_y(self, shot, prediction_mode=False):
758758

759759
def get_mock_data(self):
760760
signals = np.linspace(0, 4*np.pi, 10000)
761-
rand_idx = np.randint(6000)
762-
lgth = np.randint(1000, 3000)
761+
rand_idx = np.random.randint(6000)
762+
lgth = np.random.randint(1000, 3000)
763763
signals = signals[rand_idx:rand_idx+lgth]
764764
# ttd[-100:] = 1
765765
signals = np.vstack([signals]*8)

plasma/models/mpi_runner.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import datetime
3333
import numpy as np
3434

35+
from tensorflow.python.client import timeline
3536
from functools import partial
3637
from copy import deepcopy
3738
# import socket
@@ -277,7 +278,17 @@ def compile(self, optimizer, clipnorm, loss='mse'):
277278
else:
278279
print("Optimizer not implemented yet")
279280
exit(1)
280-
self.model.compile(optimizer=optimizer_class, loss=loss)
281+
282+
# Timeline profiler
283+
if (self.conf is not None
284+
and conf['training']['timeline_prof']):
285+
self.run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
286+
self.run_metadata= tf.RunMetadata()
287+
self.model.compile(optimizer=optimizer_class, loss=loss,
288+
options=self.run_options, run_metadata=self.run_metadata)
289+
else:
290+
self.model.compile(optimizer=optimizer_class, loss=loss)
291+
281292
self.ensure_equal_weights()
282293

283294
def ensure_equal_weights(self):
@@ -516,6 +527,15 @@ def train_epoch(self):
516527
loss_averager = Averager()
517528
t_start = time.time()
518529

530+
timeline_prof = False
531+
if (self.conf is not None
532+
and conf['training']['timeline_prof']):
533+
timeline_prof = True
534+
step_limit = 0
535+
if (self.conf is not None
536+
and conf['training']['step_limit'] > 0):
537+
step_limit = conf['training']['step_limit']
538+
519539
batch_iterator_func = self.batch_iterator_func
520540
num_total = 1
521541
ave_loss = -1
@@ -526,6 +546,9 @@ def train_epoch(self):
526546

527547
while ((self.num_so_far - self.epoch * num_total) < num_total
528548
or step < self.num_batches_minimum):
549+
if step_limit > 0 and step > step_limit:
550+
print('reached step limit')
551+
break
529552
try:
530553
(batch_xs, batch_ys, batches_to_reset, num_so_far_curr,
531554
num_total, is_warmup_period) = next(batch_iterator_func)
@@ -592,6 +615,15 @@ def train_epoch(self):
592615
+ 'walltime: {:.4f} | '.format(
593616
time.time() - self.start_time))
594617
g.write_unique(write_str + write_str_0)
618+
619+
if timeline_prof:
620+
# dump profile
621+
tl = timeline.Timeline(self.run_metadata.step_stats)
622+
ctf = tl.generate_chrome_trace_format()
623+
# dump file per iteration
624+
with open('timeline_%s.json' % step, 'w') as f:
625+
f.write(ctf)
626+
595627
step += 1
596628
else:
597629
g.write_unique('\r[{}] warmup phase, num so far: {}'.format(
@@ -925,6 +957,9 @@ def mpi_train(conf, shot_list_train, shot_list_validate, loader,
925957
if g.task_index == 0:
926958
specific_builder.save_model_weights(train_model, int(round(e)))
927959

960+
if conf['training']['no_validation']:
961+
break
962+
928963
epoch_logs = {}
929964
g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
930965
e, num_epochs))

0 commit comments

Comments
 (0)