PPPLDeepLearning
diff --git a/‎envs/adroit.cmd‎
Lines changed: 11 additions & 0 deletions b/‎envs/adroit.cmd‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎envs/requirements-cpu.yaml‎
Lines changed: 2 additions & 1 deletion b/‎envs/requirements-cpu.yaml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎envs/requirements-linux-64-gpu.yaml‎
Lines changed: 1 addition & 1 deletion b/‎envs/requirements-linux-64-gpu.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎envs/requirements-traverse.yaml‎
Lines changed: 1 addition & 1 deletion b/‎envs/requirements-traverse.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎envs/tigergpu.cmd‎
Lines changed: 4 additions & 3 deletions b/‎envs/tigergpu.cmd‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎envs/traverse.cmd‎
Lines changed: 5 additions & 3 deletions b/‎envs/traverse.cmd‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎plasma/conf_parser.py‎
Lines changed: 1 addition & 0 deletions b/‎plasma/conf_parser.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎plasma/global_vars.py‎
Lines changed: 1 addition & 0 deletions b/‎plasma/global_vars.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎plasma/models/builder.py‎
Lines changed: 25 additions & 22 deletions b/‎plasma/models/builder.py‎
Lines changed: 25 additions & 22 deletions
diff --git a/‎plasma/models/custom_loss.py‎
Lines changed: 4 additions & 4 deletions b/‎plasma/models/custom_loss.py‎
Lines changed: 4 additions & 4 deletions
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+module load anaconda3
+# must activate conda env before module loads
+conda activate frnn
+export OMPI_MCA_btl="tcp,self,vader"
+
+module load cudatoolkit/10.2
+module load cudnn/cuda-10.1/7.6.3
+module load openmpi/gcc/3.1.5/64
+module load hdf5/gcc/openmpi-1.10.2/1.10.0 # like TigerGPU, this is older than version on Traverse, hdf5/gcc/openmpi-3.1.4/1.10.5
@@ -4,4 +4,5 @@ flake8
 h5py
 pyparsing
 pyyaml
-tensorflow-gpu>=1.3,<2.0.0
+pytorch>1.3
+tensorflow>=1.3,<2.0.0
@@ -12,9 +12,9 @@ dependencies:
   - h5py
   - pyparsing
   - pyyaml
+  - pytorch>1.3
   - tensorflow-gpu>=1.3,<2.0.0
   - pip:
-      - keras>=2.0.5,<2.3.0
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove
 
@@ -14,9 +14,9 @@ dependencies:
   - h5py
   - pyparsing
   - pyyaml
+  - pytorch>1.3
   - tensorflow-gpu>=1.3,<2.0.0
   - pip:
-      - keras>=2.0.5,<2.3.0
       - pathos
       - matplotlib>=2.0.2
       - hyperopt  # TODO(KGF): remove
 
@@ -1,11 +1,12 @@
 #!/usr/bin/env bash
 
-module load anaconda3
-# must activate conda env before module loads
+module load anaconda3  # KGF: issue with my shell--- makes conda CLI return nothing
+# must activate the conda env before the module loads so that the MPI, etc.
+# modules take precedence in setting environment variables, libraries, etc.
 conda activate frnn
 export OMPI_MCA_btl="tcp,self,vader"  #sm"
 module load cudatoolkit
 module load cudnn
 
 module load openmpi/gcc/3.1.3/64
-module load hdf5/gcc/openmpi-1.10.2/1.10.0
+module load hdf5/gcc/openmpi-1.10.2/1.10.0 # like Adroit, this is older than version on Traverse, hdf5/gcc/openmpi-3.1.4/1.10.5
@@ -1,10 +1,12 @@
 #!/usr/bin/env bash
 
-module load anaconda3
-# must activate conda env before module loads
+module load anaconda3  # KGF: issue with my shell--- makes conda CLI return nothing
+# must activate the conda env before the module loads so that the MPI, etc.
+# modules take precedence in setting environment variables, libraries, etc.
 conda activate frnn
 export OMPI_MCA_btl="tcp,self,vader"
-
+# KGF: DO NOT run "module load anaconda3" again here--- it reloads base
+# module purge
 module load cudatoolkit
 module load cudnn/cuda-10.1/7.6.1
 module load openmpi/gcc/3.1.4/64
 
@@ -55,6 +55,7 @@ def parameters(input_file):
             h = myhash_signals(sig.all_signals_max_tol.values())*2
         else:
             h = myhash_signals(sig.all_signals.values())
+
         params['paths']['global_normalizer_path'] = (
             output_path
             + '/normalization/normalization_signal_group_{}.npz'.format(h))
 
@@ -7,6 +7,7 @@
 num_workers = 1
 NUM_GPUS = 0
 MY_GPU = 0
+# TODO(KGF): remove this reference (and all others?) to the Keras backend
 backend = ''
 tf_ver = None
 
 
@@ -1,23 +1,21 @@
 from __future__ import division, print_function
 import plasma.global_vars as g
 # KGF: the first time Keras is ever imported via mpi_learn.py -> mpi_runner.py
-import keras.backend as K
+# -> builder.py (here)
+import tensorflow as tf
 # KGF: see below synchronization--- output is launched here
-from keras.models import Model  # , Sequential
+#
 # KGF: (was used only in hyper_build_model())
-from keras.layers import Input
-from keras.layers.core import (
+from tensorflow.keras.layers import (
+    Input,
     Dense, Activation, Dropout, Lambda,
     Reshape, Flatten, Permute,  # RepeatVector
+    LSTM, CuDNNLSTM, SimpleRNN, BatchNormalization,
+    Convolution1D, MaxPooling1D, TimeDistributed,
+    Concatenate
     )
-from keras.layers import LSTM, CuDNNLSTM, SimpleRNN, BatchNormalization
-from keras.layers.convolutional import Convolution1D
-from keras.layers.pooling import MaxPooling1D
-# from keras.utils.data_utils import get_file
-from keras.layers.wrappers import TimeDistributed
-from keras.layers.merge import Concatenate
-from keras.callbacks import Callback
-from keras.regularizers import l2  # l1, l1_l2
+from tensorflow.keras.callbacks import Callback
+from tensorflow.keras.regularizers import l2  # l1, l1_l2
 
 import re
 import os
@@ -275,7 +273,7 @@ def slicer_output_shape(input_shape, indices):
                 bias_regularizer=l2(dense_regularization),
                 activity_regularizer=l2(dense_regularization))(pre_rnn)
 
-        pre_rnn_model = Model(inputs=pre_rnn_input, outputs=pre_rnn)
+        pre_rnn_model = tf.keras.Model(inputs=pre_rnn_input, outputs=pre_rnn)
         # TODO(KGF): uncomment following lines to get summary of pre-RNN model
         # from mpi4py import MPI
         # comm = MPI.COMM_WORLD
@@ -344,16 +342,17 @@ def slicer_output_shape(input_shape, indices):
                 # x_out = TimeDistributed(Dense(100,activation='tanh')) (x_in)
                 x_out = TimeDistributed(
                     Dense(1, activation=output_activation))(x_in)
-        model = Model(inputs=x_input, outputs=x_out)
+        model = tf.keras.Model(inputs=x_input, outputs=x_out)
         # bug with tensorflow/Keras
         # TODO(KGF): what is this bug? this is the only direct "tensorflow"
         # import outside of mpi_runner.py and runner.py
-        if (conf['model']['backend'] == 'tf'
-                or conf['model']['backend'] == 'tensorflow'):
-            first_time = "tensorflow" not in sys.modules
-            import tensorflow as tf
-            if first_time:
-                K.get_session().run(tf.global_variables_initializer())
+        # if (conf['model']['backend'] == 'tf'
+        #         or conf['model']['backend'] == 'tensorflow'):
+        #     first_time = "tensorflow" not in sys.modules
+        #     import tensorflow as tf
+        #     if first_time:
+        #         tf.compat.v1.keras.backend.get_session().run(
+        #             tf.global_variables_initializer())
         model.reset_states()
         return model
 
@@ -362,6 +361,8 @@ def build_train_test_models(self):
 
     def save_model_weights(self, model, epoch):
         save_path = self.get_save_path(epoch)
+        full_model_save_path = self.get_save_path(epoch, ext='hdf5')
+        model.save(full_model_save_path, overwrite=True)
         model.save_weights(save_path, overwrite=True)
         # try:
         if _has_onnx:
@@ -425,6 +426,8 @@ def load_model_weights(self, model, custom_path=None):
     def extract_id_and_epoch_from_filename(self, filename):
         regex = re.compile(r'-?\d+')
         numbers = [int(x) for x in regex.findall(filename)]
+        # TODO: should ignore any files that don't match our naming convention
+        # in this directory, especially since we are now writing full .hdf5 too
         if filename[-3:] == '.h5':
             assert len(numbers) == 3  # id, epoch number, and .h5 extension
             assert numbers[2] == 5  # .h5 extension
@@ -438,8 +441,8 @@ def get_all_saved_files(self):
         filenames = [name for name in os.listdir(path)
                      if os.path.isfile(os.path.join(path, name))]
         epochs = []
-        for file in filenames:
-            curr_id, epoch = self.extract_id_and_epoch_from_filename(file)
+        for fname in filenames:
+            curr_id, epoch = self.extract_id_and_epoch_from_filename(fname)
             if curr_id == unique_id:
                 epochs.append(epoch)
         return epochs
 
@@ -1,10 +1,10 @@
 import numpy as np
 
-# from keras import objectives
-from keras import backend as K
-from keras.losses import squared_hinge
+import tensorflow as tf
+import tensorflow.keras.backend as K
+from tensorflow.keras.losses import squared_hinge
 
-_EPSILON = K.epsilon()
+_EPSILON = tf.keras.backend.epsilon()
 
 
 def _loss_tensor(y_true, y_pred):