Skip to content

Commit ad5de2f

Browse files
author
Julian Kates-Harbeck
committed
Added support for a fully convolutional PyTorch model, using both temporal and spatial convolutions. TODO: add multi-GPU support and make the model configurable via conf; parameters are currently hardcoded.
1 parent e97f251 commit ad5de2f

File tree

3 files changed

+120
-80
lines changed

3 files changed

+120
-80
lines changed

plasma/models/builder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,8 @@ def extract_id_and_epoch_from_filename(self,filename):
273273
def get_all_saved_files(self):
274274
self.ensure_save_directory()
275275
unique_id = self.get_unique_id()
276-
filenames = os.listdir(self.conf['paths']['model_save_path'])
276+
path = self.conf['paths']['model_save_path']
277+
filenames = [name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))]
277278
epochs = []
278279
for file in filenames:
279280
curr_id,epoch = self.extract_id_and_epoch_from_filename(file)

plasma/models/loader.py

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,11 +123,13 @@ def shift_buffer(self,buff,length):
123123
buff[:,:-length,:] = buff[:,length:,:]
124124

125125

126-
def resize_buffer(self,buff,new_length):
126+
def resize_buffer(self,buff,new_length,dtype=None):
127+
if dtype == None:
128+
dtype = self.conf['data']['floatx']
127129
old_length = buff.shape[1]
128130
batch_size = buff.shape[0]
129131
num_signals = buff.shape[2]
130-
new_buff = np.empty((batch_size,new_length,num_signals),dtype=self.conf['data']['floatx'])
132+
new_buff = np.zeros((batch_size,new_length,num_signals),dtype=dtype)
131133
new_buff[:,:old_length,:] = buff
132134
#print("Resizing buffer to new length {}".format(new_length))
133135
return new_buff
@@ -149,18 +151,20 @@ def inference_batch_generator_full_shot(self,shot_list):
149151
- reset_states_now: boolean flag indicating when to reset state during stateful RNN training
150152
- num_so_far,num_total: number of samples generated so far and the total dataset size as per shot_list
151153
"""
152-
batch_size = self.conf['training']['pred_batch_size']
154+
batch_size = self.conf['model']['pred_batch_size']
153155
sig,res = self.get_signal_result_from_shot(shot_list.shots[0])
154-
Xbuff = np.empty((batch_size,) + sig.shape,dtype=self.conf['data']['floatx'])
155-
Ybuff = np.empty((batch_size,) + res.shape,dtype=self.conf['data']['floatx'])
156-
Maskbuff = np.empty((batch_size,) + res.shape,dtype=self.conf['data']['floatx'])
157-
disr = np.empty(batch_size,dtype=bool)
156+
Xbuff = np.zeros((batch_size,) + sig.shape,dtype=self.conf['data']['floatx'])
157+
Ybuff = np.zeros((batch_size,) + res.shape,dtype=self.conf['data']['floatx'])
158+
Maskbuff = np.zeros((batch_size,) + res.shape,dtype=self.conf['data']['floatx'])
159+
disr = np.zeros(batch_size,dtype=bool)
160+
lengths = np.zeros(batch_size,dtype=int)
158161
# epoch = 0
159162
num_total = len(shot_list)
160163
num_so_far = 0
161164
returned = False
162165
num_steps = 0
163166
batch_idx = 0
167+
np.seterr(all='raise')
164168
# warmup_steps = self.conf['training']['batch_generator_warmup_steps']
165169
# is_warmup_period = num_steps < warmup_steps
166170
# is_first_fill = num_steps < batch_size
@@ -178,15 +182,29 @@ def inference_batch_generator_full_shot(self,shot_list):
178182
Maskbuff = self.resize_buffer(Maskbuff,sig_len)
179183
Maskbuff[:,old_len:,:] = 0.0
180184

181-
Xbuff[batch_idx,:,:] = sig
182-
Ybuff[batch_idx,:,:] = res
185+
Xbuff[batch_idx,:,:] = 0.0
186+
Ybuff[batch_idx,:,:] = 0.0
187+
Maskbuff[batch_idx,:,:] = 0.0
188+
Xbuff[batch_idx,:sig_len,:] = sig
189+
Ybuff[batch_idx,:sig_len,:] = res
183190
Maskbuff[batch_idx,:sig_len,:] = 1.0
184-
Maskbuff[batch_idx,sig_len:,:] = 0.0
185191
disr[batch_idx] = shot.is_disruptive_shot()
192+
lengths[batch_idx] = res.shape[0]
186193
batch_idx += 1
187194
if batch_idx == batch_size:
188195
num_so_far += batch_size
189-
yield 1.0*Xbuff,1.0*Ybuff,1.0*Maskbuff,disr & True,num_so_far,num_total
196+
x1 = 1.0*Xbuff
197+
try:
198+
x2 = 1.0*Ybuff
199+
except:
200+
print(Ybuff[:100])
201+
print(Ybuff[-100:])
202+
print(Ybuff)
203+
x3 = 1.0*Maskbuff
204+
x4 = disr & True
205+
x5 = 1*lengths
206+
207+
yield x1,x2,x3,x4,x5,num_so_far,num_total
190208
batch_idx = 0
191209

192210

@@ -236,10 +254,13 @@ def training_batch_generator_full_shot_partial_reset(self,shot_list):
236254
Maskbuff = self.resize_buffer(Maskbuff,sig_len)
237255
Maskbuff[:,old_len:,:] = 0.0
238256

239-
Xbuff[batch_idx,:,:] = sig
240-
Ybuff[batch_idx,:,:] = res
257+
Xbuff[batch_idx,:,:] = 0.0
258+
Ybuff[batch_idx,:,:] = 0.0
259+
Maskbuff[batch_idx,:,:] = 0.0
260+
261+
Xbuff[batch_idx,:sig_len,:] = sig
262+
Ybuff[batch_idx,:sig_len,:] = res
241263
Maskbuff[batch_idx,:sig_len,:] = 1.0
242-
Maskbuff[batch_idx,sig_len:,:] = 0.0
243264
batch_idx += 1
244265
if batch_idx == batch_size:
245266
num_so_far += batch_size
@@ -735,8 +756,10 @@ def __init__(self,generator):
735756

736757
def fill_batch_queue(self):
737758
print("Starting process to fetch data")
759+
count = 0
738760
while True:
739761
self.queue.put(next(self.generator),True)
762+
count += 1
740763

741764
def __next__(self):
742765
return self.queue.get(True)

plasma/models/torch_runner.py

Lines changed: 81 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(self,n_scalars,n_profiles,profile_size,layer_sizes_spatial,
4545
num_channels_tcn,kernel_size_temporal,dropout=0.1):
4646
super(FTCN, self).__init__()
4747
self.lin = InputBlock(n_scalars, n_profiles,profile_size, layer_sizes_spatial, kernel_size_spatial, linear_size, dropout)
48-
self.input_layer = TimeDistributed(lin,batch_first=True)
48+
self.input_layer = TimeDistributed(self.lin,batch_first=True)
4949
self.tcn = TCN(linear_size, output_size, num_channels_tcn , kernel_size_temporal, dropout)
5050
self.model = nn.Sequential(self.input_layer,self.tcn)
5151

@@ -102,8 +102,8 @@ def forward(self, x):
102102
if self.n_scalars == 0:
103103
x_profiles = x
104104
else:
105-
x_scalars = x[:,:n_scalars]
106-
x_profiles = x[:,n_scalars:]
105+
x_scalars = x[:,:self.n_scalars]
106+
x_profiles = x[:,self.n_scalars:]
107107
x_profiles = x_profiles.contiguous().view(x.size(0),self.n_profiles,self.profile_size)
108108
profile_features = self.net(x_profiles).view(x.size(0),-1)
109109
if self.n_scalars == 0:
@@ -271,18 +271,18 @@ def build_torch_model(conf):
271271
# dim = 10
272272

273273
# lin = nn.Linear(input_size,intermediate_dim)
274-
n_scalars, n_profile, profile_size = get_signal_dimensions(conf)
274+
n_scalars, n_profiles, profile_size = get_signal_dimensions(conf)
275275
dim = n_scalars+n_profiles*profile_size
276276
input_size = dim
277277
output_size = 1
278278
# intermediate_dim = 15
279279

280-
layer_sizes_spatial = [40,20,20]
280+
layer_sizes_spatial = [6,3,3]#[40,20,20]
281281
kernel_size_spatial = 3
282-
linear_size = 10
282+
linear_size = 5
283283

284-
num_channels_tcn = [3]*5
285-
kernel_size_temporal = 3
284+
num_channels_tcn = [10,5,3,3]#[3]*5
285+
kernel_size_temporal = 3 #3
286286
model = FTCN(n_scalars,n_profiles,profile_size,layer_sizes_spatial,
287287
kernel_size_spatial,linear_size,output_size,num_channels_tcn,
288288
kernel_size_temporal,dropout)
@@ -300,16 +300,68 @@ def get_signal_dimensions(conf):
300300
num_channels = sig.num_channels
301301
if num_channels > 1:
302302
profile_size = num_channels
303-
num_1D += 1
303+
n_profiles += 1
304304
is_1D_region = True
305305
else:
306306
assert(not is_1D_region), "make sure all use_signals are ordered such that 1D signals come last!"
307307
assert(num_channels == 1)
308-
num_0D += 1
308+
n_scalars += 1
309309
is_1D_region = False
310310
return n_scalars,n_profiles,profile_size
311311

312-
def train_epoch(model,data_gen,loss_fn):
312+
def apply_model_to_np(model,x):
313+
# return model(Variable(torch.from_numpy(x).float()).unsqueeze(0)).squeeze(0).data.numpy()
314+
return model(Variable(torch.from_numpy(x).float())).data.numpy()
315+
316+
317+
318+
def make_predictions(conf,shot_list,loader,custom_path=None):
319+
generator = loader.inference_batch_generator_full_shot(shot_list)
320+
inference_model = build_torch_model(conf)
321+
322+
if custom_path == None:
323+
model_path = get_model_path(conf)
324+
else:
325+
model_path = custom_path
326+
inference_model.load_state_dict(torch.load(model_path))
327+
#shot_list = shot_list.random_sublist(10)
328+
329+
y_prime = []
330+
y_gold = []
331+
disruptive = []
332+
num_shots = len(shot_list)
333+
334+
pbar = Progbar(num_shots)
335+
while True:
336+
x,y,mask,disr,lengths,num_so_far,num_total = next(generator)
337+
#x, y, mask = Variable(torch.from_numpy(x_).float()), Variable(torch.from_numpy(y_).float()),Variable(torch.from_numpy(mask_).byte())
338+
output = apply_model_to_np(inference_model,x)
339+
for batch_idx in range(x.shape[0]):
340+
curr_length = lengths[batch_idx]
341+
y_prime += [output[batch_idx,:curr_length,0]]
342+
y_gold += [y[batch_idx,:curr_length,0]]
343+
disruptive += [disr[batch_idx]]
344+
pbar.add(1.0)
345+
if len(disruptive) >= num_shots:
346+
y_prime = y_prime[:num_shots]
347+
y_gold = y_gold[:num_shots]
348+
disruptive = disruptive[:num_shots]
349+
break
350+
return y_prime,y_gold,disruptive
351+
352+
def make_predictions_and_evaluate_gpu(conf,shot_list,loader,custom_path = None):
353+
y_prime,y_gold,disruptive = make_predictions(conf,shot_list,loader,custom_path)
354+
analyzer = PerformanceAnalyzer(conf=conf)
355+
roc_area = analyzer.get_roc_area(y_prime,y_gold,disruptive)
356+
loss = get_loss_from_list(y_prime,y_gold,conf['data']['target'])
357+
return y_prime,y_gold,disruptive,roc_area,loss
358+
359+
360+
def get_model_path(conf):
361+
return conf['paths']['model_save_path'] + 'torch/' + model_filename #save_prepath + model_filename
362+
363+
364+
def train_epoch(model,data_gen,optimizer,loss_fn):
313365
loss = 0
314366
total_loss = 0
315367
num_so_far = 0
@@ -335,17 +387,19 @@ def train_epoch(model,data_gen,loss_fn):
335387
loss.backward()
336388
optimizer.step()
337389
step += 1
390+
print("[{}] [{}/{}] loss: {:.3f}, ave_loss: {:.3f}".format(step,num_so_far-num_so_far_start,num_total,loss.data[0],total_loss/step))
338391
if num_so_far-num_so_far_start >= num_total:
339392
break
340-
x_,y_,mask_,num_so_far_start,num_total = next(data_gen)
341-
return step,loss,total_loss,num_so_far,1.0*num_so_far/num_total
393+
x_,y_,mask_,num_so_far,num_total = next(data_gen)
394+
return step,loss.data[0],total_loss,num_so_far,1.0*num_so_far/num_total
342395

343396

344397
def train(conf,shot_list_train,shot_list_validate,loader):
345398

346399
np.random.seed(1)
347400

348-
data_gen = ProcessGenerator(partial(loader.training_batch_generator_full_shot_partial_reset,shot_list=shot_list_train))
401+
#data_gen = ProcessGenerator(partial(loader.training_batch_generator_full_shot_partial_reset,shot_list=shot_list_train)())
402+
data_gen = partial(loader.training_batch_generator_full_shot_partial_reset,shot_list=shot_list_train)()
349403

350404
print('validate: {} shots, {} disruptive'.format(len(shot_list_validate),shot_list_validate.num_disruptive()))
351405
print('training: {} shots, {} disruptive'.format(len(shot_list_train),shot_list_train.num_disruptive()))
@@ -358,6 +412,7 @@ def train(conf,shot_list_train,shot_list_validate,loader):
358412
# e = specific_builder.load_model_weights(train_model)
359413

360414
num_epochs = conf['training']['num_epochs']
415+
patience = conf['callbacks']['patience']
361416
lr_decay = conf['model']['lr_decay']
362417
batch_size = conf['training']['batch_size']
363418
lr = conf['model']['lr']
@@ -385,23 +440,25 @@ def train(conf,shot_list_train,shot_list_validate,loader):
385440
else:
386441
best_so_far = np.inf
387442
cmp_fn = min
388-
optimizer = opt.Adam(model.parameters(),lr = lr)
389-
model.train()
443+
optimizer = opt.Adam(train_model.parameters(),lr = lr)
444+
scheduler = opt.lr_scheduler.ExponentialLR(optimizer,lr_decay)
445+
train_model.train()
390446
not_updated = 0
391447
total_loss = 0
392448
count = 0
393-
loss_fn = nn.MSELoss(size_average=False)
394-
model_path = conf['paths']['model_save_path'] + model_filename #save_prepath + model_filename
395-
makedirs_process_safe(conf['paths']['model_save_path'])
449+
loss_fn = nn.MSELoss(size_average=True)
450+
model_path = get_model_path(conf)
451+
makedirs_process_safe(os.path.dirname(model_path))
396452
while e < num_epochs-1:
397-
print_unique('\nEpoch {}/{}'.format(e,num_epochs))
398-
(step,ave_loss,curr_loss,num_so_far,effective_epochs) = train_epoch(model,data_gen,loss_fn)
453+
scheduler.step()
454+
print('\nEpoch {}/{}'.format(e,num_epochs))
455+
(step,ave_loss,curr_loss,num_so_far,effective_epochs) = train_epoch(train_model,data_gen,optimizer,loss_fn)
399456
e = effective_epochs
400457
loader.verbose=False #True during the first iteration
401458
# if task_index == 0:
402459
# specific_builder.save_model_weights(train_model,int(round(e)))
403-
model.save_state_dict(model_path)
404-
_,_,_,roc_area,loss = mpi_make_predictions_and_evaluate(conf,shot_list_validate,loader)
460+
torch.save(train_model.state_dict(),model_path)
461+
_,_,_,roc_area,loss = make_predictions_and_evaluate_gpu(conf,shot_list_validate,loader)
405462

406463
best_so_far = cmp_fn(roc_area,best_so_far)
407464

@@ -411,54 +468,13 @@ def train(conf,shot_list_train,shot_list_validate,loader):
411468
print('Validation Loss: {:.3e}'.format(loss))
412469
print('Validation ROC: {:.4f}'.format(roc_area))
413470

414-
if best_so_far != epoch_logs[conf['callbacks']['monitor']]: #only save model weights if quantity we are tracking is improving
471+
if best_so_far != roc_area: #only save model weights if quantity we are tracking is improving
415472
print("No improvement, still saving model")
416473
not_updated += 1
417474
else:
418475
print("Saving model")
419-
model.save_state_dict(model_path)
420476
# specific_builder.delete_model_weights(train_model,int(round(e)))
421477
if not_updated > patience:
422478
print("Stopping training due to early stopping")
423479
break
424480

425-
def make_predictions(conf,shot_list,loader,custom_path=None):
426-
generator = loader.inference_batch_generator_full_shot(shot_list)
427-
inference_model = build_torch_model(conf)
428-
429-
if custom_path == None:
430-
model_path = conf['paths']['model_save_path'] + model_filename#save_prepath + model_filename
431-
else:
432-
model_path = custom_path
433-
inference_model.load_state_dict(model_path)
434-
#shot_list = shot_list.random_sublist(10)
435-
436-
y_prime = []
437-
y_gold = []
438-
disruptive = []
439-
num_shots = len(shot_list)
440-
441-
pbar = Progbar(num_shots)
442-
while True:
443-
x_,y_,mask_,disr_,num_so_far,num_total = next(generator)
444-
x, y, mask = Variable(torch.from_numpy(x_).float()), Variable(torch.from_numpy(y_).float()),Variable(torch.from_numpy(mask_).byte())
445-
output = model(x)
446-
for batch_idx in range(x.shape[0])
447-
y_prime[batch_idx] += [output[batch_idx,:,:]]
448-
y_gold += [y_[batch_idx,:,:]]
449-
disruptive += [disr[batch_idx]]
450-
pbar.add(1.0)
451-
if len(disruptive) >= num_shots:
452-
y_prime = y_prime[:num_shots]
453-
y_gold = y_gold[:num_shots]
454-
disruptive = disruptive[:num_shots]
455-
break
456-
return y_prime,y_gold,disruptive
457-
458-
def make_predictions_and_evaluate_gpu(conf,shot_list,loader,custom_path = None):
459-
y_prime,y_gold,disruptive = make_predictions(conf,shot_list,loader,custom_path)
460-
analyzer = PerformanceAnalyzer(conf=conf)
461-
roc_area = analyzer.get_roc_area(y_prime,y_gold,disruptive)
462-
loss = get_loss_from_list(y_prime,y_gold,conf['data']['target'])
463-
return y_prime,y_gold,disruptive,roc_area,loss
464-

0 commit comments

Comments (0)