added option for a (possibly fractional) repeat factor when applying bleed in

Julian Kates-Harbeck · Julian Kates-Harbeck · commit dae8bf0c58c6 · 2018-01-24T02:48:00.000-05:00
diff --git a/examples/conf.yaml b/examples/conf.yaml
@@ -10,15 +10,16 @@ paths:
     signal_prepath: '/signal_data/' #/signal_data/jet/
     shot_list_dir: '/shot_lists/'
     tensorboard_save_path: '/Graph/'
-    data: jet_data #'d3d_to_jet_data' #'d3d_to_jet_data' # 'jet_to_d3d_data' #jet_data
+    data: jet_to_d3d_data #'d3d_to_jet_data' #'d3d_to_jet_data' # 'jet_to_d3d_data' #jet_data
     specific_signals: [] #['q95','li','ip','betan','energy','lm','pradcore','pradedge','pradtot','pin','torquein','tmamp1','tmamp2','tmfreq1','tmfreq2','pechin','energydt','ipdirect','etemp_profile','edens_profile'] #if left empty will use all valid signals defined on a machine. Only use if need a custom set
     executable: "mpi_learn.py"
     shallow_executable: "learn.py"
 
 data:
-    bleed_in: 0 #how many shots from the test sit to use in training?
+    bleed_in: 5 #how many shots from the test set to use in training?
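+    bleed_in_repeat_fac: 10 #only used if bleed_in_equalize_sets is False and this is > 1: bleed-in shots are sampled round(fac * number of bleed-in shots) times and each sample is added to both the training and validation sets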
     bleed_in_remove_from_test: True
-    bleed_in_equalize_sets: True
+    bleed_in_equalize_sets: False
     signal_to_augment: None #'plasma current' #or None
     augmentation_mode: 'none'
     augment_during_training: False
@@ -52,10 +53,10 @@ data:
     floatx: 'float32'
 
 model:
-    shallow: False
+    shallow: True
     shallow_model: 
         num_samples: 1000000 #1000000 #the number of samples to use for training
-        type: "mlp" #"xgboost" #"xgboost" #"random_forest" "xgboost"
+        type: "xgboost" #"xgboost" #"xgboost" #"random_forest" "xgboost"
         n_estimators: 100 #for random forest
         max_depth: 3 #for random forest and xgboost (def = 3)
         C: 1.0 #for svm
@@ -89,8 +90,8 @@ model:
     #have not found a difference yet
     optimizer: 'adam'
     clipnorm: 10.0
-    regularization: 0.0
-    dense_regularization: 0.01
+    regularization: 0.001
+    dense_regularization: 0.001
     #1e-4 is too high, 5e-7 is too low. 5e-5 seems best at 256 batch size, full dataset and ~10 epochs, and lr decay of 0.90. 1e-4 also works well if we decay a lot (i.e ~0.7 or more)
     lr: 0.00002 #0.00001 #0.0005 #for adam plots 0.0000001 #0.00005 #0.00005 #0.00005
     lr_decay: 0.97 #0.98 #0.9
diff --git a/plasma/preprocessor/preprocess.py b/plasma/preprocessor/preprocess.py
@@ -146,7 +146,7 @@ def save_shotlists(self,shot_list_train,shot_list_validate,shot_list_test):
 
 
 def apply_bleed_in(conf,shot_list_train,shot_list_validate,shot_list_test):
-    np.random.seed(1)
+    np.random.seed(2)
     num = conf['data']['bleed_in']
     new_shots = []
     if num > 0:
@@ -170,13 +170,22 @@ def apply_bleed_in(conf,shot_list_train,shot_list_validate,shot_list_test):
         print("Sampled {} shots, {} disruptive, {} nondisruptive".format(num_sampled_nd+num_sampled_d,num_sampled_d,num_sampled_nd))
         print("Before adding: training shots: {} validation shots: {}".format(len(shot_list_train),len(shot_list_validate)))
         assert(num_sampled_d == num)
-        num_to_sample = len(shot_list_bleed)
         if conf['data']['bleed_in_equalize_sets']:#add bleed-in shots to training and validation set repeatedly
+            print("Applying equalized bleed in")
             for shot_list_curr in [shot_list_train,shot_list_validate]:
                 for i in range(len(shot_list_curr)):
                     s = shot_list_bleed.sample_shot()
                     shot_list_curr.append(s)
+        elif conf['data']['bleed_in_repeat_fac'] > 1:
+            repeat_fac = conf['data']['bleed_in_repeat_fac']
+            print("Applying bleed in with repeat factor {}".format(repeat_fac))
+            num_to_sample = int(round(repeat_fac*len(shot_list_bleed)))
+            for i in range(num_to_sample):
+                s = shot_list_bleed.sample_shot()
+                shot_list_train.append(s)
+                shot_list_validate.append(s)
         else: #add each shot only once
+            print("Applying bleed in without repetition")
             for s in shot_list_bleed:
                 shot_list_train.append(s)
                 shot_list_validate.append(s)