Skip to content

Commit 0b59a7a

Browse files
committed
Adding CoriGPU run files and modifying conf.yaml
1 parent b5d1e02 commit 0b59a7a

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

examples/conf.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
# all file-system work happens under fs_path / [username] / signal_data | shot_lists | processed shots, etc.
66

7-
fs_path: '/tigress'
7+
fs_path: '/global/cscratch1/sd/'
88
target: 'hinge' # 'maxhinge' # 'maxhinge' # 'binary' # 'hinge'
99
num_gpus: 4 # per node
1010
paths:
@@ -127,7 +127,7 @@ training:
127127
num_shots_at_once: 200
128128
# Upper bound on the number of training epochs; a large value simply acts as a cap.
129129
# Early stopping will occur if loss does not decrease, after some patience # of epochs
130-
num_epochs: 1000
130+
num_epochs: 50
131131
use_mock_data: False
132132
data_parallel: False
133133
hyperparam_tuning: False
@@ -136,8 +136,8 @@ training:
136136
num_batches_minimum: 20 # minimum number of batches per epoch
137137
ranking_difficulty_fac: 1.0 # how much to upweight incorrectly classified shots during training
138138
timeline_prof: False
139-
step_limit: 50
140-
no_validation: True
139+
step_limit: 0
140+
no_validation: False
141141
callbacks:
142142
list: ['earlystop']
143143
metrics: ['val_loss','val_roc','train_loss']

examples/corigpu_1GPU_slurm.cmd

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
#SBATCH -C gpu
3+
#SBATCH -t 01:30:00
4+
#SBATCH -G 1
5+
#SBATCH -c 4
6+
#SBATCH --exclusive
7+
8+
# rm /global/cscratch1/sd/$USER/model_checkpoints/*
9+
# rm /global/cscratch1/sd/$USER/results/*
10+
# rm /global/cscratch1/sd/$USER/csv_logs/*
11+
# rm /global/cscratch1/sd/$USER/Graph/*
12+
# rm /global/cscratch1/sd/$USER/normalization/*
13+
14+
export OMPI_MCA_btl="tcp,self,vader"
15+
srun python mpi_learn.py

examples/corigpu_4GPU_slurm.cmd

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
2+
#SBATCH -C gpu
3+
#SBATCH -t 02:00:00
4+
#SBATCH -N 4
5+
#SBATCH -G 4
6+
#SBATCH --ntasks-per-node=4
7+
#SBATCH --ntasks-per-socket=2
8+
#SBATCH -c 4
9+
#SBATCH --mem-per-cpu=0
10+
#SBATCH --exclusive
11+
12+
# rm /global/cscratch1/sd/$USER/model_checkpoints/*
13+
# rm /global/cscratch1/sd/$USER/results/*
14+
# rm /global/cscratch1/sd/$USER/csv_logs/*
15+
# rm /global/cscratch1/sd/$USER/Graph/*
16+
# rm /global/cscratch1/sd/$USER/normalization/*
17+
18+
export OMPI_MCA_btl="tcp,self,vader"
19+
srun python mpi_learn.py

0 commit comments

Comments
 (0)