-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathoptimizer.py
More file actions
888 lines (788 loc) · 41 KB
/
optimizer.py
File metadata and controls
888 lines (788 loc) · 41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
# -*- coding: utf-8 -*-
'''
optimizer.py -- holds ``Optimizer`` and ``OptimizationResult`` classes for
optimization routines and management conducted on PRMS parameters.
'''
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys, json, re, shutil
import multiprocessing as mp
from copy import copy
from copy import deepcopy
from numpy import log10
from datetime import datetime
from .data import Data
from .parameters import Parameters
from .simulation import Simulation, SimulationSeries
from .util import load_statvar, nash_sutcliffe, percent_bias, rmse
OPJ = os.path.join
class Optimizer:
    '''
    Container for PRMS parameter optimization and related routines.

    Currently the ``monte_carlo`` method provides random parameter
    resampling routines using uniform and normal random variables.

    Example:
        >>> from prms_python import Data, Optimizer, Parameters
        >>> params = Parameters('path/to/parameters')
        >>> data = Data('path/to/data')
        >>> control = 'path/to/control'
        >>> work_directory = 'path/to/create/simulations'
        >>> optr = Optimizer(
                params,
                data,
                control,
                work_directory,
                title='the title',
                description='desc')
        >>> measured = 'path/to/measured/csv'
        >>> statvar_name = 'basin_cfs' # or any other valid statvar
        >>> params_to_resample = ['dday_intcp', 'dday_slope'] # list of params
        >>> optr.monte_carlo(measured, params_to_resample, statvar_name)
    '''
    # Allowable (min, max) resampling ranges for each PRMS parameter;
    # extend this mapping when new parameters need to be resampled
    # (resample_param raises ValueError for parameters not listed here).
    param_ranges = {'dday_intcp': (-60.0, 10.0),
                    'dday_slope': (0.2, 0.9),
                    'jh_coef': (0.005, 0.06),
                    'pt_alpha': (1.0, 2.0),
                    'potet_coef_hru_mo': (1.0, 2.0),
                    'tmax_index': (-10.0, 110.0),
                    'tmin_lapse': (-10.0, 10.0),
                    'soil_moist_max': (0.001, 10.0),
                    'rain_adj': (0.5, 2.0),
                    'ppt_rad_adj': (0.0, 0.5),
                    'radadj_intcp': (0.0, 1.0),
                    'radadj_slope': (0.0, 1.0),
                    'radj_sppt': (0.0, 1.0),
                    'radj_wppt': (0.0, 1.0),
                    'radmax': (0.1, 1.0)
                    }
def __init__(self, parameters, data, control_file, working_dir,
             title, description=None):
    """
    Arguments:
        parameters (Parameters): parameter object for the model inputs
        data (Data): data object for the model inputs
        control_file (str): path to the PRMS control file
        working_dir (str): directory where simulations will be created
        title (str): title for this optimization instance
    Keyword Arguments:
        description (str): optional description of the optimization
    Raises:
        TypeError: if ``parameters`` or ``data`` are not instances of
            :class:`Parameters` / :class:`Data`.
    """
    # guard clauses: fail fast on wrong input types
    if not isinstance(parameters, Parameters):
        raise TypeError('parameters must be instance of Parameters')
    if not isinstance(data, Data):
        raise TypeError('data must be instance of Data')
    self.parameters = parameters
    self.data = data
    # directory holding the control file is the model input directory;
    # os.path.dirname replaces the manual os.sep split/join
    input_dir = os.path.dirname(control_file)
    if not os.path.isfile(os.path.join(input_dir, 'statvar.dat')):
        # run PRMS once on the unmodified inputs so optimization results
        # can later be compared against an "original" simulation
        print('You have no statvar.dat file in your model directory')
        print('Running PRMS on original data in {} for later '
              'comparison'.format(input_dir))
        sim = Simulation(input_dir)
        sim.run()
    if not os.path.isdir(working_dir):
        os.mkdir(working_dir)
    self.input_dir = input_dir
    self.control_file = control_file
    self.working_dir = working_dir
    self.title = title
    self.description = description
    self.measured_arb = None   # measured series, set by monte_carlo
    self.statvar_name = None   # statvar output name, set by monte_carlo
    self.arb_outputs = []      # accumulated simulation outputs
def monte_carlo(self, reference_path, param_names, statvar_name,
                stage, n_sims=10, method='uniform', mu_factor=1,
                noise_factor=0.1, nproc=None):
    '''
    Perform random resampling of a set of PRMS parameters and run and
    manage the corresponding simulations.

    Arguments:
        reference_path (str): path to measured data CSV for optimization
        param_names (list): list of parameter names to resample
        statvar_name (str): name of statistical variable output used for
            optimization
        stage (str): custom name of optimization stage e.g. 'ddsolrad';
            may not contain an underscore
    Keyword Arguments:
        n_sims (int): number of simulations to conduct for parameter
            optimization/uncertainty analysis
        method (str): resampling method for parameters, 'uniform' or
            'normal'
        mu_factor (float): coefficient to scale the mean of the
            parameter(s) when resampling from a normal distribution,
            e.g. 1.5 samples from a normal rv with mean 50% higher than
            the original parameter mean
        noise_factor (float): scales the variance of the noise added to
            parameter values when method='normal'
        nproc (int): number of processors used to run PRMS simulations;
            defaults to half the machine's CPU count
    Returns:
        None
    Raises:
        ValueError: if ``stage`` contains an underscore (it is embedded
            in metadata file names that use underscores as separators).
    '''
    if '_' in stage:
        raise ValueError('stage name cannot contain an underscore')
    # keep a copy of the measured data on the instance for plotting;
    # pd.Series.from_csv was removed in pandas 1.0 -- read_csv with
    # header=None/index_col=0/parse_dates reproduces its behavior
    self.measured_arb = pd.read_csv(
        reference_path, header=None, index_col=0,
        parse_dates=True).squeeze('columns')
    # statistical variable output name
    self.statvar_name = statvar_name
    start_time = datetime.now().isoformat()
    # resample params for all simulations up front -- params[j][i] is
    # the i-th resampled array for param_names[j]
    params = []
    for name in param_names:
        tmp = [resample_param(self.parameters, name, how=method,
                              mu_factor=mu_factor,
                              noise_factor=noise_factor)
               for _ in range(n_sims)]
        params.append(list(tmp))
    # one Simulation per resampled parameter set
    series = SimulationSeries(
        Simulation.from_data(
            self.data,
            _mod_params(self.parameters,
                        [params[n][i] for n in range(len(params))],
                        param_names),
            self.control_file,
            os.path.join(
                # simulation dir name: first param name and its mean value
                self.working_dir,
                '{0}_{1:.10f}'.format(param_names[0],
                                      np.mean(params[0][i]))
            )
        )
        for i in range(n_sims)
    )
    if not nproc:
        nproc = mp.cpu_count() // 2
    # run all simulations in parallel and collect their outputs
    outputs = list(series.run(nproc=nproc).outputs_iter())
    self.arb_outputs.extend(outputs)  # for current instance- add outputs
    end_time = datetime.now().isoformat()
    # JSON metadata describing this Monte Carlo run
    meta = {'params_adjusted': param_names,
            'statvar_name': self.statvar_name,
            'optimization_title': self.title,
            'optimization_description': self.description,
            'start_datetime': start_time,
            'end_datetime': end_time,
            'measured': reference_path,
            'method': 'Monte Carlo',
            'mu_factor': mu_factor,
            'noise_factor': noise_factor,
            'resample': method,
            'sim_dirs': [],
            'stage': stage,
            'original_params': self.parameters.base_file,
            'nproc': nproc,
            'n_sims': n_sims
            }
    for output in outputs:
        meta['sim_dirs'].append(output['simulation_dir'])
    json_outfile = os.path.join(self.working_dir, _create_metafile_name(
        self.working_dir, self.title, stage))
    with open(json_outfile, 'w') as outf:
        json.dump(meta, outf, sort_keys=True, indent=4,
                  ensure_ascii=False)
    print('{0}\nOutput information sent to {1}\n'.format('-' * 80,
                                                         json_outfile))
def plot_optimization(self, freq='daily', method='time_series',
                      plot_vars='both', plot_1to1=True, return_fig=False,
                      n_plots=4):
    """
    Basic plotting of current optimization results with limited options.

    Plots measured, original simulated, and optimization simulated
    variables. Not recommended for plotting results when n_sims is very
    large; instead use options from an OptimizationResult object, or
    employ a user-defined method using the result data.

    Keyword Arguments:
        freq (str): frequency of time series plots, 'daily' or 'monthly'
        method (str): 'time_series' for a time series subplot of each
            simulation alongside measured data. Another choice is
            'correlation' which plots each measured value versus the
            corresponding simulated value, one subplot per simulation,
            annotated with the coefficient of determination (square of
            the Pearson correlation coefficient).
        plot_vars (str): what to plot alongside the simulated variable:
            'meas': simulated along with measured;
            'orig': simulated along with the original simulation;
            'both': simulated with original simulation and measured
        plot_1to1 (bool): if True plot a one-to-one line on the
            correlation scatter plot, otherwise exclude it.
        return_fig (bool): whether to return the matplotlib figure
        n_plots (int): maximum number of simulations to plot (first
            n_plots of the accumulated outputs)

    Returns:
        f (:obj:`matplotlib.figure.Figure`): if return_fig=True the
            generated figure is returned to the user.

    Raises:
        ValueError: if no optimization has been run on this instance.
    """
    if not self.arb_outputs:
        raise ValueError('You have not run any optimizations')
    var_name = self.statvar_name
    X = self.measured_arb
    # restrict every series to the datetimes shared by the measured data
    # and the first simulation's statvar output
    idx = X.index.intersection(load_statvar(self.arb_outputs[0]
                               ['statvar'])['{}'.format(var_name)].index)
    X = X[idx]
    orig = load_statvar(OPJ(self.input_dir, 'statvar.dat'))['{}'
                        .format(var_name)][idx]
    meas = self.measured_arb[idx]
    sims = [load_statvar(out['statvar'])['{}'.format(var_name)][idx]
            for out in self.arb_outputs]
    # simulation directory basenames (underscores -> spaces) label subplots
    simdirs = [out['simulation_dir'].split(os.sep)[-1].replace('_', ' ')
               for out in self.arb_outputs]
    var_name = '{}'.format(self.statvar_name)
    n = len(self.arb_outputs)  # number of simulations to plot
    # user defined number of subplots from first n_plots results
    if (n > n_plots): n = n_plots
    # styles for each plot
    ms = 4  # markersize for all points
    orig_sty = dict(linestyle='none', markersize=ms,
                    markerfacecolor='none', marker='s',
                    markeredgecolor='royalblue', color='royalblue')
    meas_sty = dict(linestyle='none', markersize=ms + 1,
                    markerfacecolor='none', marker='1',
                    markeredgecolor='k', color='k')
    sim_sty = dict(linestyle='none', markersize=ms,
                   markerfacecolor='none', marker='o',
                   markeredgecolor='r', color='r')
    ## number of subplots and rows (two plots per row)
    nrow = n // 2  # round down if odd n
    ncol = 2
    odd_n = False
    if n / 2. - nrow == 0.5:
        nrow += 1  # odd number needs an extra row
        odd_n = True
    ########
    ## Start plots depending on keyword arguments
    ########
    if freq == 'daily' and method == 'time_series':
        # one stacked subplot per simulation, shared axes
        fig, ax = plt.subplots(n, sharex=True, sharey=True,
                               figsize=(12, n * 3.5))
        axs = ax.ravel()
        for i, sim in enumerate(sims[:n]):
            if plot_vars in ('meas', 'both'):
                axs[i].plot(meas, label='Measured', **meas_sty)
            if plot_vars in ('orig', 'both'):
                axs[i].plot(orig, label='Original sim.', **orig_sty)
            axs[i].plot(sim, **sim_sty)
            axs[i].set_ylabel('sim: {}'.format(simdirs[i]), fontsize=10)
            # legend only on the first subplot
            if i == 0: axs[i].legend(markerscale=5, loc='best')
        fig.subplots_adjust(hspace=0)
        fig.autofmt_xdate()
    # monthly means
    elif freq == 'monthly' and method == 'time_series':
        # compute monthly means
        meas = meas.groupby(meas.index.month).mean()
        orig = orig.groupby(orig.index.month).mean()
        # change line styles for monthly plots to lines not points
        for d in (orig_sty, meas_sty, sim_sty):
            d['linestyle'] = '-'
            d['marker'] = None
        fig, ax = plt.subplots(nrows=nrow, ncols=ncol,
                               figsize=(12, n * 3.5))
        axs = ax.ravel()
        for i, sim in enumerate(sims[:n]):
            if plot_vars in ('meas', 'both'):
                axs[i].plot(meas, label='Measured', **meas_sty)
            if plot_vars in ('orig', 'both'):
                axs[i].plot(orig, label='Original sim.', **orig_sty)
            sim = sim.groupby(sim.index.month).mean()
            axs[i].plot(sim, **sim_sty)
            axs[i].set_ylabel('sim: {}\nmean'.format(simdirs[i]),
                              fontsize=10)
            axs[i].set_xlim(0.5, 12.5)  # months 1..12 on the x axis
            if i == 0: axs[i].legend(markerscale=5, loc='best')
        if odd_n:  # remove the unused subplot when n is odd
            fig.delaxes(axs[n])
        fig.text(0.5, 0.1, 'month')
    # x-y scatter of measured versus simulated
    elif method == 'correlation':
        ## figure
        fig, ax = plt.subplots(nrows=nrow, ncols=ncol,
                               figsize=(12, n * 3))
        axs = ax.ravel()
        ## subplot dimensions
        meas_min = min(X)
        meas_max = max(X)
        for i, sim in enumerate(sims[:n]):
            Y = sim
            sim_max = max(Y)
            sim_min = min(Y)
            m = max(meas_max, sim_max)
            if plot_1to1:
                axs[i].plot([0, m], [0, m], 'k--', lw=2)  ## one to one line
            axs[i].set_xlim(meas_min, meas_max)
            axs[i].set_ylim(sim_min, sim_max)
            axs[i].plot(X, Y, **sim_sty)
            axs[i].set_ylabel('sim: {}'.format(simdirs[i]))
            axs[i].set_xlabel('Measured {}'.format(var_name))
            # annotate with coefficient of determination (r squared)
            axs[i].text(0.05, 0.95, r'$R^2 = {0:.2f}$'.format(
                        X.corr(Y)**2), fontsize=16,
                        ha='left', va='center',
                        transform=axs[i].transAxes)
        if odd_n:  # remove the unused subplot when n is odd
            fig.delaxes(axs[n])
    if return_fig:
        return fig
def _create_metafile_name(out_dir, opt_title, stage):
"""
Search through output directory where simulations are conducted
look for all metadata simulation json files and find out if the
current simulation is a replicate. Then use that information to
build the correct file name for the output json file. The series
are typically run in parallel that is why this step has to be
done after running multiple simulations from an optimization stage.
Arguments:
out_dir (str): path to directory with model results, i.e.
location where simulation series outputs and optimization
json files are located, aka Optimizer.working_dir
opt_title (str): optimization instance title for file search
stage (str): stage of optimization, e.g. 'swrad', 'pet'
Returns:
name (str): file name for the current optimization simulation series
metadata json file. E.g 'dry_creek_swrad_opt.json', or if
this is the second time you have run an optimization titled
'dry_creek' the next json file will be returned as
'dry_creek_swrad_opt1.json' and so on with integer increments
"""
meta_re = re.compile(r'^{}_{}_opt(\d*)\.json'.format(opt_title, stage))
reps = []
for f in os.listdir(out_dir):
if meta_re.match(f):
nrep = meta_re.match(f).group(1)
if nrep == '':
reps.append(0)
else:
reps.append(nrep)
if not reps:
name = '{}_{}_opt.json'.format(opt_title, stage)
else:
# this is the nth optimization done under the same title
n = max(map(int, reps)) + 1
name = '{}_{}_opt{}.json'.format(opt_title, stage, n)
return name
def resample_param(params, param_name, how='uniform', mu_factor=1,
                   noise_factor=0.1):
    """
    Resample a PRMS parameter from a uniform or normal distribution.

    The allowable (min, max) range for each parameter is taken from
    ``Optimizer.param_ranges``. If the resampling method (``how``) is
    'normal', sample a normal distribution with mean =
    mean(parameter) x ``mu_factor`` and sigma = parameter allowable
    range x ``noise_factor``. If the parameter has array length <= 366
    then individual parameter values are resampled; otherwise all values
    are resampled at once, e.g. by taking a single draw per element from
    the uniform distribution. When taking all at once with the normal
    method, the original values are scaled by mu_factor and a normal
    random variable with mean=0 and std dev = range x ``noise_factor``
    is added.

    Arguments:
        params (:class:`prms_python.Parameters`): ``Parameters`` object
        param_name (str): name of PRMS parameter to resample
    Keyword Arguments:
        how (str): distribution to resample from, 'uniform' or 'normal'
        mu_factor (float): scales the mean used for normal resampling
        noise_factor (float): factor multiplied by the parameter range to
            give the standard deviation of the normal noise; higher
            values give higher variance. Must be > 0.
    Returns:
        ret (:obj:`numpy.ndarray`): parameter array after resampling
    Raises:
        KeyError: if ``param_name`` is not a valid parameter name
        ValueError: if the parameter range has not been set in
            ``Optimizer.param_ranges``, if the parameter's dimensions
            relate to cascade flow, or if no resampling case applies
    """
    # (-1, -1) is the sentinel for "range not registered"
    p_min, p_max = Optimizer.param_ranges.get(param_name, (-1, -1))
    # create dictionary of parameter basic info (not values)
    param_dic = {param['name']: param for param in params.base_params}
    if not param_dic.get(param_name):
        raise KeyError('{} is not a valid parameter'.format(param_name))
    if p_min == p_max == -1:
        raise ValueError("""{} has not been added to the dictionary of
parameters to resample, add it's allowable min and max value
to the Optimizer.param_ranges attribute in
Optimizer.py""".format(param_name))
    dim_case = None
    nhru = params.dimensions['nhru']
    ndims = param_dic.get(param_name)['ndims']
    dimnames = param_dic.get(param_name)['dimnames']
    length = param_dic.get(param_name)['length']
    # deep copy so resampling never mutates the caller's Parameters data
    param = deepcopy(params[param_name])
    # could expand list and check parameter name also e.g. cascade_flg
    # is a parameter that should not be changed
    dims_to_not_change = set(['ncascade', 'ncascdgw', 'nreach',
                              'nsegment'])
    if (len(set.intersection(dims_to_not_change, set(dimnames))) > 0):
        raise ValueError("""{} should not be resampled as
it relates to the location of cascade flow
parameters.""".format(param_name))
    # use param info to pick the resampling strategy by dimensionality
    if (ndims == 1 and length <= 366):
        dim_case = 'resample_each_value'  # for smaller param dimensions
    elif (ndims == 1 and length > 366):
        dim_case = 'resample_all_values_once'  # covers nssr, ngw, etc.
    elif (ndims == 2 and dimnames[1] == 'nmonths' and
          nhru == params.dimensions[dimnames[0]]):
        dim_case = 'nhru_nmonths'
    elif not dim_case:
        raise ValueError('The {} parameter is not set for resampling'.
                         format(param_name))
    s = (p_max - p_min) * noise_factor  # std_dev (s) default: param_range/10
    # do resampling based on param dimensions and sampling distribution
    if dim_case == 'resample_all_values_once':
        if how == 'uniform':
            ret = np.random.uniform(low=p_min, high=p_max,
                                    size=param.shape)
        elif how == 'normal':  # scale parameter mean if mu_factor given
            param *= mu_factor
            tmp = np.random.normal(0, s, size=param.shape)
            ret = tmp + param
    elif dim_case == 'resample_each_value':
        ret = param
        if how == 'uniform':
            ret = np.random.uniform(low=p_min, high=p_max,
                                    size=param.shape)
        elif how == 'normal':  # the original value is considered the mean
            if len(ret.shape) != 0:
                for i, el in enumerate(param):
                    mu = el * mu_factor
                    ret[i] = np.random.normal(mu, s)
            else:  # single (scalar) value parameter
                mu = param * mu_factor
                ret = np.random.normal(mu, s)
    # nhru by nmonth dimensional params
    elif dim_case == 'nhru_nmonths':
        ret = param
        if how == 'uniform':
            # NOTE(review): iterates only the first 12 entries of the
            # first axis -- assumes the month axis comes first; verify
            # against the array layout Parameters uses for these params
            for month in range(12):
                ret[month] = np.random.uniform(low=p_min, high=p_max,
                                               size=param[0].shape)
        elif how == 'normal':
            for i, el in enumerate(param):
                # in-place scale: `el` is a view, so this also updates
                # ret[i] through aliasing before the noise is added
                el *= mu_factor
                tmp = np.random.normal(0, s, size=el.shape)
                ret[i] = tmp + el
    return ret
def _mod_params(parameters, params, param_names):
# loop through list of params and assign their values to Parameter instance
ret = copy(parameters)
for idx, param in enumerate(params):
ret[param_names[idx]] = np.array(param)
return ret
class OptimizationResult:
    """
    The ``OptimizationResult`` object serves to collect and manage output
    from an ``Optimizer`` method. Upon initialization with the optimization
    stage that was used when running the Optimizer method, e.g. ``monte_carlo``,
    the class gathers all JSON metadata that was produced for that stage.

    The ``OptimizationResult`` has three main user methods: first,
    ``result_table`` returns the top n simulations according to four model
    performance metrics (Nash-Sutcliffe efficiency (NSE), root-mean-squared
    error (RMSE), percent bias (PBIAS), and the coefficient of determination
    (COEF_DET)) as calculated against measured data. For example the table
    may look like:

    >>> ddsolrad_res = OptimizationResult(work_directory, stage=stage)
    >>> top10 = ddsolrad_res.result_table(freq='monthly',top_n=10)
    >>> top10
    ======================== ======== ======= ========= ========
    ddsolrad parameters      NSE      RMSE    PBIAS     COEF_DET
    ======================== ======== ======= ========= ========
    orig_params              0.956267 39.4725 -0.885715 0.963116
    tmax_index_54.2224631748 0.921626 47.6092 -0.849256 0.94402
    tmax_index_44.8823940703 0.879965 58.9194 5.79603   0.922021
    tmax_index_47.6835387480 0.764133 82.5918 -4.78896  0.837582
    ======================== ======== ======= ========= ========

    Second, ``get_top_ranked_sims`` returns a dictionary that maps
    key information about the top n ranked simulations; an example returned
    dictionary may look like:

    >>> {
        'dir_name' : ['pathToSim1', 'pathToSim2'],
        'param_path' : ['pathToSim1/input/parameters', 'pathToSim2/input/parameters'],
        'statvar_path' : ['pathToSim1/output/statvar.dat', 'pathToSim2/output/statvar.dat'],
        'params_adjusted' : [[param_names_sim1], [param_names_sim2]]
        }

    The third method of ``OptimizationResult`` is ``archive``, which opens
    all parameter and statvar files from each simulation of the given
    stage and archives the parameters that were modified, their modified
    values, and the statistical variable (PRMS time series output) associated
    with the optimization stage. Other ``Optimizer`` simulation metadata is
    also gathered, and new JSON metadata containing only this information is
    created and written to a newly created "archived" subdirectory within the
    same directory where the ``Optimizer`` routine managed simulations. The
    ``OptimizationResult.archive`` method then recursively deletes the
    simulation data for each simulation of the given stage.
    """
def __init__(self, working_dir, stage):
    """
    Create an ``OptimizationResult`` instance to manage output and analyse
    parameter-output relationships produced by an ``Optimizer`` method for
    a user-defined optimization stage.

    Arguments:
        working_dir (str): directory containing the stage's simulation
            outputs and metadata JSON files (Optimizer.working_dir)
        stage (str): optimization stage name used by the Optimizer run
    """
    self.working_dir = working_dir
    self.stage = stage
    # gather every metadata JSON produced for this stage
    self.metadata_json_paths = self._get_optr_jsons(working_dir, stage)
    self.total_sims = self._count_total_sims()
    self.statvar_name = self._get_statvar_name(stage)
    self.measured = self._get_measured(stage)
    self.input_dir = self._get_input_dir(stage)
    self.input_params = self._get_input_params(stage)
    # warn when multiple optimizations under this stage started from
    # different initial parameter files -- comparisons must pair each
    # input parameter set with its own output simulations
    if len(self.input_params) > 1:
        print(
"""Warning: there were more than one initial parameter sets used for the
optimization for stage: {}. Make sure to compare the the correct input
params with their corresponding output sims.""".format(stage))
        print('\nThis optimization stage used the\
following input parameter files:\n{}'.format('\n'.join(self.input_params)))
def _count_total_sims(self):
# total number of simulations of given stage in working directory
tracked_dirs = []
for f in self.metadata_json_paths[self.stage]:
with open(f) as fh:
json_data = json.load(fh)
tracked_dirs.extend(json_data.get('sim_dirs'))
return len(tracked_dirs)
def _get_optr_jsons(self, work_dir, stage):
    """
    Retrieve locations of optimization output JSONs, which contain the
    metadata needed to understand optimization results.

    Arguments:
        work_dir (str): path to directory with model results, i.e. where
            simulation series outputs and optimization JSON files are
            located, aka Optimizer.working_dir
        stage (str): the stage ('ddsolrad', 'jhpet', 'flow', etc.) of
            the optimization for which to gather the JSONs
    Returns:
        ret (dict): dictionary keyed by stage whose value is the list of
            JSON file paths found for that stage.
    """
    # any title prefix is accepted; only the stage must match
    json_re = re.compile(r'^.*_{}_opt(\d*)\.json'.format(stage))
    stage_jsons = [os.path.join(work_dir, fname)
                   for fname in os.listdir(work_dir)
                   if json_re.match(fname)]
    return {stage: stage_jsons}
def _get_input_dir(self, stage):
# retrieves the input directory from the first json file of given stage
json_file = self.metadata_json_paths[stage][0]
with open(json_file) as jf:
meta_dic = json.load(jf)
return '{}'.format(os.sep).join(meta_dic['original_params'].\
split(os.sep)[:-1])
def _get_input_params(self, stage):
json_files = self.metadata_json_paths[stage]
param_paths = []
for json_file in json_files:
with open(json_file) as jf:
meta_dic = json.load(jf)
param_paths.append(meta_dic['original_params'])
return list(set(param_paths))
def _get_sim_dirs(self, stage):
jsons = self.metadata_json_paths[stage]
json_files = []
sim_dirs = []
for inf in jsons:
with open(inf) as json_file:
json_files.append(json.load(json_file))
for json_file in json_files:
sim_dirs.extend(json_file['sim_dirs'])
# list of all simulation directory paths for stage
return sim_dirs
def _get_measured(self, stage):
# only need to open one json file to get this information
if not self.metadata_json_paths.get(stage):
return # no optimization json files exist for given stage
first_json = self.metadata_json_paths[stage][0]
with open(first_json) as json_file:
json_data = json.load(json_file)
measured_series = pd.Series.from_csv(json_data.get('measured'),\
parse_dates=True)
return measured_series
def _get_statvar_name(self, stage):
# only need to open one json file to get this information
try:
first_json = self.metadata_json_paths[stage][0]
except:
raise ValueError("""No optimization has been run for
stage: {}""".format(stage))
with open(first_json) as json_file:
json_data = json.load(json_file)
var_name = json_data.get('statvar_name')
return var_name
def result_table(self, freq='daily', top_n=5, latex=False):
    """
    Rank all simulations of this stage against measured data.

    Computes Nash-Sutcliffe efficiency (NSE), root-mean-squared error
    (RMSE), percent bias (PBIAS) and coefficient of determination
    (COEF_DET) for every tracked simulation plus the original parameter
    set, then sorts by NSE (desc), RMSE (asc), |PBIAS| (asc) and
    COEF_DET (desc).

    Keyword Arguments:
        freq (str): 'daily' (default) or 'monthly' metric frequency
        top_n (int): number of top-ranked rows to return
        latex (bool): if True return the table as a LaTeX string
    Returns:
        (:obj:`pandas.DataFrame` or str): the 'orig_params' row followed
            by the top_n ranked simulations, or its LaTeX rendering
            when latex=True.
    """
    ## TODO: add stats for freq options annual (means or sum)
    sim_dirs = self._get_sim_dirs(self.stage)
    if top_n >= len(sim_dirs):
        top_n = len(sim_dirs) + 1  # for returning inclusive last sim
    # simulation directory basenames label the table rows
    sim_names = [path.split(os.sep)[-1] for path in sim_dirs]
    meas_var = self._get_measured(self.stage)
    statvar_name = self._get_statvar_name(self.stage)
    orig_statvar = load_statvar(OPJ(self.input_dir,
                                    'statvar.dat'))[statvar_name]
    # ABS(PBIAS) is an extra sort key, dropped before returning
    result_df = pd.DataFrame(columns=
        ['NSE', 'RMSE', 'PBIAS', 'COEF_DET', 'ABS(PBIAS)'])
    orig_results = pd.DataFrame(index=['orig_params'],
        columns=['NSE', 'RMSE', 'PBIAS', 'COEF_DET'])
    # get datetime indices that overlap from measured and simulated
    sim_out = load_statvar(OPJ(sim_dirs[0], 'outputs',
                               'statvar.dat'))[statvar_name]
    idx = meas_var.index.intersection(sim_out.index)
    meas_var = copy(meas_var[idx])
    orig_statvar = orig_statvar[idx]
    if freq == 'monthly':
        # monthly means used instead of daily values
        meas_mo = meas_var.groupby(meas_var.index.month).mean()
        orig_mo = orig_statvar.groupby(orig_statvar.index.month).mean()
    for i, sim in enumerate(sim_dirs):
        try:
            sim_out = load_statvar(OPJ(sim, 'outputs',
                                       'statvar.dat'))\
                                       ['{}'.format(statvar_name)]
        except:  # simulation might have been removed or missing
            # NOTE(review): this bare except leaves sim_out holding the
            # previous iteration's data, which is then scored under this
            # sim's name -- consider `continue` here instead
            pass
        sim_out = sim_out[idx]
        if freq == 'daily':
            result_df.loc[sim_names[i]] = [
                nash_sutcliffe(meas_var, sim_out),
                rmse(meas_var, sim_out),
                percent_bias(meas_var, sim_out),
                meas_var.corr(sim_out)**2,
                np.abs(percent_bias(meas_var, sim_out))]
            orig_results.loc['orig_params'] = [
                nash_sutcliffe(orig_statvar, meas_var),
                rmse(orig_statvar, meas_var),
                percent_bias(orig_statvar, meas_var),
                orig_statvar.corr(meas_var)**2]
        elif freq == 'monthly':
            sim_out = sim_out.groupby(sim_out.index.month).mean()
            result_df.loc[sim_names[i]] = [
                nash_sutcliffe(meas_mo, sim_out),
                rmse(meas_mo, sim_out),
                percent_bias(meas_mo, sim_out),
                meas_mo.corr(sim_out)**2,
                np.abs(percent_bias(meas_mo, sim_out))]
            orig_results.loc['orig_params'] = [
                nash_sutcliffe(orig_mo, meas_mo),
                rmse(orig_mo, meas_mo),
                percent_bias(orig_mo, meas_mo),
                orig_mo.corr(meas_mo)**2]
    # rank: NSE desc, RMSE asc, |PBIAS| asc, COEF_DET desc
    sorted_result = result_df.sort_values(by=['NSE', 'RMSE',
        'ABS(PBIAS)', 'COEF_DET'],
        ascending=[False, True, True, False])
    sorted_result.columns.name = '{} parameters'.format(self.stage)
    # drop the helper sort column and put the original params row first
    sorted_result = sorted_result[['NSE', 'RMSE', 'PBIAS', 'COEF_DET']]
    sorted_result = pd.concat([orig_results, sorted_result])
    if latex: return sorted_result[:top_n].to_latex(escape=False)
    else: return sorted_result[:top_n]
def get_top_ranked_sims(self, sorted_df):
    """
    Map the ranked simulations of a result table to their key file paths.

    The index of ``sorted_df`` holds the simulation directory names (as
    produced by ``result_table``); the 'orig_params' row is excluded.

    Arguments:
        sorted_df (:obj:`pandas.DataFrame`): result table whose index is
            the simulation directory names
    Returns:
        ret (dict): parallel lists under keys 'dir_name', 'param_path',
            'statvar_path' and 'params_adjusted'.
    """
    ret = {
        'dir_name': [],
        'param_path': [],
        'statvar_path': [],
        'params_adjusted': []
    }
    meta_paths = self.metadata_json_paths[self.stage]
    for sim_name in sorted_df.drop('orig_params').index:
        sim_dir = os.path.join(self.working_dir, sim_name)
        ret['dir_name'].append(sim_name)
        ret['param_path'].append(
            os.path.join(sim_dir, 'inputs', 'parameters'))
        ret['statvar_path'].append(
            os.path.join(sim_dir, 'outputs', 'statvar.dat'))
        # find which metadata file tracks this simulation to recover the
        # list of parameters that were adjusted for it
        for meta_path in meta_paths:
            with open(meta_path) as fh:
                meta = json.load(fh)
            if sim_dir in meta.get('sim_dirs'):
                ret['params_adjusted'].append(
                    meta.get('params_adjusted'))
    return ret
def archive(self, remove_sims=True, remove_meta=False, metric_freq='daily'):
    """
    Create an archive directory holding JSON files that contain the
    adjusted parameters, model output, and performance metrics for each
    Optimizer simulation of OptimizationResult.stage in
    OptimizationResult.working_dir.

    Keyword Arguments:
        remove_sims (bool): if True recursively delete all folders and
            files associated with the original simulations of the stage
            in the working directory; if False keep the simulations.
        remove_meta (bool): whether to delete the original Optimizer
            JSON metadata files, default False.
        metric_freq (str): frequency of output metric computation for
            recording model performance, 'daily' (default) or
            'monthly'. Other metrics can be computed later from the
            archived results.
    Returns:
        None
    """
    # create output archive directory
    archive_dir = OPJ(self.working_dir, "{}_archived".format(self.stage))
    if not os.path.isdir(archive_dir):
        os.mkdir(archive_dir)
    # rank every simulation so its metrics can be archived alongside it
    table = self.result_table(freq=metric_freq,
                              top_n=self.total_sims, latex=False)
    map_dic = self.get_top_ranked_sims(table)
    metadata_json_paths = self.metadata_json_paths[self.stage]
    # get measured optimization variable path (shared across the stage)
    first_json = metadata_json_paths[0]
    with open(first_json) as json_file:
        json_data = json.load(json_file)
    measured_path = json_data.get('measured')
    # record info for each simulation and archive to JSONs;
    # pandas series and numpy arrays are converted to Python lists
    # for JSON serialization
    for i, sim in enumerate(map_dic.get('dir_name')):
        json_path = OPJ(archive_dir, '{sim}.json'.format(sim=sim))
        try:
            output_series = load_statvar(OPJ(self.working_dir, sim,
                                             'outputs', 'statvar.dat'))\
                                             [self.statvar_name]
        except:  # simulation directory was already removed
            continue
        # look for resampling method info for the particular simulation
        # NOTE(review): if no metadata file lists this sim, `resample`,
        # `noise_factor` and `mu_factor` silently keep the previous
        # iteration's values (or are unbound on the first pass) --
        # confirm every sim dir is tracked by a metadata file
        for f in metadata_json_paths:
            with open(f) as tmp_file:
                tmp = json.load(tmp_file)
            if OPJ(self.working_dir, sim) in tmp.get('sim_dirs'):
                resample = tmp.get('resample')
                noise_factor = tmp.get('noise_factor')
                mu_factor = tmp.get('mu_factor')
        json_data = {
            'param_names': [],
            'param_values': [],
            'original_param_path': self.input_params,
            'measured_path': measured_path,
            'output_name': self.statvar_name,
            # datetime index serialized as ISO date strings
            'output_date_index': output_series.
                index.astype(str).tolist(),
            'output_values': output_series.values.tolist(),
            'metric_freq': metric_freq,
            'resample': resample,
            'stage': self.stage,
            'mu_factor': mu_factor,
            'noise_factor': noise_factor,
            'NSE': table.loc[sim, 'NSE'],
            'RMSE': table.loc[sim, 'RMSE'],
            'PBIAS': table.loc[sim, 'PBIAS'],
            'COEF_DET': table.loc[sim, 'COEF_DET']
        }
        # record each adjusted parameter's name and archived values
        for param in map_dic.get('params_adjusted')[i]:
            json_data['param_names'].append(param)
            json_data['param_values'].append(Parameters(
                map_dic.get('param_path')[i])[param]
                .tolist())
        # save JSON file into archive directory
        with open(json_path, 'w') as outf:
            json.dump(json_data, outf, sort_keys=True, indent=4,
                      ensure_ascii=False)
        # recursively delete all simulation directories after archiving
        if remove_sims:
            path = OPJ(self.working_dir, sim)
            # NOTE(review): walking bottom-up and rmtree-ing each
            # dirpath is redundant -- a single shutil.rmtree(path)
            # removes the whole tree
            for dirpath, dirnames, filenames in os.walk(path,
                                                        topdown=False):
                shutil.rmtree(dirpath, ignore_errors=True)
        else:
            continue
    # optionally delete the original JSON metadata files
    if remove_meta:
        for meta_file in self.metadata_json_paths[self.stage]:
            try:
                os.remove(meta_file)
            except:
                continue