-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathdata.py
More file actions
162 lines (136 loc) · 6.56 KB
/
data.py
File metadata and controls
162 lines (136 loc) · 6.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
'''
PRMS-Python: Powerful, sane tools for manipulating PRMS input data to create
new scenarios or parameterizations for sensitivity analysis, scenario
modeling, or whatever other uses this might have.
The fundamental process in scenario development is to modify some "base"
starting data to create some "scenario" data. No matter what data we're using,
once it's ready, we run a PRMS "simulation" on that data.
This module presents the Data class, which loads a PRMS data file into a
date-indexed pandas DataFrame so that its time series inputs can be inspected,
modified, and written back to disk in PRMS text format.
'''
import numpy as np
import pandas as pd
import os
class Data(object):
    """
    Object that reads the PRMS data file and loads it into a date-time indexed
    DataFrame for data management, analysis and visualization. The ``modify``
    method allows for inplace modification of one or more time series inputs in
    the data file based on a user defined function. The ``write`` method
    reformats the DataFrame to PRMS text format and writes a new data file to
    disk. Here is an example of loading a data file, modifying the temperature
    inputs ('tmin' and 'tmax') by adding two degrees to each element, and
    rewriting the modified data to disk:

    >>> d = Data('example_data_file')
    >>> def f(x):
    ...     return x + 2
    >>> d.modify(f, ['tmax', 'tmin'])
    >>> d.write('example_modified_data_file')

    d is a Data instance of example_data_file, calling

    >>> d.data_frame

    shows the datetime indexed DataFrame of the input data that is created
    when a ``Data`` object is initiated. We then pass the function f(x) to
    d.modify along with a Python list of input variable/s that we want to
    modify, this modifies d.data_frame in place. Printing the metadata
    attribute of the data object,

    >>> d.metadata

    will show the names of the variables in the data file in case you forget
    which you would like to modify. Last we call d.write with an output path
    to write the modified data variables to disk in PRMS text format.

    Args:
        base_file (str): path of the PRMS data file to load
        na_rep (int): value that marks missing data; it is read as NaN when
            loading and written in place of NaN by ``write``

    Attributes:
        base_file: the path given at construction
        na_rep: the missing-data marker given at construction
        metadata (dict): ``'data_startline'`` is the 0-indexed line number of
            the first data row, ``'data_variables'`` is the list of column
            names built from the variable declarations in the header
        data_frame (pandas.DataFrame): the time series, indexed by date, one
            column per entry in ``metadata['data_variables']``
    """

    ## data file constant attributes
    date_header = ['year',
                   'month',
                   'day',
                   'hh',
                   'mm',
                   'sec']

    valid_input_variables = ('gate_ht',
                             'humidity',
                             'lake_elev',
                             'pan_evap',
                             'precip',
                             'rain_day',
                             'runoff',
                             'snowdepth',
                             'solrad',
                             'tmax',
                             'tmin',
                             'wind_speed')

    def __init__(self, base_file, na_rep=-999):
        self.base_file = base_file
        self.na_rep = na_rep
        self.metadata = self.__load_metadata()
        self.data_frame = self.__load_data()

    def __load_metadata(self):
        """
        Scan the header of the data file for variable declarations and the
        line where the time series rows begin.

        Returns:
            dict: ``'data_startline'`` (0-indexed line of the first data row)
            and ``'data_variables'`` (list of column names)

        Raises:
            ValueError: if no line starting with '#' ends the header
        """
        ## starting list for variable names in data file
        input_data_names = []
        data_startline = None
        ## open data file and read header information
        with open(self.base_file, 'r') as inf:
            for idx, line in enumerate(inf):
                ## the first line is a free-text identifier of the file; skip
                ## it so a title such as "runoff ..." is not parsed as a
                ## variable declaration. Lines starting with '/' are comments
                if idx == 0 or line.startswith('/'):
                    continue
                if line.startswith('#'):
                    ## end of header info and begin time series input data
                    data_startline = idx + 1  ## 0 indexed line of first data entry
                    break
                if line.startswith(Data.valid_input_variables):
                    ## first element is the name, second the number of columns
                    fields = line.split()
                    name, n_columns = fields[0], int(fields[1])
                    if n_columns > 1:
                        ## more than one input time series of a particular
                        ## variable: number the columns starting at 1
                        input_data_names.extend(
                            '{var_name} {var_ind}'.format(var_name=name,
                                                          var_ind=el + 1)
                            for el in range(n_columns)
                        )
                    elif n_columns == 1:
                        input_data_names.append(name)
        if data_startline is None:
            raise ValueError(
                "no line starting with '#' found in {0}; cannot locate the "
                "start of the time series data".format(self.base_file)
            )
        return {'data_startline': data_startline,
                'data_variables': input_data_names}

    def __load_data(self):
        """
        Read the time series rows below the header into a DataFrame indexed
        by date.

        Only year, month and day are used to build the index; the hh, mm and
        sec columns are dropped.

        Returns:
            pandas.DataFrame: one column per declared variable, values equal
            to ``na_rep`` read as NaN
        """
        ## header=None: the data rows carry no column names of their own
        df = pd.read_csv(self.base_file, header=None,
                         skiprows=self.metadata['data_startline'],
                         sep=r'\s+', na_values=[self.na_rep])
        df.columns = Data.date_header + self.metadata['data_variables']
        dates = pd.to_datetime(df[['year', 'month', 'day']])
        df.index = pd.DatetimeIndex(dates)  ## assign datetime index
        df.drop(Data.date_header, axis=1, inplace=True)  ## unneeded columns
        df.columns.name = 'input variables'
        df.index.name = 'date'
        return df

    def modify(self, func, vars_to_adjust):
        """
        Apply a user defined function to one or more variable in the data file

        Args:
            func (function): function to apply to each variable in
                vars_to_adjust, it is called once per element, e.g.:

                >>> def f(x):
                ...     return np.sin(x)

            vars_to_adjust (list or tuple): collection of variable names to
                apply func to. A single name given as a string is treated as
                a one-element collection.

        Returns:
            None: data.data_frame is modified inplace
        """
        ## a bare string would otherwise be iterated character by character
        if isinstance(vars_to_adjust, str):
            vars_to_adjust = [vars_to_adjust]
        for v in vars_to_adjust:
            self.data_frame[v] = self.data_frame[v].apply(func)

    def write(self, out_path):
        """
        Writes the current state of the data to PRMS text format,
        particularly useful after modifying the data variable values in place.
        The header lines are copied unchanged from the base file; the hh, mm
        and sec columns are written as 0.

        Args:
            out_path (str): path to save the new PRMS data file

        Returns:
            None
        """
        variables = self.metadata['data_variables']
        startline = self.metadata['data_startline']

        ## reconstruct PRMS data file format on a copy so the date-indexed
        ## frame held by the instance is not altered
        df = self.data_frame[variables].copy()
        df['year'] = self.data_frame.index.year
        df['month'] = self.data_frame.index.month
        df['day'] = self.data_frame.index.day
        df['hh'] = 0
        df['mm'] = 0
        df['sec'] = 0
        df = df[Data.date_header + variables]

        ## read the header before opening the output, so that writing back to
        ## the base file itself does not truncate the header first
        header_lines = []
        with open(self.base_file) as base:
            for idx, line in enumerate(base):
                if idx >= startline:
                    break
                header_lines.append(line)

        with open(out_path, 'w') as outf:
            outf.writelines(header_lines)
            df.to_csv(outf, sep=' ', header=False,
                      index=False, na_rep=str(self.na_rep))