-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathdata.py
More file actions
263 lines (224 loc) · 11.3 KB
/
data.py
File metadata and controls
263 lines (224 loc) · 11.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# -*- coding: utf-8 -*-
'''
data.py -- holds ``Data`` class for standard PRMS climate input data.
'''
import pandas as pd
from shutil import copyfile
class Data(object):
    """
    Access or create a PRMS climate input data file.

    The data can be loaded into (or assigned from) a date-time indexed
    ``pandas.DataFrame`` for data management, analysis and visualization.
    The class can also be used to build a new PRMS data file from user
    defined metadata and a ``pandas.DataFrame`` of PRMS datetime-indexed
    climatic forcing and observation variables.

    The class properties ``metadata`` and ``data_frame`` can be assigned
    later if no ``base_file`` is given on initialization, allowing for the
    creation of a PRMS climatic forcing file in a Python environment.

    Keyword Arguments:
        base_file (str, optional): path to standard PRMS data file
        na_rep (int, optional): how to represent missing values,
            default = -999

    Attributes:
        date_header (list): date and time header for PRMS data file
        valid_input_variables (tuple): valid hydro-climate variables for
            PRMS data file

    Note:
        If using the ``Data`` class to create a new data file, it is up to
        the user to ensure that the metadata and :class:`pandas.DataFrame`
        assigned are correct and compatible.
    """

    ## data file constant attributes
    date_header = ['year', 'month', 'day', 'hh', 'mm', 'sec']

    valid_input_variables = (
        'gate_ht',
        'humidity',
        'lake_elev',
        'pan_evap',
        'precip',
        'rain_day',
        'runoff',
        'snowdepth',
        'solrad',
        'tmax',
        'tmin',
        'wind_speed',
    )

    def __init__(self, base_file=None, na_rep=-999):
        """
        Store the file path and missing-value marker; nothing is read yet.

        The file is only parsed when ``metadata`` or ``data_frame`` is
        first accessed.
        """
        self.base_file = base_file
        self.na_rep = na_rep
        self._metadata = None
        self._data_frame = None

    @property
    def metadata(self):
        """
        :obj:`dict`: A property that gets and sets the header information
        from a standard PRMS climate input data file held in a Python
        dictionary. As a property it can be assigned directly to overwrite
        or create a new PRMS data file. As such the user is in control and
        must supply the correct syntax for PRMS standard data files, e.g.
        text lines before header should begin with "//". Here is an example
        of the information gathered and held in this attribute:

        Example:
            >>> data.metadata
            {
             'data_startline' : 6,
             'data_variables' : ['runoff 1', 'runoff 2', 'tmin', 'tmax',
                                 'precip'],
             'text_before_header' : "Title of data file\\n//some comments\\n
                                     runoff 2\\ntmin 1\\ntmax 1\\nprecip 1\\n
                                     ########################################\\n"
            }

        Note:
            The first line of the file is always treated as a free-text
            title and is never parsed as a variable declaration, even if it
            happens to start with a variable name. Parsing stops at the
            first line beginning with "#"; that line is included in
            ``text_before_header`` and the data rows follow it.

        Raises:
            ValueError: if metadata is accessed before it is assigned on a
                ``Data`` instance that was initialized without a PRMS data
                file, or if the file has no "#" line separating the header
                from the data.
            TypeError: if an object that is not a Python dictionary is
                assigned.
        """
        # to avoid overwriting pre-assigned data, check if already exists
        if isinstance(self._metadata, dict):
            return self._metadata
        if not self.base_file:
            raise ValueError('No data file was given on initialization')

        input_data_names = []
        header_lines = []
        data_startline = None
        with open(self.base_file, 'r') as inf:
            for idx, line in enumerate(inf):
                header_lines.append(line)
                if line.startswith('#'):
                    # end of header; data rows begin on the next line
                    data_startline = idx + 1
                    break
                if idx == 0 or line.startswith('/'):
                    # title line or comment line, nothing to parse
                    continue
                if line.startswith(Data.valid_input_variables):
                    # "<name> <count>": variable name and number of columns
                    fields = line.split()
                    name, count = fields[0], int(fields[1])
                    if count > 1:
                        # several time series of the same variable are
                        # labelled "<name> 1", "<name> 2", ...
                        input_data_names.extend(
                            '{var_name} {var_ind}'.format(
                                var_name=name, var_ind=i + 1
                            )
                            for i in range(count)
                        )
                    elif count == 1:
                        input_data_names.append(name)

        if data_startline is None:
            raise ValueError(
                'No "#" delimiter line found in data file: '
                '{}'.format(self.base_file)
            )

        self._metadata = {
            'data_startline': data_startline,
            'data_variables': input_data_names,
            'text_before_header': ''.join(header_lines),
        }
        return self._metadata

    @metadata.setter
    def metadata(self, dic):
        if not isinstance(dic, dict):
            raise TypeError(
                'Must assign a Python dictionary for new Data object/file metadata'
            )
        self._metadata = dic

    @property
    def data_frame(self):
        """
        A property that gets and sets the climatic forcing data for a
        standard PRMS climate input data file as a
        :class:`pandas.DataFrame`.

        Example:
            d is a Data instance, calling

            >>> d.data_frame
            input variables  runoff 1  runoff 2  runoff 3  precip  tmax  tmin
            date
            1996-12-27           0.54       1.6       NaN     0.0    46  32.0
            1996-12-28           0.65       1.6       NaN     0.0    45  24.0
            1996-12-29           0.80       1.6       NaN     0.0    44  28.0
            1996-12-30           0.90       1.6       NaN     0.0    51  33.0
            1996-12-31           1.00       1.7       NaN     0.0    47  32.0

            shows the date-indexed ``pd.DataFrame`` of the input data that
            is read from ``base_file`` on first access.

        Note:
            A DataFrame that was assigned, or already loaded, is always
            returned as is; the file is only read when no DataFrame exists
            yet. Only year, month and day are used to build the index.

        Raises:
            ValueError: if the attribute is accessed before either giving a
                PRMS data file on ``Data`` initialization or assigning a
                compatible date-indexed ``pandas.DataFrame`` of
                hydro-climate variables.
            TypeError: if a data type other than ``pandas.DataFrame`` is
                assigned.
        """
        # check this first so pre-assigned data is never overwritten by a
        # re-read of the base file
        if isinstance(self._data_frame, pd.DataFrame):
            return self._data_frame
        if not self.base_file:
            raise ValueError('No data base_file given on initialization, '
                             'therefore you must assign a DataFrame'
                             ' before accessing the .data_frame attribute!')

        metadata = self.metadata
        # sep=r'\s+' is the supported spelling of delim_whitespace=True
        df = pd.read_csv(
            self.base_file,
            header=None,
            skiprows=metadata['data_startline'],
            sep=r'\s+',
            na_values=[self.na_rep],
        )
        df.columns = Data.date_header + metadata['data_variables']
        # encode each date as the integer YYYYMMDD and parse it
        date = pd.to_datetime(
            df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d'
        )
        df.index = pd.DatetimeIndex(date)
        df.drop(Data.date_header, axis=1, inplace=True)  # unneeded columns
        df.columns.name = 'input variables'
        df.index.name = 'date'
        self._data_frame = df
        return self._data_frame

    @data_frame.setter
    def data_frame(self, df):
        if not isinstance(df, pd.DataFrame):
            raise TypeError(
                "Must assign a Pandas.DataFrame object for PRMS data input"
            )
        self._data_frame = df

    def modify(self, func, vars_to_adjust):
        """
        Apply a user defined function to one or more variable(s) in the
        data file.

        The ``modify`` method allows for inplace modification of one or
        more time series inputs in the data file based on a user defined
        function.

        Arguments:
            func (function): function to apply to each element of each
                variable in vars_to_adjust
            vars_to_adjust (list or tuple or str): collection of variable
                names to apply func to; a single name given as a string is
                treated as a one-element collection.

        Returns:
            None

        Example:
            Here is an example of loading a data file, modifying the
            temperature inputs (*tmin* and *tmax*) by adding two degrees to
            each element, and rewriting the modified data to disk,

            >>> d = Data('path_to_data_file')
            >>> def f(x):
                    return x + 2
            >>> d.modify(f,['tmax','tmin'])
            >>> d.write('data_temp_plus_2')
        """
        if isinstance(vars_to_adjust, str):
            # a bare string would otherwise be iterated character-wise
            vars_to_adjust = [vars_to_adjust]
        # loads the file if needed; raises ValueError if there is no data
        frame = self.data_frame
        for var in vars_to_adjust:
            frame[var] = frame[var].apply(func)

    def write(self, out_path):
        """
        Writes the current state of the ``Data`` to PRMS text format
        utilizing the ``Data.metadata`` and ``Data.data_frame`` instance
        properties. If ``Data.data_frame`` was never accessed or assigned
        new values then this method simply copies the original PRMS
        data file to ``out_path``.

        Arguments:
            out_path (str): full path to save or copy the current PRMS data
                in PRMS text format.

        Returns:
            None

        Note:
            The hour, minute and second columns are always written as 0,
            and missing values are written as ``na_rep``.

        Raises:
            ValueError: if the ``write`` method is called without assigning
                either an initial data (``base_file``) path or assigning
                correct ``metadata`` and ``data_frame`` properties.
        """
        # if file data was never accessed it is unchanged: plain copy
        if not isinstance(self._data_frame, pd.DataFrame):
            if not self.base_file:
                raise ValueError('No data base_file was given and'
                                 ' no data was assigned!')
            copyfile(self.base_file, out_path)
            return

        metadata = self.metadata
        variables = list(metadata['data_variables'])
        index = self._data_frame.index
        # work on an explicit copy so the date-indexed frame held by the
        # instance is never modified
        out = self._data_frame[variables].copy()
        out['year'] = index.year
        out['month'] = index.month
        out['day'] = index.day
        out['hh'] = 0
        out['mm'] = 0
        out['sec'] = 0
        out = out[Data.date_header + variables]

        with open(out_path, 'w') as outf:  # write comment header then data
            outf.write(metadata['text_before_header'])
            out.to_csv(outf, sep=' ', header=False, index=False,
                       na_rep=str(self.na_rep), lineterminator='\n')