Skip to content

Commit 1d9a0c0

Browse files
committed
Added Netapp API based volume latency and iop module
1 parent f6211b7 commit 1d9a0c0

3 files changed

Lines changed: 383 additions & 0 deletions

File tree

netapp_api/README.mkdn

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
NetApp Filer API metrics
2+
========================
3+
4+
This is a GMOND Python Module that gathers metrics from NetApp appliances via the Netapp Data ONTAP APIs.
5+
The API allows counter access to many more metrics than available through SNMP.
6+
7+
This module currently gathers per volume Read/Write/Average IOPs and Latency and handles multiple filers.
8+
9+
## DEPENDS
10+
* NetApp Manageability SDK 5.0 (download from now.netapp.com to /opt/netapp)
11+
12+
## USAGE
13+
* Save the netapp_api.pyconf into /etc/ganglia/conf.d
14+
* Save the netapp_api.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules.
15+
* Update the username, password, IP and filer name in the `params` dict inside `metric_init()` in netapp_api.py.
16+
* Restart gmond and the volume latency & iop metrics should appear in ganglia.
17+
18+
## AUTHOR
19+
* Author: Evan Fraser <[email protected]>
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
modules {
2+
module {
3+
name = "netapp_api"
4+
language = "python"
5+
}
6+
}
7+
#/* Collection groups for the
8+
# example python module */
9+
collection_group {
10+
collect_every = 15
11+
time_threshold = 70
12+
metric {
13+
name_match = "(.+)latency"
14+
}
15+
metric {
16+
name_match = "(.+)ops"
17+
}
18+
}
Lines changed: 346 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,346 @@
1+
#!/usr/bin/python
2+
import sys
3+
import time
4+
import pprint
5+
import unicodedata
6+
import os
7+
8+
sys.path.append("/opt/netapp/lib/python/NetApp")
9+
from NaServer import *
10+
11+
# Metric descriptors handed back to gmond by metric_init().
descriptors = []
# Module-level parameter placeholder.
params = {}
# Filer configurations keyed by an arbitrary filer id; filled in by metric_init().
filerdict = {}

# Most recent counter snapshot: when it was taken and the raw values by metric name.
FASMETRICS = dict(time=0, data={})
# Previous snapshot; get_metrics() diffs the two to derive rates and latencies.
LAST_FASMETRICS = FASMETRICS.copy()

# Minimum interval (seconds) between two rounds of queries against the filers.
FASMETRICS_CACHE_MAX = 10
21+
22+
# Counters requested from the filer for every volume.
_FAS_VOLUME_COUNTERS = (
    'total_ops',
    'avg_latency',
    'read_ops',
    'read_latency',
    'write_ops',
    'write_latency',
)

# Counters reported as a per-second rate.
_FAS_OPS_COUNTERS = ('total_ops', 'read_ops', 'write_ops')

# Latency counter -> the ops counter it has to be averaged over.
_FAS_LATENCY_BASES = (
    ('avg_latency', 'total_ops'),
    ('read_latency', 'read_ops'),
    ('write_latency', 'write_ops'),
)


def _fas_fetch_volume_counters(filer_cfg, max_records):
    """Query one filer and return its raw volume counters.

    filer_cfg is one entry of ``filerdict`` (keys 'ipaddr', 'user',
    'password', 'name').  The result maps
    ``<filer name>_vol_<volume>_<counter>`` to the counter value as a float.

    NOTE(review): API failures still terminate the process through
    sys.exit(2), exactly as before; inside gmond this probably deserves a
    softer failure mode - confirm before changing.
    """
    server = NaServer(filer_cfg['ipaddr'], 1, 3)
    for configure, setting in ((server.set_transport_type, 'HTTPS'),
                               (server.set_style, 'LOGIN')):
        out = configure(setting)
        if out and out.results_errno() != 0:
            print("Connection to filer failed: " + out.results_reason() + "\n")
            sys.exit(2)
    server.set_admin_user(filer_cfg['user'], filer_cfg['password'])

    # Only the "volume" performance object is gathered at present.
    request = NaElement("perf-object-get-instances-iter-start")
    request.child_add_string("objectname", "volume")
    wanted = NaElement("counters")
    for counter_name in _FAS_VOLUME_COUNTERS:
        wanted.child_add_string("counter", counter_name)
    request.child_add(wanted)

    out = server.invoke_elem(request)
    if out.results_status() == "failed":
        print(out.results_reason() + "\n")
        sys.exit(2)

    iter_tag = out.child_get_string("tag")
    prefix = filer_cfg['name'] + '_vol_'
    values = {}

    # Page through the iterator until the filer reports an empty batch.
    num_records = 1
    while int(num_records) != 0:
        request = NaElement("perf-object-get-instances-iter-next")
        request.child_add_string("tag", iter_tag)
        request.child_add_string("maximum", max_records)
        out = server.invoke_elem(request)

        if out.results_status() == "failed":
            print(out.results_reason() + "\n")
            sys.exit(2)

        num_records = out.child_get_int("records")
        if num_records > 0:
            for inst in out.child_get("instances").children_get():
                inst_name = unicodedata.normalize(
                    'NFKD', inst.child_get_string("name")).encode('ascii', 'ignore')
                for counter in inst.child_get("counters").children_get():
                    counter_name = unicodedata.normalize(
                        'NFKD', counter.child_get_string("name")).encode('ascii', 'ignore')
                    values[prefix + inst_name + '_' + counter_name] = float(
                        counter.child_get_string("value"))
    return values


def _fas_ops_per_second(name):
    """Return the per-second increase of counter ``name`` between the two snapshots.

    Returns 0 when either snapshot lacks the counter (e.g. on the first
    poll) or when the counter went backwards.
    """
    try:
        delta = (float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])
                 / (FASMETRICS['time'] - LAST_FASMETRICS['time']))
    except (KeyError, ZeroDivisionError):
        return 0
    if delta < 0:
        print("Less than 0")
        return 0
    return delta


def _fas_average_latency(name, ops_name):
    """Return the average latency over the last interval.

    Computed as (T2_lat - T1_lat) / (T2_ops - T1_ops).  Returns 0 when a
    counter is missing from either snapshot or no operations happened.
    """
    current = FASMETRICS['data']
    previous = LAST_FASMETRICS['data']
    try:
        # Original author's note: "div 100 to change to ms".
        # NOTE(review): confirm the divisor against the counter's reported unit.
        return float((current[name] - previous[name])
                     / (current[ops_name] - previous[ops_name])) / 100
    except (KeyError, ZeroDivisionError):
        return 0


def get_metrics(name):
    """gmond callback: return the current value of metric ``name``.

    Raw counters for all filers in ``filerdict`` are re-read at most once
    every FASMETRICS_CACHE_MAX seconds; between refreshes the cached
    snapshot is used.  Ops metrics are returned as operations per second
    and latency metrics as the average latency per operation, both derived
    from the difference between the current and the previous snapshot.

    The metric kind is taken from the *suffix* of ``name`` (the counter name
    appended by define_metrics), so filer or volume names that happen to
    contain a counter name can no longer select the wrong calculation.

    Returns 0 for unknown metric names and whenever a value cannot be
    computed yet.
    """
    global FASMETRICS, LAST_FASMETRICS
    max_records = 10

    if (time.time() - FASMETRICS['time']) > FASMETRICS_CACHE_MAX:
        metrics = {}
        for filer in filerdict:
            metrics.update(_fas_fetch_volume_counters(filerdict[filer], max_records))
        # Rotate the snapshots: current becomes previous, fresh data becomes current.
        LAST_FASMETRICS = dict(FASMETRICS)
        FASMETRICS = {
            'time': time.time(),
            'data': metrics,
        }

    for ops_counter in _FAS_OPS_COUNTERS:
        if name.endswith('_' + ops_counter):
            return _fas_ops_per_second(name)

    for latency_counter, ops_counter in _FAS_LATENCY_BASES:
        if name.endswith('_' + latency_counter):
            # Swap only the trailing counter name to find the matching ops metric.
            ops_name = name[:-len(latency_counter)] + ops_counter
            return _fas_average_latency(name, ops_name)

    return 0
165+
166+
167+
168+
def create_desc(skel, prop):
    """Build a metric descriptor from a skeleton and per-metric overrides.

    Returns a new dict holding a shallow copy of ``skel`` with every
    key/value pair of ``prop`` applied on top.  Neither argument is modified.
    """
    desc = skel.copy()
    # dict.update works on both Python 2 and 3, unlike dict.iteritems().
    desc.update(prop)
    return desc
173+
174+
# (counter name fragment, units, description, group) for every volume counter
# that becomes a gmond metric.  Order matters: the first matching fragment wins.
_VOLUME_DESCRIPTOR_PROPS = (
    ('total_ops', 'iops', 'volume iops', 'iops'),
    ('avg_latency', 'ms', 'volume avg latency', 'latency'),
    ('read_ops', 'iops', 'volume read iops', 'iops'),
    ('read_latency', 'ms', 'volume read latency', 'latency'),
    ('write_ops', 'iops', 'volume write iops', 'iops'),
    ('write_latency', 'ms', 'volume write latency', 'latency'),
)


def define_metrics(Desc_Skel, params):
    """Discover the volumes on every filer and build their metric descriptors.

    Desc_Skel is the descriptor template passed to create_desc(); params
    maps a filer id to a dict with the keys 'ipaddr', 'user', 'password'
    and 'name'.  For each volume counter returned by the filer one
    descriptor named ``<filer name>_vol_<volume>_<counter>`` is appended to
    the module-level ``descriptors`` list, which is also the return value.

    Any API failure prints the reason and exits the process with status 2.
    """
    max_records = 10
    for filer in params:
        filer_cfg = params[filer]
        s = NaServer(filer_cfg['ipaddr'], 1, 3)

        out = s.set_transport_type('HTTPS')
        if out and out.results_errno() != 0:
            print("Connection to filer failed: " + out.results_reason() + "\n")
            sys.exit(2)

        out = s.set_style('LOGIN')
        if out and out.results_errno() != 0:
            print("Connection to filer failed: " + out.results_reason() + "\n")
            sys.exit(2)
        s.set_admin_user(filer_cfg['user'], filer_cfg['password'])

        # Hard coded volume, only volume stats gathered at present.
        perf_in = NaElement("perf-object-get-instances-iter-start")
        perf_in.child_add_string("objectname", "volume")
        counters = NaElement("counters")
        for wanted, _units, _description, _group in _VOLUME_DESCRIPTOR_PROPS:
            counters.child_add_string("counter", wanted)
        perf_in.child_add(counters)

        out = s.invoke_elem(perf_in)
        if out.results_status() == "failed":
            print(out.results_reason() + "\n")
            sys.exit(2)

        iter_tag = out.child_get_string("tag")
        filername = filer_cfg['name']
        # Same for every metric of this filer, so build it once.
        spoof_host = filer_cfg['ipaddr'] + ':' + filername

        # Page through the iterator until the filer reports an empty batch.
        num_records = 1
        while int(num_records) != 0:
            perf_in = NaElement("perf-object-get-instances-iter-next")
            perf_in.child_add_string("tag", iter_tag)
            perf_in.child_add_string("maximum", max_records)
            out = s.invoke_elem(perf_in)

            if out.results_status() == "failed":
                print(out.results_reason() + "\n")
                sys.exit(2)

            num_records = out.child_get_int("records")
            if num_records > 0:
                for inst in out.child_get("instances").children_get():
                    inst_name = unicodedata.normalize(
                        'NFKD', inst.child_get_string("name")).encode('ascii', 'ignore')
                    for counter in inst.child_get("counters").children_get():
                        counter_name = unicodedata.normalize(
                            'NFKD', counter.child_get_string("name")).encode('ascii', 'ignore')
                        for fragment, units, description, group in _VOLUME_DESCRIPTOR_PROPS:
                            if fragment in counter_name:
                                descriptors.append(create_desc(Desc_Skel, {
                                    "name": filername + '_vol_' + inst_name + '_' + counter_name,
                                    "units": units,
                                    "description": description,
                                    "spoof_host": spoof_host,
                                    "groups": group,
                                }))
                                break

    return descriptors
293+
294+
def metric_init(params):
    """gmond entry point: discover the filers' volumes and return the descriptors.

    params is the parameter dict supplied by the caller.  When it is a
    non-empty mapping of filer id -> dict containing 'name', 'ipaddr',
    'user' and 'password', those filers are used.  Otherwise (for example
    the empty dict gmond passes when the .pyconf defines no params) the
    built-in example filer below is used, as before.

    Side effects: prints the received parameters, stores the filer
    configuration in the module-level ``filerdict`` and the resulting
    descriptor list in ``descriptors``.
    """
    global descriptors, filerdict
    print('[netapp_stats] Received the following parameters')
    pprint.pprint(params)

    # Built-in fallback configuration; edit to match your environment.
    default_filers = {
        'filer1': {
            'name': 'filer1.localdomain',
            'ipaddr': '192.168.1.100',
            'user': 'root',
            'password': 'password',
        },
    }

    # Previously the argument was unconditionally overwritten, so filers
    # supplied by the caller were silently ignored.
    required_keys = ('name', 'ipaddr', 'user', 'password')
    caller_supplied = bool(params) and all(
        isinstance(cfg, dict) and all(key in cfg for key in required_keys)
        for cfg in params.values()
    )
    filers = params if caller_supplied else default_filers

    filerdict = dict(filers)
    Desc_Skel = {
        'name': 'XXX',
        'call_back': get_metrics,
        'time_max': 60,
        'value_type': 'double',
        'format': '%0f',
        'units': 'XXX',
        'slope': 'both',
        'description': 'XXX',
        'groups': 'netiron',
        'spoof_host': 'XXX',
    }

    # Run define_metrics
    descriptors = define_metrics(Desc_Skel, filers)

    return descriptors
325+
326+
# For CLI Debugging:
327+
if __name__ == '__main__':
    # Stand-alone debugging: initialise the module, dump the descriptors,
    # then poll every metric forever with a 5 second pause between rounds.
    params = {
        'filer1': {
            'name': 'filer1.localdomain',
            'ipaddr': '192.168.1.100',
            'user': 'root',
            'password': 'password',
        },
    }
    descriptors = metric_init(params)
    pprint.pprint(descriptors)
    while True:
        for desc in descriptors:
            metric_name = desc['name']
            value = desc['call_back'](metric_name)
            print('value for %s is %.2f' % (metric_name, value))
        print('Sleeping 5 seconds')
        time.sleep(5)

0 commit comments

Comments
 (0)