Skip to content

Commit b37a56b

Browse files
committed
Merge pull request ganglia#188 from Microway/merge-infiniband-support
Add a metric module for InfiniBand network fabrics
2 parents 74a4fd5 + 7f3e84e commit b37a56b

3 files changed

Lines changed: 759 additions & 0 deletions

File tree

network/infiniband/README.mkdn

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
InfiniBand monitoring plugin for gmond
2+
======================================
3+
4+
Installation instructions:
5+
6+
1. Copy ``python_modules/infiniband.py`` to ``{libdir}/ganglia/python_modules/``
7+
2. Copy ``conf.d/infiniband.pyconf`` to ``/etc/ganglia/conf.d``
8+
3. The ``gmond`` Ganglia daemon will need read/write access to the InfiniBand
9+
devices in order to perform these queries. To make this happen, you will
10+
need ``gmond`` to be running as some user other than ``nobody`` (e.g.,
11+
``ganglia``). For example, create an ``infiniband`` group and add the ``ganglia`` user to it:
12+
13+
* ``groupadd --system infiniband``
14+
* ``usermod --append --groups infiniband ganglia``
15+
* ``chgrp -R infiniband /dev/infiniband/``
16+
* To make these changes permanent, you may need custom udev rules. Create a
17+
file named ``/etc/udev/rules.d/90-ib.rules`` with the contents:
18+
19+
KERNEL=="umad*", NAME="infiniband/%k", GROUP="infiniband"
20+
KERNEL=="issm*", NAME="infiniband/%k", GROUP="infiniband"
21+
KERNEL=="ucm*", NAME="infiniband/%k", MODE="0666", GROUP="infiniband"
22+
KERNEL=="uverbs*", NAME="infiniband/%k", MODE="0666", GROUP="infiniband"
23+
KERNEL=="ucma", NAME="infiniband/%k", MODE="0666", GROUP="infiniband"
24+
KERNEL=="rdma_cm", NAME="infiniband/%k", MODE="0666", GROUP="infiniband"
25+
26+
* Ensure your gmond daemon is set to run as the ``ganglia`` user. Set
27+
``user = ganglia`` in the file ``/etc/ganglia/gmond.conf``
28+
* ``service gmond restart``
29+
30+
31+
By default, all metrics that can be detected for each InfiniBand port are
32+
collected. The device type (e.g., ``mlx4``), device number and port number will
33+
be appended to the end of each metric name. For example:
34+
35+
* ``ib_port_multicast_xmit_packets_mlx4_0_port1``
36+
* ``ib_port_multicast_xmit_packets_mlx4_0_port2``
37+
38+
39+
The following metrics have been implemented:
40+
41+
* ib_excessive_buffer_overrun_errors
42+
* ib_link_downed
43+
* ib_link_error_recovery
44+
* ib_local_link_integrity_errors
45+
* ib_port_rcv_constraint_errors
46+
* ib_port_rcv_data
47+
* ib_port_rcv_errors
48+
* ib_port_rcv_packets
49+
* ib_port_rcv_remote_physical_errors
50+
* ib_port_rcv_switch_relay_errors
51+
* ib_port_unicast_xmit_packets
52+
* ib_port_unicast_rcv_packets
53+
* ib_port_multicast_xmit_packets
54+
* ib_port_multicast_rcv_packets
55+
* ib_port_xmit_constraint_errors
56+
* ib_port_xmit_data
57+
* ib_port_xmit_discards
58+
* ib_port_xmit_packets
59+
* ib_symbol_error
60+
* ib_vl15_dropped
61+
* ib_rate
62+
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# Register the metric module named "infiniband" and declare it as a Python
# module (the README installs it as python_modules/infiniband.py).
modules {
2+
module {
3+
name = "infiniband"
4+
language = "python"
5+
}
6+
}
7+
8+
# Per-port InfiniBand traffic and error counters.
# Each name_match regex captures the trailing device/port suffix of the metric
# name (e.g. mlx4_0_port1), and \\1 re-inserts that suffix into the title.
# NOTE(review): per gmond.conf(5), collect_every and time_threshold are
# presumably in seconds -- confirm against the gmond documentation.
collection_group {
9+
collect_every = 15
10+
time_threshold = 45
11+
12+
# Traffic counters (data and packets).
metric {
13+
name_match = "ib_port_xmit_data_([\\S]+)"
14+
title = "Bytes Sent \\1"
15+
value_threshold = 1.0
16+
}
17+
18+
metric {
19+
name_match = "ib_port_rcv_data_([\\S]+)"
20+
title = "Bytes Received \\1"
21+
value_threshold = 1.0
22+
}
23+
24+
metric {
25+
name_match = "ib_port_xmit_packets_([\\S]+)"
26+
title = "Packets Sent \\1"
27+
value_threshold = 1.0
28+
}
29+
30+
metric {
31+
name_match = "ib_port_rcv_packets_([\\S]+)"
32+
title = "Packets Received \\1"
33+
value_threshold = 1.0
34+
}
35+
36+
metric {
37+
name_match = "ib_port_unicast_xmit_packets_([\\S]+)"
38+
title = "UniCast Packets Sent \\1"
39+
value_threshold = 1.0
40+
}
41+
42+
metric {
43+
name_match = "ib_port_unicast_rcv_packets_([\\S]+)"
44+
title = "UniCast Packets Received \\1"
45+
value_threshold = 1.0
46+
}
47+
48+
metric {
49+
name_match = "ib_port_multicast_xmit_packets_([\\S]+)"
50+
title = "MultiCast Packets Sent \\1"
51+
value_threshold = 1.0
52+
}
53+
54+
metric {
55+
name_match = "ib_port_multicast_rcv_packets_([\\S]+)"
56+
title = "MultiCast Packets Received \\1"
57+
value_threshold = 1.0
58+
}
59+
60+
# Link, error and drop counters.
metric {
61+
name_match = "ib_excessive_buffer_overrun_errors_([\\S]+)"
62+
title = "Buffer Overrun Errors \\1"
63+
value_threshold = 1.0
64+
}
65+
66+
metric {
67+
name_match = "ib_link_downed_([\\S]+)"
68+
title = "Link Downed \\1"
69+
value_threshold = 1.0
70+
}
71+
72+
metric {
73+
name_match = "ib_link_error_recovery_([\\S]+)"
74+
title = "Link Error Recoveries \\1"
75+
value_threshold = 1.0
76+
}
77+
78+
metric {
79+
name_match = "ib_local_link_integrity_errors_([\\S]+)"
80+
title = "Link Integrity Errors \\1"
81+
value_threshold = 1.0
82+
}
83+
84+
metric {
85+
name_match = "ib_port_rcv_constraint_errors_([\\S]+)"
86+
title = "Switch Receive Constraint Errors \\1"
87+
value_threshold = 1.0
88+
}
89+
90+
metric {
91+
name_match = "ib_port_rcv_errors_([\\S]+)"
92+
title = "Packet Errors \\1"
93+
value_threshold = 1.0
94+
}
95+
96+
metric {
97+
name_match = "ib_port_rcv_remote_physical_errors_([\\S]+)"
98+
title = "Bad Packets \\1"
99+
value_threshold = 1.0
100+
}
101+
102+
metric {
103+
name_match = "ib_port_rcv_switch_relay_errors_([\\S]+)"
104+
title = "Switch Relay Errors \\1"
105+
value_threshold = 1.0
106+
}
107+
108+
metric {
109+
name_match = "ib_port_xmit_constraint_errors_([\\S]+)"
110+
title = "Switch Transmit Constraint Errors \\1"
111+
value_threshold = 1.0
112+
}
113+
114+
metric {
115+
name_match = "ib_port_xmit_discards_([\\S]+)"
116+
title = "Packet Discards \\1"
117+
value_threshold = 1.0
118+
}
119+
120+
metric {
121+
name_match = "ib_symbol_error_([\\S]+)"
122+
title = "Symbol Errors \\1"
123+
value_threshold = 1.0
124+
}
125+
126+
metric {
127+
name_match = "ib_vl15_dropped_([\\S]+)"
128+
title = "Management Packet Drops \\1"
129+
value_threshold = 1.0
130+
}
131+
132+
}
133+
134+
# Port data rate (ib_rate), collected far less often than the counters
# (collect_every = 600 here versus 15 in the counter group).
# The name_match regex captures the trailing device/port suffix of the metric
# name, and \\1 re-inserts that suffix into the title.
collection_group {
135+
collect_every = 600
136+
time_threshold = 1200
137+
138+
metric {
139+
name_match = "ib_rate_([\\S]+)"
140+
title = "Data Rate \\1"
141+
value_threshold = 1.0
142+
}
143+
}
144+

0 commit comments

Comments
 (0)