From 93ff7e1cb6bf5976c6f28a08c1e62c1faf1dce42 Mon Sep 17 00:00:00 2001 From: wvcollenburg <50706527+wvcollenburg@users.noreply.github.com> Date: Mon, 27 Oct 2025 22:02:40 +0100 Subject: [PATCH 1/4] Initial upload A yaml that can be imported to zabbix to add monitor data to it. See readme for full info --- specific_task/ZabbixPlugin/README.md | 92 ++++ .../Scale_Computing_Hypercore.yaml | 447 ++++++++++++++++++ 2 files changed, 539 insertions(+) create mode 100644 specific_task/ZabbixPlugin/README.md create mode 100644 specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml diff --git a/specific_task/ZabbixPlugin/README.md b/specific_task/ZabbixPlugin/README.md new file mode 100644 index 0000000..331af8f --- /dev/null +++ b/specific_task/ZabbixPlugin/README.md @@ -0,0 +1,92 @@ +# Scale Computing HyperCore Monitoring Template + +This Zabbix template is designed to monitor a Scale Computing HyperCore cluster by directly querying its REST API (v1). It utilizes the **HTTP Agent** and **Low-Level Discovery (LLD)** to automatically monitor nodes, virtual machines (VMs), and physical drives, including real-time performance metrics and disposition status. + +This template has been validated on **Zabbix 7.0** and is engineered to handle complex JSON data and array parsing errors common in API integrations. + +--- + +## 🚀 Usage and Setup + +### 1. Prerequisites + +1. **Zabbix Version:** Zabbix 5.4 or newer (optimized for 7.0). + +2. **API Access:** A valid Scale Computing HyperCore user account with read-only API access. + +### 2. Import Template + +1. Go to **Data collection** -> **Templates** in the Zabbix frontend. + +2. Click **Import** (top right corner). + +3. Select the YAML template file (e.g., `template_scale_api_final.yaml`). + +4. Click **Import**. + +### 3. Configure Host and Macros + +The template requires three mandatory host macros to successfully authenticate and connect to the HyperCore API. + +1. Go to **Data collection** -> **Hosts**. + +2. 
Select the host representing your Scale Computing cluster (or create a new one). + +3. Go to the **Templates** tab and link the `Scale Computing HyperCore by HTTP` template. + +4. Go to the **Macros** tab and set the following values: + +| Macro | Type | Example Value | Description | +| :---- | :---- | :---- | :---- | +| **{$API_URL}** | Text | https://172.16.0.241 | The base URL of the HyperCore API (must include http:// or https://). **Do not include /rest/v1.** | +| **{$API_USER}** | Text | api_reader | Username for Basic Authentication. | +| **{$API_PASS}** | **Secret text** | P@$$w0rdS3curE! | Password for Basic Authentication. (Must be stored as Secret text). | + +5. Click **Update**. The master items should turn green within one minute, and discovery should begin shortly thereafter. + +--- + +## 📊 Monitored Metrics (LLD) + +The template utilizes four master HTTP Agent items to retrieve data and three Low-Level Discovery rules to dynamically create items for each unique resource found. + +### 1. Node Discovery (Cluster Members) + +| LLD Macro | Item Prototype | Unit | Description | +| :-------- | :------------- | :--- | :---------- | +| {##NODE_NAME} | **CPU Usage** | % | Current CPU utilization of the node. | +| | **Memory Usage** | % | Current total memory utilization of the node. | +| | **Network Status** | Char | Network connectivity status (ONLINE, OFFLINE). | +| | **Disposition** | Char | Node status regarding cluster participation (IN, EVACUATED, OUT). | + +### 2. VM Discovery (Virtual Machines) + +| LLD Macro | Item Prototype | Unit | Description | +| :-------- | :------------- | :--- | :---------- | +| {##VM_NAME} | **State** | Char | Current power state of the VM (RUNNING, SHUTOFF, etc.). | +| | **Guest Agent Status** | Char | Status of the Scale Guest Agent (AVAILABLE, UNAVAILABLE). | +| | **CPU Usage** | % | Current CPU utilization by the VM. | +| | **Network RX Rate** | bps | Incoming network traffic rate. 
| +| | **Network TX Rate** | bps | Outgoing network traffic rate. | +| | **Total Disk Capacity** | B | Logical capacity reserved for the VM (sum of all virtual disks). | +| | **Disk Used Allocation** | B | Actual disk space used (allocated) by the VM on shared storage. | + +### 3. Physical Drive Discovery + +| LLD Macro | Item Prototype | Unit | Description | +| :-------- | :------------- | :--- | :---------- | +| {##DRIVE_SN} | **Health Status** | Float | Drive health status (0=Unhealthy, 1=Healthy). Uses "Zabbix boolean" Value Map. | +| | **Temperature** | C | Current reported drive temperature. | +| | **Error Count** | errors | Total count of drive errors (reallocated sectors, etc.). | + +### 🚨 Trigger Thresholds + +* **Node Offline:** High priority if **Network Status is not ONLINE**. + +* **Node Disposition:** Warning if **Disposition is not IN**. + +* **Utilization:** Warning if Node CPU or Memory Usage average exceeds **90%** over 5 minutes. + +* **Drive Health:** High priority if **Health Status is False (0)**. + +* **Guest Agent:** Warning if **Guest Agent Status is not AVAILABLE** while the VM is running. \ No newline at end of file diff --git a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml new file mode 100644 index 0000000..3537326 --- /dev/null +++ b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml @@ -0,0 +1,447 @@ +zabbix_export: + version: '7.0' + template_groups: + - uuid: 4b1ecf81a02a4f04b82a91d650499d4b # Standard UUID for Templates/Applications + name: Templates/Applications + templates: + - uuid: f68fd423a81e4d3e8aa04f3a9bad61fe # Placeholder UUID - Regenerate locally + template: Template Scale Computing HyperCore API # Technical Name + name: Scale Computing HyperCore by HTTP # User-friendly Name + description: |- + Monitors Scale Computing HyperCore infrastructure (Nodes, VMs, Drives) via its REST API. + + Setup: + 1. 
Link this template to the host representing the HyperCore cluster. + 2. Set host macros for API access ({$API_URL}, {$API_USER}, {$API_PASS}). + groups: + - name: Templates/Applications + macros: + - macro: '{$API_URL}' + description: Base URL of the HyperCore API (e.g., https://your-cluster-ip) + - macro: '{$API_USER}' + description: Username for API Basic Authentication + - macro: '{$API_PASS}' + type: SECRET_TEXT + description: Password for API Basic Authentication + items: + - uuid: 84feac7ac823493d825e64f48edd6812 + name: 'HyperCore API: Get All Nodes' + type: HTTP_AGENT + key: hypercore.api.get[nodes] + delay: 1m + history: 1h + trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/Node' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + status: ENABLED + description: Master item retrieving all node information and statistics. + - uuid: 9cb100fb52d4477b80b2b25b26545509 + name: 'HyperCore API: Get All VMs' + type: HTTP_AGENT + key: hypercore.api.get[vms] + delay: 5m + history: 1h + trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/VirDomain' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + status: ENABLED + description: Master item retrieving all VM configuration data. + - uuid: efb3953074c3406ca0cf740ea03457f3 + name: 'HyperCore API: Get All VM Stats' + type: HTTP_AGENT + key: hypercore.api.get[vmstats] + delay: 1m + history: 1h + trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/VirDomainStats' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + status: ENABLED + description: Master item retrieving all VM performance metrics. 
+ - uuid: bbafb36a8b944fb38688373ee79b5e80 + name: 'HyperCore API: Get All Drives' + type: HTTP_AGENT + key: hypercore.api.get[drives] + delay: 5m + history: 1h + trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/Drive' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + status: ENABLED + description: Master item retrieving all physical drive status data. + discovery_rules: + - uuid: 053a6dca55824fdd82f6a751c9b5bdad + name: Node Discovery + type: DEPENDENT + key: hypercore.nodes.discovery + delay: '0' + master_item: + key: hypercore.api.get[nodes] + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + lld_macro_paths: + - lld_macro: '{#NODE_ID}' + path: $.uuid + - lld_macro: '{#NODE_NAME}' + path: $.lanIP # Using LAN IP as Node Name, change path if another field is better + item_prototypes: + - uuid: a02204864d3c4f4b86e6bbd0bf1bf1a9 + name: 'Node {#NODE_NAME}: CPU Usage' + type: DEPENDENT + key: hypercore.node.cpu_usage[{#NODE_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: '%' + master_item: + key: hypercore.api.get[nodes] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#NODE_ID}')].cpuUsage.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: c46df85e825d4302a109e99072b08ad1 + name: 'Node {#NODE_NAME}: Memory Usage (%)' + type: DEPENDENT + key: hypercore.node.mem_usage_pct[{#NODE_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: '%' + master_item: + key: hypercore.api.get[nodes] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#NODE_ID}')].memUsagePercentage.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: 36f75e5b606446eeb296a2b0eee20b4a + name: 'Node {#NODE_NAME}: Network Status' + type: DEPENDENT + key: hypercore.node.network_status[{#NODE_ID}] + delay: '0' + history: 7d + value_type: CHAR + master_item: + key: hypercore.api.get[nodes] + preprocessing: + - type: JSONPATH + parameters: + - 
"$[?(@.uuid == '{#NODE_ID}')].networkStatus.first()" # FIX: Added .first() + error_handler: DISCARD_VALUE + - uuid: 6e81f4e7ead843ef84a5a231192eb1ea + name: 'Node {#NODE_NAME}: Disposition' + type: DEPENDENT + key: hypercore.node.disposition[{#NODE_ID}] + delay: '0' + history: 7d + value_type: CHAR + master_item: + key: hypercore.api.get[nodes] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#NODE_ID}')].currentDisposition.first()" # FIX: Added .first() + error_handler: DISCARD_VALUE + trigger_prototypes: + - uuid: c80f2f1128ed431cb9ba9758b68162e6 + expression: 'last(/Template Scale Computing HyperCore API/hypercore.node.network_status[{#NODE_ID}])<>"ONLINE"' + name: 'Node {#NODE_NAME} is offline' + priority: HIGH + description: The network status for node {#NODE_NAME} is not 'ONLINE'. + - uuid: 87e3116b4afe424187e158ed1d1f089b + expression: 'last(/Template Scale Computing HyperCore API/hypercore.node.disposition[{#NODE_ID}])<>"IN"' + name: "Node {#NODE_NAME} has unusual status (not 'IN')" + priority: WARNING + description: Node {#NODE_NAME} disposition is {ITEM.VALUE} (not 'IN'). This might indicate maintenance or evacuation. + - uuid: c4e8bd7a82474680b3912074a55abefc + expression: 'avg(/Template Scale Computing HyperCore API/hypercore.node.cpu_usage[{#NODE_ID}],5m)>90' + name: 'Node {#NODE_NAME} CPU utilization is high' + priority: WARNING + description: Average CPU usage on node {#NODE_ID}] exceeded 90% for 5 minutes. + - uuid: d29ec4afdd18493098f3c8bea7b146a5 + expression: 'avg(/Template Scale Computing HyperCore API/hypercore.node.mem_usage_pct[{#NODE_ID}],5m)>90' + name: 'Node {#NODE_NAME} memory utilization is high' + priority: WARNING + description: Average memory usage on node {#NODE_ID} exceeded 90% for 5 minutes. 
+ - uuid: 37d39874c311448fa058f881b3b66c16 + name: VM Discovery + type: DEPENDENT + key: hypercore.vms.discovery + delay: '0' + master_item: + key: hypercore.api.get[vms] + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + lld_macro_paths: + - lld_macro: '{#VM_ID}' + path: $.uuid + - lld_macro: '{#VM_NAME}' + path: $.name + item_prototypes: + - uuid: 7fd5a01b4a7d4c27b9c531990310a09f + name: 'VM {#VM_NAME}: State' + type: DEPENDENT + key: hypercore.vm.state[{#VM_ID}] + delay: '0' + history: 7d + value_type: CHAR + master_item: + key: hypercore.api.get[vms] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].state.first()" # FIX: Added .first() + error_handler: DISCARD_VALUE + - uuid: 16d192352fa94ea4a5db8d1ea01a6c5c + name: 'VM {#VM_NAME}: Guest Agent Status' + type: DEPENDENT + key: hypercore.vm.guest_agent[{#VM_ID}] + delay: '0' + history: 7d + value_type: CHAR + master_item: + key: hypercore.api.get[vms] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].guestAgentState.first()" # FIX: Added .first() + error_handler: DISCARD_VALUE + # --- DISK USAGE ITEMS --- + - uuid: 4ece83a77f9e4c53b2c19cfd5c920ab5 + name: 'VM {#VM_NAME}: Total Disk Capacity (Bytes)' + type: DEPENDENT + key: hypercore.vm.disk.capacity[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: B + master_item: + key: hypercore.api.get[vms] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].capacity.sum()" # Capacity sum + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: 8a7cb426e3a24d639dc59904c5c706f7 + name: 'VM {#VM_NAME}: Disk Used Allocation (Bytes)' + type: DEPENDENT + key: hypercore.vm.disk.used_allocated[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: B + master_item: + key: hypercore.api.get[vms] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].allocation.sum()" # Allocation sum + 
error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: a2fa0be1a2d84b0faac81d8765fe37f1 + name: 'VM {#VM_NAME}: Disk Allocation Growth Rate (Bps)' + type: DEPENDENT + key: hypercore.vm.disk.used_allocated.rate[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: Bps + master_item: + key: hypercore.api.get[vms] # Master item MUST be a main fetch item. + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].allocation.sum()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - type: CHANGE_PER_SECOND + parameters: [] + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + # CPU/Network stats items query VM Stats master item (vmstats) + - uuid: f6f436a31ad9434c8b8db29319c87ed4 + name: 'VM {#VM_NAME}: CPU Usage' + type: DEPENDENT + key: hypercore.vm.cpu_usage[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: '%' + master_item: + key: hypercore.api.get[vmstats] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].cpuUsage.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: d3b2a3f70b74489ea0f8a53e67c6c1d5 + name: 'VM {#VM_NAME}: Network RX Rate' + type: DEPENDENT + key: hypercore.vm.net_rx[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: bps + master_item: + key: hypercore.api.get[vmstats] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].rxBitRate.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: 829399b2d8ae4a9b9dd45e01f675a87f + name: 'VM {#VM_NAME}: Network TX Rate' + type: DEPENDENT + key: hypercore.vm.net_tx[{#VM_ID}] + delay: '0' + history: 7d + value_type: FLOAT + units: bps + master_item: + key: hypercore.api.get[vmstats] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#VM_ID}')].txBitRate.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + trigger_prototypes: + - uuid: 9546a4651d54422099a584d6acaba2cf 
+ expression: 'last(/Template Scale Computing HyperCore API/hypercore.vm.state[{#VM_ID}])<>"RUNNING"' + name: 'VM {#VM_NAME} is not running' + priority: INFO + description: VM {#VM_NAME} state is {ITEM.VALUE} (not 'RUNNING'). + - uuid: 4ab5e0503c5a43288b97461c4017cad7 + expression: 'last(/Template Scale Computing HyperCore API/hypercore.vm.guest_agent[{#VM_ID}])<>"AVAILABLE" and last(/Template Scale Computing HyperCore API/hypercore.vm.state[{#VM_ID}])="RUNNING"' + name: 'VM {#VM_NAME} Guest Agent is unavailable' + priority: WARNING + description: The Guest Agent on VM {#VM_NAME} is not responding, but the VM is running. + - uuid: cc0870c3c5544f709b5edb829fb2b837 + name: Physical Drive Discovery + type: DEPENDENT + key: hypercore.drives.discovery + delay: '0' + master_item: + key: hypercore.api.get[drives] + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + lld_macro_paths: + - lld_macro: '{#DRIVE_ID}' + path: $.uuid + - lld_macro: '{#DRIVE_SN}' + path: $.serialNumber + - lld_macro: '{#DRIVE_SLOT}' + path: $.slot + item_prototypes: + - uuid: 220b4f9765bf4e21b6c0ed24b356cdf5 + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Health Status' + type: DEPENDENT + key: hypercore.drive.healthy[{#DRIVE_ID}] + delay: '0' + history: 7d + value_type: FLOAT # Using FLOAT for 0/1, value map handles display + master_item: + key: hypercore.api.get[drives] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#DRIVE_ID}')].isHealthy.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - type: BOOL_TO_DECIMAL + parameters: [] + valuemap: + name: Zabbix boolean + - uuid: ec9eb41d5d1e44d78e91a7d2e2c762e8 + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Temperature' + type: DEPENDENT + key: hypercore.drive.temp[{#DRIVE_ID}] + delay: '0' + history: 7d + value_type: FLOAT # API returns integer, float handles it fine + units: C + master_item: + key: hypercore.api.get[drives] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == 
'{#DRIVE_ID}')].temperature.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + - uuid: cff2f1fd0c3c46c1a297f92fdb07361d + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' + type: DEPENDENT + key: hypercore.drive.errors[{#DRIVE_ID}] + delay: '0' + history: 7d + value_type: FLOAT # API returns integer, float handles it fine + units: errors + master_item: + key: hypercore.api.get[drives] + preprocessing: + - type: JSONPATH + parameters: + - "$[?(@.uuid == '{#DRIVE_ID}')].errorCount.first()" + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + trigger_prototypes: + - uuid: a15325f148fe4087aceee5c69ba984fb + expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.healthy[{#DRIVE_ID}])=0' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' + priority: HIGH + description: The 'isHealthy' status for drive {#DRIVE_SN} is 'false'. The drive might need replacement. + - uuid: 94661649b86a41dcac12507a3320b62e + expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.errors[{#DRIVE_ID}])>0' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is reporting errors' + priority: WARNING + description: Drive {#DRIVE_SN} has reported {ITEM.VALUE} errors. + dependencies: + - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' + expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.healthy[{#DRIVE_ID}])=0' + - uuid: 70f9d44495014c7694a56779016d7489 + expression: 'avg(/Template Scale Computing HyperCore API/hypercore.drive.temp[{#DRIVE_ID}],5m)>65' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) temperature is high' + priority: AVERAGE + description: The temperature of drive {#DRIVE_SN} is {ITEM.VALUE}C, exceeding the threshold (65C). 
+ valuemaps: + - uuid: 6787163ebbed42369966666ec415ec35 # Placeholder UUID - Regenerate locally + name: 'Zabbix boolean' + mappings: + - value: '0' + newvalue: 'False' + - value: '1' + newvalue: 'True' From 28ffb972a7575d776b44788b68c62b64e80e45ca Mon Sep 17 00:00:00 2001 From: wvcollenburg <50706527+wvcollenburg@users.noreply.github.com> Date: Tue, 28 Oct 2025 16:06:38 +0100 Subject: [PATCH 2/4] each discovered item becomes a host Made a change that will allow each node and each vm that is discovered to be registered as a host, with its own sub-items. This makes navigation a lot easier --- specific_task/ZabbixPlugin/README.md | 125 ++--- .../Scale_Computing_Hypercore.yaml | 447 ------------------ .../Scale_Computing_Hypercore_Zabbix.yaml | 441 +++++++++++++++++ 3 files changed, 493 insertions(+), 520 deletions(-) delete mode 100644 specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml create mode 100644 specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml diff --git a/specific_task/ZabbixPlugin/README.md b/specific_task/ZabbixPlugin/README.md index 331af8f..d87f05d 100644 --- a/specific_task/ZabbixPlugin/README.md +++ b/specific_task/ZabbixPlugin/README.md @@ -1,92 +1,71 @@ -# Scale Computing HyperCore Monitoring Template +# Scale Computing HyperCore by HTTP Monitoring for Zabbix -This Zabbix template is designed to monitor a Scale Computing HyperCore cluster by directly querying its REST API (v1). It utilizes the **HTTP Agent** and **Low-Level Discovery (LLD)** to automatically monitor nodes, virtual machines (VMs), and physical drives, including real-time performance metrics and disposition status. +This Zabbix monitoring solution uses the Scale Computing HyperCore REST API to automatically discover and monitor your cluster's Nodes, VMs, and Physical Drives. -This template has been validated on **Zabbix 7.0** and is engineered to handle complex JSON data and array parsing errors common in API integrations.
+It uses a "host-per-object" model, meaning Zabbix will create an individual host for each discovered Node and VM, giving you a clean, organized view of your infrastructure. ---- +## Features -## 🚀 Usage and Setup +This solution consists of three templates: -### 1. Prerequisites +* **`Template Scale Computing HyperCore API` (Main Template):** + * This is the *only* template you link to your main cluster host. + * It performs Low-Level Discovery (LLD) to find all Nodes and VMs. + * It creates a new Zabbix host for each Node, linking it to the `Template Scale Computing Node`. + * It creates a new Zabbix host for each VM, linking it to the `Template Scale Computing VM`. -1. **Zabbix Version:** Zabbix 5.4 or newer (optimized for 7.0). +* **`Template Scale Computing Node` (Node Template):** + * Monitors a single SCNode host (CPU Usage, Memory Usage, Network Status, Disposition). + * Contains triggers for Node status (Offline, CPU, Memory). + * Contains a *nested* LLD rule to discover all physical drives associated with *that specific node*. + * Creates items and triggers for each drive (Health, Temperature, Error Count). -2. **API Access:** A valid Scale Computing HyperCore user account with read-only API access. +* **`Template Scale Computing VM` (VM Template):** + * Monitors a single VM host (CPU Usage, VM State, Guest Agent Status, Disk Allocation). + * Contains triggers for VM status (Not Running, Agent Unavailable, CPU). -### 2. Import Template +## Setup Instructions -1. Go to **Data collection** -> **Templates** in the Zabbix frontend. +1. **Import Templates:** Import the final YAML file (`Scale_Computing_Hypercore_Zabbix.yaml`) into your Zabbix instance. This will add all three templates and the required host groups (`HyperCore Nodes`, `Virtual machines`). -2. Click **Import** (top right corner). +2. **Create Cluster Host:** + * Create a single new host in Zabbix. This host will represent your entire Scale Computing cluster (e.g., `sc-cluster.yourdomain.com`). 
+ * **Agent interface:** This host does not need an agent. You can remove all interfaces. + * **Templates Tab:** Link *only* the `Template Scale Computing HyperCore API` to this host. -3. Select the YAML template file (e.g., `template_scale_api_final.yaml`). +3. **Configure Macros:** + * On the **Macros** tab for your new cluster host, set the following three "Inherited and host macros": + * `{$API_URL}`: The base URL of your cluster (e.g., `https://172.16.0.241`) + * `{$API_USER}`: The API username (e.g., `zabbix`) + * `{$API_PASS}`: The API user's password. -4. Click **Import**. +4. **Run Discovery:** + * Wait for the discovery rules to run (default is 5 minutes), or force them by: + * Going to your cluster host's **Items** list. + * Clicking **Execute now** for `HyperCore API: Get All Nodes (for LLD)`. + * Clicking **Execute now** for `HyperCore API: Get All VMs (for LLD)`. -### 3. Configure Host and Macros +Within a few minutes, Zabbix will automatically create new hosts for all your VMs (e.g., `VM MyWebServer`) and Nodes (e.g., `SCNode 172.16.0.20`). These new hosts will automatically inherit the API credentials and start polling for data. -The template requires three mandatory host macros to successfully authenticate and connect to the HyperCore API. +## What is Monitored -1. Go to **Data collection** -> **Hosts**. +Here is a breakdown of the items that will be created on your discovered hosts. -2. Select the host representing your Scale Computing cluster (or create a new one). +### On Each `SCNode` Host -3. Go to the **Templates** tab and link the `Scale Computing HyperCore by HTTP` template. +* **Node CPU Usage:** The total CPU utilization of the physical node, as a percentage. +* **Node Memory Usage (%)**: The total RAM utilization of the physical node, as a percentage. +* **Node Network Status:** The health of the node's network connection to the cluster. `ONLINE` is healthy. +* **Node Disposition:** The operational state of the node. 
`IN` is the normal, healthy state. Other states like `OUT` or `EVACUATING` will trigger an alert. +* **Discovered Drives (for each drive):** + * **Health Status:** A boolean (True/False) reported by the drive's S.M.A.R.T. diagnostics. + * **Temperature:** The drive's internal temperature in Celsius. + * **Error Count:** A counter of read/write or other hardware errors. -4. Go to the **Macros** tab and set the following values: +### On Each `VM` Host -| Macro | Type | Example Value | Description | -| :---- | :---- | :---- | :---- | -| **{$API_URL}** | Text | https://172.16.0.241 | The base URL of the HyperCore API (must include http:// or https://). **Do not include /rest/v1.** | -| **{$API_USER}** | Text | api_reader | Username for Basic Authentication. | -| **{$API_PASS}** | **Secret text** | P@$$w0rdS3curE! | Password for Basic Authentication. (Must be stored as Secret text). | - -5. Click **Update**. The master items should turn green within one minute, and discovery should begin shortly thereafter. - ---- - -## 📊 Monitored Metrics (LLD) - -The template utilizes four master HTTP Agent items to retrieve data and three Low-Level Discovery rules to dynamically create items for each unique resource found. - -### 1. Node Discovery (Cluster Members) - -| LLD Macro | Item Prototype | Unit | Description | -| :-------- | :------------- | :--- | :---------- | -| {##NODE_NAME} | **CPU Usage** | % | Current CPU utilization of the node. | -| | **Memory Usage** | % | Current total memory utilization of the node. | -| | **Network Status** | Char | Network connectivity status (ONLINE, OFFLINE). | -| | **Disposition** | Char | Node status regarding cluster participation (IN, EVACUATED, OUT). | - -### 2. VM Discovery (Virtual Machines) - -| LLD Macro | Item Prototype | Unit | Description | -| :-------- | :------------- | :--- | :---------- | -| {##VM_NAME} | **State** | Char | Current power state of the VM (RUNNING, SHUTOFF, etc.). 
| -| | **Guest Agent Status** | Char | Status of the Scale Guest Agent (AVAILABLE, UNAVAILABLE). | -| | **CPU Usage** | % | Current CPU utilization by the VM. | -| | **Network RX Rate** | bps | Incoming network traffic rate. | -| | **Network TX Rate** | bps | Outgoing network traffic rate. | -| | **Total Disk Capacity** | B | Logical capacity reserved for the VM (sum of all virtual disks). | -| | **Disk Used Allocation** | B | Actual disk space used (allocated) by the VM on shared storage. | - -### 3. Physical Drive Discovery - -| LLD Macro | Item Prototype | Unit | Description | -| :-------- | :------------- | :--- | :---------- | -| {##DRIVE_SN} | **Health Status** | Float | Drive health status (0=Unhealthy, 1=Healthy). Uses "Zabbix boolean" Value Map. | -| | **Temperature** | C | Current reported drive temperature. | -| | **Error Count** | errors | Total count of drive errors (reallocated sectors, etc.). | - -### 🚨 Trigger Thresholds - -* **Node Offline:** High priority if **Network Status is not ONLINE**. - -* **Node Disposition:** Warning if **Disposition is not IN**. - -* **Utilization:** Warning if Node CPU or Memory Usage average exceeds **90%** over 5 minutes. - -* **Drive Health:** High priority if **Health Status is False (0)**. - -* **Guest Agent:** Warning if **Guest Agent Status is not AVAILABLE** while the VM is running. \ No newline at end of file +* **VM State:** The power state of the virtual machine. `RUNNING` is the normal state. Triggers on any other state (e.g., `STOPPED`, `PAUSED`). +* **Guest Agent Status:** The status of the Scale Guest Tools inside the VM's operating system. `AVAILABLE` is healthy. +* **CPU Usage:** The CPU utilization of *this specific VM*, as a percentage. +* **Disk Used Allocation (Bytes):** The total physical storage (in bytes) that the VM's virtual disks are currently consuming on the cluster. 
+* **Disk Allocation Growth Rate (Bps):** A calculated rate (in bytes per second) showing how fast the VM's disk allocation is growing. Useful for spotting runaway logs or backups. \ No newline at end of file diff --git a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml deleted file mode 100644 index 3537326..0000000 --- a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore.yaml +++ /dev/null @@ -1,447 +0,0 @@ -zabbix_export: - version: '7.0' - template_groups: - - uuid: 4b1ecf81a02a4f04b82a91d650499d4b # Standard UUID for Templates/Applications - name: Templates/Applications - templates: - - uuid: f68fd423a81e4d3e8aa04f3a9bad61fe # Placeholder UUID - Regenerate locally - template: Template Scale Computing HyperCore API # Technical Name - name: Scale Computing HyperCore by HTTP # User-friendly Name - description: |- - Monitors Scale Computing HyperCore infrastructure (Nodes, VMs, Drives) via its REST API. - - Setup: - 1. Link this template to the host representing the HyperCore cluster. - 2. Set host macros for API access ({$API_URL}, {$API_USER}, {$API_PASS}). - groups: - - name: Templates/Applications - macros: - - macro: '{$API_URL}' - description: Base URL of the HyperCore API (e.g., https://your-cluster-ip) - - macro: '{$API_USER}' - description: Username for API Basic Authentication - - macro: '{$API_PASS}' - type: SECRET_TEXT - description: Password for API Basic Authentication - items: - - uuid: 84feac7ac823493d825e64f48edd6812 - name: 'HyperCore API: Get All Nodes' - type: HTTP_AGENT - key: hypercore.api.get[nodes] - delay: 1m - history: 1h - trends: '0' - value_type: TEXT - url: '{$API_URL}/rest/v1/Node' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - timeout: 15s - status: ENABLED - description: Master item retrieving all node information and statistics. 
- - uuid: 9cb100fb52d4477b80b2b25b26545509 - name: 'HyperCore API: Get All VMs' - type: HTTP_AGENT - key: hypercore.api.get[vms] - delay: 5m - history: 1h - trends: '0' - value_type: TEXT - url: '{$API_URL}/rest/v1/VirDomain' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - timeout: 15s - status: ENABLED - description: Master item retrieving all VM configuration data. - - uuid: efb3953074c3406ca0cf740ea03457f3 - name: 'HyperCore API: Get All VM Stats' - type: HTTP_AGENT - key: hypercore.api.get[vmstats] - delay: 1m - history: 1h - trends: '0' - value_type: TEXT - url: '{$API_URL}/rest/v1/VirDomainStats' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - timeout: 15s - status: ENABLED - description: Master item retrieving all VM performance metrics. - - uuid: bbafb36a8b944fb38688373ee79b5e80 - name: 'HyperCore API: Get All Drives' - type: HTTP_AGENT - key: hypercore.api.get[drives] - delay: 5m - history: 1h - trends: '0' - value_type: TEXT - url: '{$API_URL}/rest/v1/Drive' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - timeout: 15s - status: ENABLED - description: Master item retrieving all physical drive status data. 
- discovery_rules: - - uuid: 053a6dca55824fdd82f6a751c9b5bdad - name: Node Discovery - type: DEPENDENT - key: hypercore.nodes.discovery - delay: '0' - master_item: - key: hypercore.api.get[nodes] - preprocessing: - - type: JSONPATH - parameters: - - '$.*' - lld_macro_paths: - - lld_macro: '{#NODE_ID}' - path: $.uuid - - lld_macro: '{#NODE_NAME}' - path: $.lanIP # Using LAN IP as Node Name, change path if another field is better - item_prototypes: - - uuid: a02204864d3c4f4b86e6bbd0bf1bf1a9 - name: 'Node {#NODE_NAME}: CPU Usage' - type: DEPENDENT - key: hypercore.node.cpu_usage[{#NODE_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: '%' - master_item: - key: hypercore.api.get[nodes] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#NODE_ID}')].cpuUsage.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: c46df85e825d4302a109e99072b08ad1 - name: 'Node {#NODE_NAME}: Memory Usage (%)' - type: DEPENDENT - key: hypercore.node.mem_usage_pct[{#NODE_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: '%' - master_item: - key: hypercore.api.get[nodes] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#NODE_ID}')].memUsagePercentage.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: 36f75e5b606446eeb296a2b0eee20b4a - name: 'Node {#NODE_NAME}: Network Status' - type: DEPENDENT - key: hypercore.node.network_status[{#NODE_ID}] - delay: '0' - history: 7d - value_type: CHAR - master_item: - key: hypercore.api.get[nodes] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#NODE_ID}')].networkStatus.first()" # FIX: Added .first() - error_handler: DISCARD_VALUE - - uuid: 6e81f4e7ead843ef84a5a231192eb1ea - name: 'Node {#NODE_NAME}: Disposition' - type: DEPENDENT - key: hypercore.node.disposition[{#NODE_ID}] - delay: '0' - history: 7d - value_type: CHAR - master_item: - key: hypercore.api.get[nodes] - preprocessing: - - type: JSONPATH - parameters: - - 
"$[?(@.uuid == '{#NODE_ID}')].currentDisposition.first()" # FIX: Added .first() - error_handler: DISCARD_VALUE - trigger_prototypes: - - uuid: c80f2f1128ed431cb9ba9758b68162e6 - expression: 'last(/Template Scale Computing HyperCore API/hypercore.node.network_status[{#NODE_ID}])<>"ONLINE"' - name: 'Node {#NODE_NAME} is offline' - priority: HIGH - description: The network status for node {#NODE_NAME} is not 'ONLINE'. - - uuid: 87e3116b4afe424187e158ed1d1f089b - expression: 'last(/Template Scale Computing HyperCore API/hypercore.node.disposition[{#NODE_ID}])<>"IN"' - name: "Node {#NODE_NAME} has unusual status (not 'IN')" - priority: WARNING - description: Node {#NODE_NAME} disposition is {ITEM.VALUE} (not 'IN'). This might indicate maintenance or evacuation. - - uuid: c4e8bd7a82474680b3912074a55abefc - expression: 'avg(/Template Scale Computing HyperCore API/hypercore.node.cpu_usage[{#NODE_ID}],5m)>90' - name: 'Node {#NODE_NAME} CPU utilization is high' - priority: WARNING - description: Average CPU usage on node {#NODE_ID}] exceeded 90% for 5 minutes. - - uuid: d29ec4afdd18493098f3c8bea7b146a5 - expression: 'avg(/Template Scale Computing HyperCore API/hypercore.node.mem_usage_pct[{#NODE_ID}],5m)>90' - name: 'Node {#NODE_NAME} memory utilization is high' - priority: WARNING - description: Average memory usage on node {#NODE_ID} exceeded 90% for 5 minutes. 
- - uuid: 37d39874c311448fa058f881b3b66c16 - name: VM Discovery - type: DEPENDENT - key: hypercore.vms.discovery - delay: '0' - master_item: - key: hypercore.api.get[vms] - preprocessing: - - type: JSONPATH - parameters: - - '$.*' - lld_macro_paths: - - lld_macro: '{#VM_ID}' - path: $.uuid - - lld_macro: '{#VM_NAME}' - path: $.name - item_prototypes: - - uuid: 7fd5a01b4a7d4c27b9c531990310a09f - name: 'VM {#VM_NAME}: State' - type: DEPENDENT - key: hypercore.vm.state[{#VM_ID}] - delay: '0' - history: 7d - value_type: CHAR - master_item: - key: hypercore.api.get[vms] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].state.first()" # FIX: Added .first() - error_handler: DISCARD_VALUE - - uuid: 16d192352fa94ea4a5db8d1ea01a6c5c - name: 'VM {#VM_NAME}: Guest Agent Status' - type: DEPENDENT - key: hypercore.vm.guest_agent[{#VM_ID}] - delay: '0' - history: 7d - value_type: CHAR - master_item: - key: hypercore.api.get[vms] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].guestAgentState.first()" # FIX: Added .first() - error_handler: DISCARD_VALUE - # --- DISK USAGE ITEMS --- - - uuid: 4ece83a77f9e4c53b2c19cfd5c920ab5 - name: 'VM {#VM_NAME}: Total Disk Capacity (Bytes)' - type: DEPENDENT - key: hypercore.vm.disk.capacity[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: B - master_item: - key: hypercore.api.get[vms] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].capacity.sum()" # Capacity sum - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: 8a7cb426e3a24d639dc59904c5c706f7 - name: 'VM {#VM_NAME}: Disk Used Allocation (Bytes)' - type: DEPENDENT - key: hypercore.vm.disk.used_allocated[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: B - master_item: - key: hypercore.api.get[vms] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].allocation.sum()" # Allocation sum - 
error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: a2fa0be1a2d84b0faac81d8765fe37f1 - name: 'VM {#VM_NAME}: Disk Allocation Growth Rate (Bps)' - type: DEPENDENT - key: hypercore.vm.disk.used_allocated.rate[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: Bps - master_item: - key: hypercore.api.get[vms] # Master item MUST be a main fetch item. - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].blockDevs[*].allocation.sum()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - type: CHANGE_PER_SECOND - parameters: [] - - type: DISCARD_UNCHANGED_HEARTBEAT - parameters: - - 1h - # CPU/Network stats items query VM Stats master item (vmstats) - - uuid: f6f436a31ad9434c8b8db29319c87ed4 - name: 'VM {#VM_NAME}: CPU Usage' - type: DEPENDENT - key: hypercore.vm.cpu_usage[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: '%' - master_item: - key: hypercore.api.get[vmstats] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].cpuUsage.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: d3b2a3f70b74489ea0f8a53e67c6c1d5 - name: 'VM {#VM_NAME}: Network RX Rate' - type: DEPENDENT - key: hypercore.vm.net_rx[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: bps - master_item: - key: hypercore.api.get[vmstats] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].rxBitRate.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - uuid: 829399b2d8ae4a9b9dd45e01f675a87f - name: 'VM {#VM_NAME}: Network TX Rate' - type: DEPENDENT - key: hypercore.vm.net_tx[{#VM_ID}] - delay: '0' - history: 7d - value_type: FLOAT - units: bps - master_item: - key: hypercore.api.get[vmstats] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#VM_ID}')].txBitRate.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - trigger_prototypes: - - uuid: 9546a4651d54422099a584d6acaba2cf 
- expression: 'last(/Template Scale Computing HyperCore API/hypercore.vm.state[{#VM_ID}])<>"RUNNING"' - name: 'VM {#VM_NAME} is not running' - priority: INFO - description: VM {#VM_NAME} state is {ITEM.VALUE} (not 'RUNNING'). - - uuid: 4ab5e0503c5a43288b97461c4017cad7 - expression: 'last(/Template Scale Computing HyperCore API/hypercore.vm.guest_agent[{#VM_ID}])<>"AVAILABLE" and last(/Template Scale Computing HyperCore API/hypercore.vm.state[{#VM_ID}])="RUNNING"' - name: 'VM {#VM_NAME} Guest Agent is unavailable' - priority: WARNING - description: The Guest Agent on VM {#VM_NAME} is not responding, but the VM is running. - - uuid: cc0870c3c5544f709b5edb829fb2b837 - name: Physical Drive Discovery - type: DEPENDENT - key: hypercore.drives.discovery - delay: '0' - master_item: - key: hypercore.api.get[drives] - preprocessing: - - type: JSONPATH - parameters: - - '$.*' - lld_macro_paths: - - lld_macro: '{#DRIVE_ID}' - path: $.uuid - - lld_macro: '{#DRIVE_SN}' - path: $.serialNumber - - lld_macro: '{#DRIVE_SLOT}' - path: $.slot - item_prototypes: - - uuid: 220b4f9765bf4e21b6c0ed24b356cdf5 - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Health Status' - type: DEPENDENT - key: hypercore.drive.healthy[{#DRIVE_ID}] - delay: '0' - history: 7d - value_type: FLOAT # Using FLOAT for 0/1, value map handles display - master_item: - key: hypercore.api.get[drives] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#DRIVE_ID}')].isHealthy.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - type: BOOL_TO_DECIMAL - parameters: [] - valuemap: - name: Zabbix boolean - - uuid: ec9eb41d5d1e44d78e91a7d2e2c762e8 - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Temperature' - type: DEPENDENT - key: hypercore.drive.temp[{#DRIVE_ID}] - delay: '0' - history: 7d - value_type: FLOAT # API returns integer, float handles it fine - units: C - master_item: - key: hypercore.api.get[drives] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == 
'{#DRIVE_ID}')].temperature.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - type: DISCARD_UNCHANGED_HEARTBEAT - parameters: - - 1h - - uuid: cff2f1fd0c3c46c1a297f92fdb07361d - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' - type: DEPENDENT - key: hypercore.drive.errors[{#DRIVE_ID}] - delay: '0' - history: 7d - value_type: FLOAT # API returns integer, float handles it fine - units: errors - master_item: - key: hypercore.api.get[drives] - preprocessing: - - type: JSONPATH - parameters: - - "$[?(@.uuid == '{#DRIVE_ID}')].errorCount.first()" - error_handler: CUSTOM_VALUE - error_handler_params: '0' - - type: DISCARD_UNCHANGED_HEARTBEAT - parameters: - - 1h - trigger_prototypes: - - uuid: a15325f148fe4087aceee5c69ba984fb - expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.healthy[{#DRIVE_ID}])=0' - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' - priority: HIGH - description: The 'isHealthy' status for drive {#DRIVE_SN} is 'false'. The drive might need replacement. - - uuid: 94661649b86a41dcac12507a3320b62e - expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.errors[{#DRIVE_ID}])>0' - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is reporting errors' - priority: WARNING - description: Drive {#DRIVE_SN} has reported {ITEM.VALUE} errors. - dependencies: - - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' - expression: 'last(/Template Scale Computing HyperCore API/hypercore.drive.healthy[{#DRIVE_ID}])=0' - - uuid: 70f9d44495014c7694a56779016d7489 - expression: 'avg(/Template Scale Computing HyperCore API/hypercore.drive.temp[{#DRIVE_ID}],5m)>65' - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) temperature is high' - priority: AVERAGE - description: The temperature of drive {#DRIVE_SN} is {ITEM.VALUE}C, exceeding the threshold (65C). 
- valuemaps: - - uuid: 6787163ebbed42369966666ec415ec35 # Placeholder UUID - Regenerate locally - name: 'Zabbix boolean' - mappings: - - value: '0' - newvalue: 'False' - - value: '1' - newvalue: 'True' diff --git a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml new file mode 100644 index 0000000..0d7ff1e --- /dev/null +++ b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml @@ -0,0 +1,441 @@ +zabbix_export: + version: '7.0' + template_groups: + - uuid: 7c0ac5bdc3534844ad44c2878ad9c5f9 + name: Templates/Applications + host_groups: + - uuid: 6613e553dce04927a61fb572cc9505ff + name: 'HyperCore Nodes' + - uuid: 1d149f6520ca4acb9236d537a4b251be + name: 'Virtual machines' + templates: + - uuid: 7ca8528ca7504139994b4e25fb9684bc + template: 'Template Scale Computing HyperCore API' + name: 'Scale Computing HyperCore by HTTP' + description: | + Monitors Scale Computing HyperCore cluster. + Discovers Nodes and VMs and creates Zabbix hosts for them. 
+ groups: + - name: Templates/Applications + items: + - uuid: 00c3ae73c0684cb3a33827148b0d5b75 + name: 'HyperCore API: Get All Nodes (for LLD)' + type: HTTP_AGENT + key: 'hypercore.api.get[nodes_for_discovery]' + delay: 5m + history: 1h + value_type: TEXT + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + url: '{$API_URL}/rest/v1/Node' + - uuid: e3092651165a43e58370aafc08904b48 + name: 'HyperCore API: Get All VMs (for LLD)' + type: HTTP_AGENT + key: 'hypercore.api.get[vms_for_discovery]' + delay: 5m + history: 1h + value_type: TEXT + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + url: '{$API_URL}/rest/v1/VirDomain' + discovery_rules: + - uuid: 1b720aad17f647edbb68e5e40f08cc46 + name: 'Node Discovery' + type: DEPENDENT + key: hypercore.nodes.discovery + delay: '0' + host_prototypes: + - uuid: c6aa9834ab714a7486c3bf646c82e116 + host: '{#NODE_ID}' + name: 'SCNode {#NODE_NAME}' + group_links: + - group: + name: 'HyperCore Nodes' + templates: + - name: 'Template Scale Computing Node' + macros: + - macro: '{$NODE_ID}' + value: '{#NODE_ID}' + master_item: + key: 'hypercore.api.get[nodes_for_discovery]' + lld_macro_paths: + - lld_macro: '{#NODE_ID}' + path: $.uuid + - lld_macro: '{#NODE_NAME}' + path: $.lanIP + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + - uuid: 7e0cea6bc1ec4e70b79913494d5ecf0a + name: 'VM Discovery' + type: DEPENDENT + key: hypercore.vms.discovery + delay: '0' + host_prototypes: + - uuid: 8f3a750074424245978c77a7b3a0a66c + host: '{#VM_ID}' + name: 'VM {#VM_NAME}' + group_links: + - group: + name: 'Virtual machines' + templates: + - name: 'Template Scale Computing VM' + macros: + - macro: '{$VM_ID}' + value: '{#VM_ID}' + master_item: + key: 'hypercore.api.get[vms_for_discovery]' + lld_macro_paths: + - lld_macro: '{#VM_ID}' + path: $.uuid + - lld_macro: '{#VM_NAME}' + path: $.name + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + 
macros: + - macro: '{$API_PASS}' + type: SECRET_TEXT + description: 'Password for API Basic Authentication' + - macro: '{$API_URL}' + description: 'Base URL of the HyperCore API (e.g., https://your-cluster-ip)' + - macro: '{$API_USER}' + description: 'Username for API Basic Authentication' + - uuid: d496a54cf96145da884f7f4b7789cd4b + template: 'Template Scale Computing Node' + name: 'Scale Computing Node by HTTP' + description: 'Defines items for a single Scale Computing Node and discovers its drives.' + groups: + - name: Templates/Applications + items: + - uuid: 66de609d9adc4b08933e72d4de149aa6 + name: 'HyperCore API: Get All Drives (for LLD)' + type: HTTP_AGENT + key: 'hypercore.api.get[drives_for_discovery]' + delay: 5m + history: 1h + value_type: TEXT + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + url: '{$API_URL}/rest/v1/Drive' + - uuid: cd4fbf05e58d45ec8b9347f50c2f1a41 + name: 'Node CPU Usage' + type: HTTP_AGENT + key: hypercore.node.cpu_usage + history: 7d + value_type: FLOAT + units: '%' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$NODE_ID}'')].cpuUsage.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/Node' + triggers: + - uuid: 34a020f730644b6cb2cd7f29d02794b5 + expression: 'avg(/Template Scale Computing Node/hypercore.node.cpu_usage,5m)>90' + name: 'Node CPU utilization is high' + priority: WARNING + description: 'Average CPU usage on node exceeded 90% for 5 minutes.' 
+ - uuid: 5aa07432045c4426ae3dc31479eaf9d6 + name: 'Node Disposition' + type: HTTP_AGENT + key: hypercore.node.disposition + history: 7d + value_type: CHAR + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$NODE_ID}'')].currentDisposition.first()' + error_handler: DISCARD_VALUE + url: '{$API_URL}/rest/v1/Node' + triggers: + - uuid: 81c6cdc8f5d04cbdb04194416a1fdc4a + expression: 'last(/Template Scale Computing Node/hypercore.node.disposition)<>"IN"' + name: "Node has unusual status (not 'IN')" + priority: WARNING + description: 'Node disposition is {ITEM.VALUE} (not ''IN''). This might indicate maintenance or evacuation.' + - uuid: 106907e45461410b8c1a73b8450d67d9 + name: 'Node Memory Usage (%)' + type: HTTP_AGENT + key: hypercore.node.mem_usage_pct + history: 7d + value_type: FLOAT + units: '%' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$NODE_ID}'')].memUsagePercentage.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/Node' + triggers: + - uuid: f1081bcd5fb54d47ac2a28dd65d1b407 + expression: 'avg(/Template Scale Computing Node/hypercore.node.mem_usage_pct,5m)>90' + name: 'Node memory utilization is high' + priority: WARNING + description: 'Average memory usage on node exceeded 90% for 5 minutes.' 
+ - uuid: e83b46990772463b922b959c79c76ea5 + name: 'Node Network Status' + type: HTTP_AGENT + key: hypercore.node.network_status + history: 7d + value_type: CHAR + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$NODE_ID}'')].networkStatus.first()' + error_handler: DISCARD_VALUE + url: '{$API_URL}/rest/v1/Node' + triggers: + - uuid: bbf79e28079a408c9bb9240721bccf01 + expression: 'last(/Template Scale Computing Node/hypercore.node.network_status)<>"ONLINE"' + name: 'Node is offline' + priority: HIGH + description: 'The network status for the node is not ''ONLINE''.' + discovery_rules: + - uuid: 276f0b2c8c644f67a27603e57da481ac + name: 'Physical Drive Discovery' + type: DEPENDENT + key: hypercore.node.drives.discovery + delay: '0' + item_prototypes: + - uuid: db32f539f3d740eb9591c09786604025 + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' + type: HTTP_AGENT + key: 'hypercore.drive.errors[{#DRIVE_ID}]' + delay: 5m + history: 7d + value_type: FLOAT + units: errors + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{#DRIVE_ID}'')].errorCount.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/Drive' + trigger_prototypes: + - uuid: 0a73b58ea26b46a385b08ad1c563d31e + expression: 'last(/Template Scale Computing Node/hypercore.drive.errors[{#DRIVE_ID}])>0' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is reporting errors' + priority: WARNING + description: 'Drive {#DRIVE_SN} has reported {ITEM.VALUE} errors.' 
+ dependencies: + - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' + expression: 'last(/Template Scale Computing Node/hypercore.drive.healthy[{#DRIVE_ID}])=0' + - uuid: 358e3812a2294f748bab54868ff0553d + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Health Status' + type: HTTP_AGENT + key: 'hypercore.drive.healthy[{#DRIVE_ID}]' + delay: 5m + history: 7d + value_type: FLOAT + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + valuemap: + name: 'Zabbix boolean' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{#DRIVE_ID}'')].isHealthy.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - type: BOOL_TO_DECIMAL + parameters: + - '' + url: '{$API_URL}/rest/v1/Drive' + trigger_prototypes: + - uuid: cff0d98f86b247b495601fa8c3ec2917 + expression: 'last(/Template Scale Computing Node/hypercore.drive.healthy[{#DRIVE_ID}])=0' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' + priority: HIGH + description: 'The ''isHealthy'' status for drive {#DRIVE_SN} is ''false''. The drive might need replacement.' + - uuid: 958ea7dd17d949dea4fc9a34637e47e8 + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Temperature' + type: HTTP_AGENT + key: 'hypercore.drive.temp[{#DRIVE_ID}]' + history: 7d + value_type: FLOAT + units: C + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{#DRIVE_ID}'')].temperature.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/Drive' + trigger_prototypes: + - uuid: abc220647a1a42c6b318caa8c2ce1e16 + expression: 'avg(/Template Scale Computing Node/hypercore.drive.temp[{#DRIVE_ID}],5m)>65' + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) temperature is high' + priority: AVERAGE + description: 'The temperature of drive {#DRIVE_SN} is {ITEM.VALUE}C, exceeding the threshold (65C).' 
+ master_item: + key: 'hypercore.api.get[drives_for_discovery]' + lld_macro_paths: + - lld_macro: '{#DRIVE_ID}' + path: $.uuid + - lld_macro: '{#DRIVE_SLOT}' + path: $.slot + - lld_macro: '{#DRIVE_SN}' + path: $.serialNumber + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.nodeUUID == ''{$NODE_ID}'')]' + error_handler: DISCARD_VALUE + macros: + - macro: '{$NODE_ID}' + valuemaps: + - uuid: 25a763c474714dbe9c99d4195d9a7fdc + name: 'Zabbix boolean' + mappings: + - value: '0' + newvalue: 'False' + - value: '1' + newvalue: 'True' + - uuid: c5c79d19c8a64716ad0887990f5a845a + template: 'Template Scale Computing VM' + name: 'Scale Computing VM by HTTP' + description: 'Defines items and triggers for a single Scale Computing VM.' + groups: + - name: Templates/Applications + items: + - uuid: 2d938300a35e427296f91a1c6540b32d + name: 'CPU Usage' + type: HTTP_AGENT + key: hypercore.vm.cpu_usage + history: 7d + value_type: FLOAT + units: '%' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$VM_ID}'')].cpuUsage.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/VirDomainStats' + triggers: + - uuid: bb1a73fceef64faa97bab8c722906511 + expression: 'avg(/Template Scale Computing VM/hypercore.vm.cpu_usage,5m)>90' + name: 'VM CPU utilization is high' + priority: WARNING + description: 'Average CPU usage on VM exceeded 90% for 5 minutes.' 
+ - uuid: 122109e7ba35461ba1ac115c6d3302d6 + name: 'Disk Used Allocation (Bytes)' + type: HTTP_AGENT + key: hypercore.vm.disk.used_allocated + delay: 5m + history: 7d + value_type: FLOAT + units: B + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$VM_ID}'')].blockDevs[*].allocation.sum()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + url: '{$API_URL}/rest/v1/VirDomain' + - uuid: e5976dcb963648159a24d4d3e2ae7a79 + name: 'Disk Allocation Growth Rate (Bps)' + type: DEPENDENT + key: hypercore.vm.disk.used_allocated.rate + delay: '0' + history: 7d + value_type: FLOAT + units: Bps + preprocessing: + - type: CHANGE_PER_SECOND + parameters: + - '' + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + master_item: + key: hypercore.vm.disk.used_allocated + - uuid: 3be0bd32e89c4012bea215c6bd2ac7f1 + name: 'Guest Agent Status' + type: HTTP_AGENT + key: hypercore.vm.guest_agent + history: 7d + value_type: CHAR + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$VM_ID}'')].guestAgentState.first()' + error_handler: DISCARD_VALUE + url: '{$API_URL}/rest/v1/VirDomain' + triggers: + - uuid: 08a3435a743e4ec7b29547f563633a30 + expression: 'last(/Template Scale Computing VM/hypercore.vm.guest_agent)<>"AVAILABLE" and last(/Template Scale Computing VM/hypercore.vm.state)="RUNNING"' + name: 'VM Guest Agent is unavailable' + priority: WARNING + description: 'The Guest Agent on the VM is not responding, but the VM is running.' 
+ - uuid: 34a750172eb6415fa15d467d0560eb6b + name: 'VM State' + type: HTTP_AGENT + key: hypercore.vm.state + history: 7d + value_type: CHAR + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$VM_ID}'')].state.first()' + error_handler: DISCARD_VALUE + url: '{$API_URL}/rest/v1/VirDomain' + triggers: + - uuid: af3a6ec8959f4571a162454a38ca8f30 + expression: 'last(/Template Scale Computing VM/hypercore.vm.state)<>"RUNNING"' + name: 'VM is not running' + priority: INFO + description: 'VM state is {ITEM.VALUE} (not ''RUNNING'').' + macros: + - macro: '{$VM_ID}' \ No newline at end of file From fb9726f7fa56706b677f1d420ebb20c6d238596c Mon Sep 17 00:00:00 2001 From: wvcollenburg <50706527+wvcollenburg@users.noreply.github.com> Date: Wed, 29 Oct 2025 14:21:04 +0100 Subject: [PATCH 3/4] Added graphs and dashboards for all elements --- .../Scale_Computing_Hypercore_Zabbix.yaml | 873 +++++++++++++----- 1 file changed, 654 insertions(+), 219 deletions(-) diff --git a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml index 0d7ff1e..a307edf 100644 --- a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml +++ b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml @@ -1,57 +1,80 @@ zabbix_export: version: '7.0' template_groups: - - uuid: 7c0ac5bdc3534844ad44c2878ad9c5f9 + - uuid: 2f5db8f3486d4770b224ba40bcec8d1c name: Templates/Applications host_groups: - - uuid: 6613e553dce04927a61fb572cc9505ff - name: 'HyperCore Nodes' - - uuid: 1d149f6520ca4acb9236d537a4b251be + - uuid: e2c57129c5b545ae8af4f0d1cbff1321 name: 'Virtual machines' + - uuid: 3b3aefe0f3494c54af07343cb6273051 + name: 'HyperCore Nodes' templates: - - uuid: 7ca8528ca7504139994b4e25fb9684bc - template: 'Template Scale Computing HyperCore API' - name: 'Scale Computing HyperCore by HTTP' - description: | + # 
----------------------------------------------------------------- + # --- TEMPLATE 1: The Main Cluster Monitor + # ----------------------------------------------------------------- + - uuid: 31ca588e5b544d75b04bc0a53947ad1f + template: Template Scale Computing HyperCore API + name: Scale Computing HyperCore by HTTP + description: |- Monitors Scale Computing HyperCore cluster. Discovers Nodes and VMs and creates Zabbix hosts for them. groups: - name: Templates/Applications + macros: + - macro: '{$API_URL}' + description: Base URL of the HyperCore API (e.g., https://your-cluster-ip) + - macro: '{$API_USER}' + description: Username for API Basic Authentication + - macro: '{$API_PASS}' + type: SECRET_TEXT + description: Password for API Basic Authentication items: - - uuid: 00c3ae73c0684cb3a33827148b0d5b75 + - uuid: acdd05734d264b7994cd4dbda0cdd394 name: 'HyperCore API: Get All Nodes (for LLD)' type: HTTP_AGENT key: 'hypercore.api.get[nodes_for_discovery]' delay: 5m history: 1h - value_type: TEXT trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/Node' authtype: BASIC username: '{$API_USER}' password: '{$API_PASS}' timeout: 15s - url: '{$API_URL}/rest/v1/Node' - - uuid: e3092651165a43e58370aafc08904b48 + - uuid: de6c2f3497e447a38a615fed8e1cf7c6 name: 'HyperCore API: Get All VMs (for LLD)' type: HTTP_AGENT key: 'hypercore.api.get[vms_for_discovery]' delay: 5m history: 1h - value_type: TEXT trends: '0' + value_type: TEXT + url: '{$API_URL}/rest/v1/VirDomain' authtype: BASIC username: '{$API_USER}' password: '{$API_PASS}' timeout: 15s - url: '{$API_URL}/rest/v1/VirDomain' discovery_rules: - - uuid: 1b720aad17f647edbb68e5e40f08cc46 - name: 'Node Discovery' + # --- NODE DISCOVERY (Creates Hosts, links to Template 3) --- + - uuid: c48644843cb749119fd667357cfa45b8 + name: Node Discovery type: DEPENDENT key: hypercore.nodes.discovery delay: '0' + master_item: + key: 'hypercore.api.get[nodes_for_discovery]' + preprocessing: + - type: JSONPATH + parameters: + - '$.*' + 
lld_macro_paths: + - lld_macro: '{#NODE_ID}' + path: $.uuid + - lld_macro: '{#NODE_NAME}' + path: $.lanIP host_prototypes: - - uuid: c6aa9834ab714a7486c3bf646c82e116 + - uuid: ded4e877eb6647529cc8e8d2faa11cb6 host: '{#NODE_ID}' name: 'SCNode {#NODE_NAME}' group_links: @@ -62,77 +85,195 @@ zabbix_export: macros: - macro: '{$NODE_ID}' value: '{#NODE_ID}' + + # --- VM DISCOVERY (Creates Hosts, links to Template 2) --- + - uuid: 8d36120d3a1747ff8d7dedb3a8449ebb + name: VM Discovery + type: DEPENDENT + key: hypercore.vms.discovery + delay: '0' master_item: - key: 'hypercore.api.get[nodes_for_discovery]' - lld_macro_paths: - - lld_macro: '{#NODE_ID}' - path: $.uuid - - lld_macro: '{#NODE_NAME}' - path: $.lanIP + key: 'hypercore.api.get[vms_for_discovery]' preprocessing: - type: JSONPATH parameters: - '$.*' - - uuid: 7e0cea6bc1ec4e70b79913494d5ecf0a - name: 'VM Discovery' - type: DEPENDENT - key: hypercore.vms.discovery - delay: '0' + lld_macro_paths: + - lld_macro: '{#VM_ID}' + path: $.uuid + - lld_macro: '{#VM_NAME}' + path: $.name host_prototypes: - - uuid: 8f3a750074424245978c77a7b3a0a66c + - uuid: d5f0f9a712c941cc9e84428c73d0da55 host: '{#VM_ID}' name: 'VM {#VM_NAME}' group_links: - group: name: 'Virtual machines' templates: - - name: 'Template Scale Computing VM' + - name: Template Scale Computing VM macros: - macro: '{$VM_ID}' value: '{#VM_ID}' - master_item: - key: 'hypercore.api.get[vms_for_discovery]' - lld_macro_paths: - - lld_macro: '{#VM_ID}' - path: $.uuid - - lld_macro: '{#VM_NAME}' - path: $.name - preprocessing: - - type: JSONPATH - parameters: - - '$.*' - macros: - - macro: '{$API_PASS}' - type: SECRET_TEXT - description: 'Password for API Basic Authentication' - - macro: '{$API_URL}' - description: 'Base URL of the HyperCore API (e.g., https://your-cluster-ip)' - - macro: '{$API_USER}' - description: 'Username for API Basic Authentication' - - uuid: d496a54cf96145da884f7f4b7789cd4b - template: 'Template Scale Computing Node' - name: 'Scale Computing 
Node by HTTP' - description: 'Defines items for a single Scale Computing Node and discovers its drives.' + + # ----------------------------------------------------------------- + # --- TEMPLATE 2: The individual VM Monitor + # --- ADDED Configured Memory Item + # ----------------------------------------------------------------- + - uuid: cfd9b7e8b7cd4bbda39671a0dd16da1b + template: Template Scale Computing VM + name: Scale Computing VM by HTTP + description: 'Defines items and triggers for a single Scale Computing VM.' groups: - name: Templates/Applications + macros: + - macro: '{$VM_ID}' items: - - uuid: 66de609d9adc4b08933e72d4de149aa6 - name: 'HyperCore API: Get All Drives (for LLD)' + - uuid: a627ffeac712479181d3cd463430c226 + name: 'VM Info (Master)' type: HTTP_AGENT - key: 'hypercore.api.get[drives_for_discovery]' - delay: 5m + key: hypercore.vm.info + delay: 1m history: 1h - value_type: TEXT trends: '0' + value_type: TEXT authtype: BASIC username: '{$API_USER}' password: '{$API_PASS}' - timeout: 15s - url: '{$API_URL}/rest/v1/Drive' - - uuid: cd4fbf05e58d45ec8b9347f50c2f1a41 - name: 'Node CPU Usage' + url: '{$API_URL}/rest/v1/VirDomain' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$VM_ID}'')]' + error_handler: DISCARD_VALUE + - type: JSONPATH + parameters: + - '$.first()' + - uuid: da108e4f3a2b4ae8b9551035c66684ed + name: 'VM State' + type: DEPENDENT + key: hypercore.vm.state + delay: '0' + history: 7d + value_type: CHAR + trends: '0' + preprocessing: + - type: JSONPATH + parameters: + - '$.state' + error_handler: DISCARD_VALUE + master_item: + key: hypercore.vm.info + triggers: + - uuid: 9161f14551624f2294e97e60f3563fb3 + expression: 'last(/Template Scale Computing VM/hypercore.vm.state)<>"RUNNING"' + name: 'VM is not running' + priority: INFO + description: 'VM state is {ITEM.VALUE} (not ''RUNNING'').' 
+ - uuid: 6ea89b9269814430a41a54e732bacf16 + name: 'Guest Agent Status' + type: DEPENDENT + key: hypercore.vm.guest_agent + delay: '0' + history: 7d + value_type: CHAR + trends: '0' + preprocessing: + - type: JSONPATH + parameters: + - '$.guestAgentState' + error_handler: DISCARD_VALUE + master_item: + key: hypercore.vm.info + triggers: + - uuid: ece8ae2591a045a3bcd5e550be74b1e6 + expression: 'last(/Template Scale Computing VM/hypercore.vm.guest_agent)<>"AVAILABLE" and last(/Template Scale Computing VM/hypercore.vm.state)="RUNNING"' + name: 'VM Guest Agent is unavailable' + priority: WARNING + description: 'The Guest Agent on the VM is not responding, but the VM is running.' + - uuid: 0bff5c27ec1d49048f45704cc0491e7a + name: 'Disk Used Allocation (Total Bytes)' + type: DEPENDENT + key: hypercore.vm.disk.used_allocated.total + delay: '0' + history: 7d + value_type: FLOAT + units: B + preprocessing: + - type: JSONPATH + parameters: + - '$.blockDevs[*].allocation.sum()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + master_item: + key: hypercore.vm.info + - uuid: ec053620ec3040aaa9e1f2797d4e1b85 + name: 'Total Disk Capacity (Total Bytes)' + type: DEPENDENT + key: hypercore.vm.disk.capacity.total + delay: '0' + history: 7d + value_type: FLOAT + units: B + preprocessing: + - type: JSONPATH + parameters: + - '$.blockDevs[*].capacity.sum()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + master_item: + key: hypercore.vm.info + - uuid: 19351ef3e09c44f6a17667af291216f2 + name: 'Disk Used (Total %)' + type: DEPENDENT + key: hypercore.vm.disk.used_pct.total + delay: '0' + history: 7d + value_type: FLOAT + units: '%' + master_item: + key: hypercore.vm.info + preprocessing: + - type: JAVASCRIPT + parameters: + - | + var data = JSON.parse(value); + var total_alloc = 0; + var total_cap = 0; + + if (data && data.blockDevs && data.blockDevs.length > 0) { + for (var i = 0; i < data.blockDevs.length; i++) { + total_alloc += (data.blockDevs[i].allocation || 
0); + total_cap += (data.blockDevs[i].capacity || 0); + } + if (total_cap > 0) { + return (100 * total_alloc / total_cap).toFixed(2); + } + } + return 0; + error_handler: CUSTOM_VALUE + error_handler_params: '0' + - uuid: 43dbb31df5e64e159ce2145a418303bb + name: 'Disk Allocation Growth Rate (Total Bps)' + type: DEPENDENT + key: hypercore.vm.disk.used_allocated.rate.total + delay: '0' + history: 7d + value_type: FLOAT + units: Bps + preprocessing: + - type: CHANGE_PER_SECOND + parameters: + - '' + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + master_item: + key: hypercore.vm.disk.used_allocated.total + - uuid: e8e908e34cd84c508f3d867785757370 + name: 'CPU Usage' type: HTTP_AGENT - key: hypercore.node.cpu_usage + key: hypercore.vm.cpu_usage history: 7d value_type: FLOAT units: '%' @@ -142,39 +283,300 @@ zabbix_export: preprocessing: - type: JSONPATH parameters: - - '$[?(@.uuid == ''{$NODE_ID}'')].cpuUsage.first()' + - "$[?(@.uuid == '{$VM_ID}')].cpuUsage.first()" error_handler: CUSTOM_VALUE error_handler_params: '0' - url: '{$API_URL}/rest/v1/Node' + url: '{$API_URL}/rest/v1/VirDomainStats' triggers: - - uuid: 34a020f730644b6cb2cd7f29d02794b5 - expression: 'avg(/Template Scale Computing Node/hypercore.node.cpu_usage,5m)>90' - name: 'Node CPU utilization is high' + - uuid: f6339b6c83a54148a705ff16ea3ee9ad + expression: 'avg(/Template Scale Computing VM/hypercore.vm.cpu_usage,5m)>90' + name: 'VM CPU utilization is high' priority: WARNING - description: 'Average CPU usage on node exceeded 90% for 5 minutes.' - - uuid: 5aa07432045c4426ae3dc31479eaf9d6 - name: 'Node Disposition' + description: 'Average CPU usage on VM exceeded 90% for 5 minutes.' 
+ - uuid: c008a85a7a4241ce9bf10d7aa0f4347e + name: 'Snapshot Count' + type: DEPENDENT + key: hypercore.vm.snapshot.count + delay: '0' + history: 7d + value_type: FLOAT + units: snapshots + preprocessing: + - type: JSONPATH + parameters: + - '$.snapUUIDs.length()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + master_item: + key: hypercore.vm.info + - uuid: 90270f33f18c4a37b075bdc115e7361f # New Item for Configured Memory + name: 'Configured Memory' + type: DEPENDENT + key: hypercore.vm.memory.configured + delay: '0' + history: 7d + value_type: FLOAT + units: B + preprocessing: + - type: JSONPATH + parameters: + - '$.mem' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + master_item: + key: hypercore.vm.info + discovery_rules: + - uuid: 728fffd5aa294114a546a39d7a82ed36 + name: 'VM Disk Discovery' + type: DEPENDENT + key: hypercore.vm.disks.discovery + delay: '0' + master_item: + key: hypercore.vm.info # Depends on the VM Info Master Item + preprocessing: + - type: JSONPATH + parameters: + - '$.blockDevs[*]' + error_handler: DISCARD_VALUE + - type: JAVASCRIPT + parameters: + - | + var disks = JSON.parse(value); + var valid_types = ['VIRTIO_DISK', 'SCSI_DISK', 'IDE_DISK']; + var result = []; + for (var i = 0; i < disks.length; i++) { + if (valid_types.indexOf(disks[i].type) !== -1) { + result.push(disks[i]); + } + } + return JSON.stringify(result); + error_handler: DISCARD_VALUE + lld_macro_paths: + - lld_macro: '{#DISK_UUID}' + path: '$.uuid' + - lld_macro: '{#DISK_NAME}' # Extract the first mount point, if available + path: '$.mountPoints[0]' + item_prototypes: + - uuid: c33c81f937cb4647aa58c80eb26452dc + name: 'Disk {#DISK_UUID}: Used Allocation (Bytes)' # Simplified Name + type: DEPENDENT + key: 'hypercore.vm.disk.used_allocated[{#DISK_UUID}]' + delay: '0' + history: 7d + value_type: FLOAT + units: B + master_item: + key: hypercore.vm.info + preprocessing: + - type: JSONPATH + parameters: + - '$.blockDevs[?(@.uuid == 
"{#DISK_UUID}")].allocation.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + tags: # Added tag for mount point + - tag: mount_point + value: '{#DISK_NAME}' + - uuid: 28ba92ffb24d48d6bdc8b490b612e8e1 + name: 'Disk {#DISK_UUID}: Total Capacity (Bytes)' # Simplified Name + type: DEPENDENT + key: 'hypercore.vm.disk.capacity[{#DISK_UUID}]' + delay: '0' + history: 7d + value_type: FLOAT + units: B + master_item: + key: hypercore.vm.info + preprocessing: + - type: JSONPATH + parameters: + - '$.blockDevs[?(@.uuid == "{#DISK_UUID}")].capacity.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' + tags: # Added tag for mount point + - tag: mount_point + value: '{#DISK_NAME}' + - uuid: c53afe7d17f54371b32957f556110ba0 + name: 'Disk {#DISK_UUID}: Used (%)' # Simplified Name + type: DEPENDENT + key: 'hypercore.vm.disk.used_pct[{#DISK_UUID}]' + delay: '0' + history: 7d + value_type: FLOAT + units: '%' + master_item: + key: hypercore.vm.info + preprocessing: + - type: JAVASCRIPT + parameters: + - | + var data = JSON.parse(value); + var alloc = 0; + var cap = 0; + if (data && data.blockDevs) { + for (var i = 0; i < data.blockDevs.length; i++) { + if (data.blockDevs[i].uuid === "{#DISK_UUID}") { + alloc = data.blockDevs[i].allocation || 0; + cap = data.blockDevs[i].capacity || 0; + break; + } + } + } + if (cap > 0) { + return (100 * alloc / cap).toFixed(2); + } + return 0; + error_handler: CUSTOM_VALUE + error_handler_params: '0' + tags: # Added tag for mount point + - tag: mount_point + value: '{#DISK_NAME}' + - uuid: 77bd81cf1a2a4bac8e176de310a7cd37 + name: 'Disk {#DISK_UUID}: Allocation Growth Rate (Bps)' # Simplified Name + type: DEPENDENT + key: 'hypercore.vm.disk.used_allocated.rate[{#DISK_UUID}]' + delay: '0' + history: 7d + value_type: FLOAT + units: Bps + preprocessing: + - type: CHANGE_PER_SECOND + parameters: + - '' + - type: DISCARD_UNCHANGED_HEARTBEAT + parameters: + - 1h + master_item: + key: 
'hypercore.vm.disk.used_allocated[{#DISK_UUID}]' # Depends on the individual disk item + tags: # Added tag for mount point + - tag: mount_point + value: '{#DISK_NAME}' + dashboards: + - uuid: 9a6cc006e1184f01a81137e41d289a1b + name: 'VM Performance' + pages: + - widgets: + - type: graph + name: 'VM CPU Utilization' + width: '12' + height: '5' + fields: + - type: GRAPH + name: graphid.0 + value: + host: 'Template Scale Computing VM' + name: 'VM CPU Utilization' + - type: item + name: 'Disk Consumption (Total)' + x: '12' + width: '12' + height: '5' + fields: + - type: ITEM + name: itemid.0 + value: + host: 'Template Scale Computing VM' + key: hypercore.vm.disk.used_pct.total + - type: INTEGER + name: show.0 + value: '1' # Changed from 10 to 1 (Show Value) + - type: INTEGER + name: threshold.show.0 + value: '1' + - type: STRING + name: threshold.value.0 + value: '80' + - type: STRING + name: threshold.color.0 + value: F63100 + - type: STRING + name: threshold.value.1 + value: '90' + - type: STRING + name: threshold.color.1 + value: C80000 + - type: graph + name: 'VM Disk Growth Rate (Total)' + 'y': '5' + width: '12' + height: '5' + fields: + - type: GRAPH + name: graphid.0 + value: + host: 'Template Scale Computing VM' + name: 'VM Disk Growth Rate' + - type: graph + name: 'VM Disk Allocation (Total)' + x: '12' + 'y': '5' + width: '12' + height: '5' + fields: + - type: GRAPH + name: graphid.0 + value: + host: 'Template Scale Computing VM' + name: 'VM Disk Allocation' + - type: item + name: 'VM State' + 'y': '10' + width: '12' + height: '5' + fields: + - type: ITEM + name: itemid.0 + value: + host: 'Template Scale Computing VM' + key: hypercore.vm.state + - type: item + name: 'Guest Agent Status' + x: '12' + 'y': '10' + width: '12' + height: '5' + fields: + - type: ITEM + name: itemid.0 + value: + host: 'Template Scale Computing VM' + key: hypercore.vm.guest_agent + + # ----------------------------------------------------------------- + # --- TEMPLATE 3: The new 
individual NODE Monitor + # ----------------------------------------------------------------- + - uuid: 0dc94c2476d442f3bdc0a72f35e95b43 + template: 'Template Scale Computing Node' + name: 'Scale Computing Node by HTTP' + description: 'Defines items for a single Scale Computing Node and discovers its drives.' + groups: + - name: Templates/Applications + macros: + - macro: '{$NODE_ID}' + items: + - uuid: d55b6b527f8e4ef891bce56af5aef5b4 + name: 'Node CPU Usage' type: HTTP_AGENT - key: hypercore.node.disposition + key: hypercore.node.cpu_usage history: 7d - value_type: CHAR - trends: '0' + value_type: FLOAT + units: '%' authtype: BASIC username: '{$API_USER}' password: '{$API_PASS}' preprocessing: - type: JSONPATH parameters: - - '$[?(@.uuid == ''{$NODE_ID}'')].currentDisposition.first()' - error_handler: DISCARD_VALUE + - '$[?(@.uuid == ''{$NODE_ID}'')].cpuUsage.first()' + error_handler: CUSTOM_VALUE + error_handler_params: '0' url: '{$API_URL}/rest/v1/Node' triggers: - - uuid: 81c6cdc8f5d04cbdb04194416a1fdc4a - expression: 'last(/Template Scale Computing Node/hypercore.node.disposition)<>"IN"' - name: "Node has unusual status (not 'IN')" + - uuid: 754c2c9f0519403d9f8bc5c4c9089a72 + expression: 'avg(/Template Scale Computing Node/hypercore.node.cpu_usage,5m)>90' + name: 'Node CPU utilization is high' priority: WARNING - description: 'Node disposition is {ITEM.VALUE} (not ''IN''). This might indicate maintenance or evacuation.' - - uuid: 106907e45461410b8c1a73b8450d67d9 + description: 'Average CPU usage on node exceeded 90% for 5 minutes.' 
+ - uuid: 24701c937d9743d1a144c417eeb6f2a0 name: 'Node Memory Usage (%)' type: HTTP_AGENT key: hypercore.node.mem_usage_pct @@ -192,12 +594,12 @@ zabbix_export: error_handler_params: '0' url: '{$API_URL}/rest/v1/Node' triggers: - - uuid: f1081bcd5fb54d47ac2a28dd65d1b407 + - uuid: 0e5c6edc7b9347b8ae3b1c391201c3f4 expression: 'avg(/Template Scale Computing Node/hypercore.node.mem_usage_pct,5m)>90' name: 'Node memory utilization is high' priority: WARNING description: 'Average memory usage on node exceeded 90% for 5 minutes.' - - uuid: e83b46990772463b922b959c79c76ea5 + - uuid: 9edb6c7ee3f0422d9bb98749f774cfeb name: 'Node Network Status' type: HTTP_AGENT key: hypercore.node.network_status @@ -214,19 +616,68 @@ zabbix_export: error_handler: DISCARD_VALUE url: '{$API_URL}/rest/v1/Node' triggers: - - uuid: bbf79e28079a408c9bb9240721bccf01 + - uuid: 0e869eb45fa044d7b7ef8c64bd5f7b5d expression: 'last(/Template Scale Computing Node/hypercore.node.network_status)<>"ONLINE"' name: 'Node is offline' priority: HIGH description: 'The network status for the node is not ''ONLINE''.' + - uuid: 9c5dd7bc3ff14c95895e1f0a6ec394f7 + name: 'Node Disposition' + type: HTTP_AGENT + key: hypercore.node.disposition + history: 7d + value_type: CHAR + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.uuid == ''{$NODE_ID}'')].currentDisposition.first()' + error_handler: DISCARD_VALUE + url: '{$API_URL}/rest/v1/Node' + triggers: + - uuid: b3683aa350ad49998e960fae1ac04918 + expression: 'last(/Template Scale Computing Node/hypercore.node.disposition)<>"IN"' + name: "Node has unusual status (not 'IN')" + priority: WARNING + description: 'Node disposition is {ITEM.VALUE} (not ''IN''). This might indicate maintenance or evacuation.' 
+ - uuid: cc18eaabae01498a8f85439377b1a489 + name: 'HyperCore API: Get All Drives (for LLD)' + type: HTTP_AGENT + key: 'hypercore.api.get[drives_for_discovery]' + delay: 5m + history: 1h + value_type: TEXT + trends: '0' + authtype: BASIC + username: '{$API_USER}' + password: '{$API_PASS}' + timeout: 15s + url: '{$API_URL}/rest/v1/Drive' discovery_rules: - - uuid: 276f0b2c8c644f67a27603e57da481ac + - uuid: 6bc7d604aabf45c4b617d8af99b03fd5 name: 'Physical Drive Discovery' type: DEPENDENT key: hypercore.node.drives.discovery delay: '0' + master_item: + key: 'hypercore.api.get[drives_for_discovery]' + preprocessing: + - type: JSONPATH + parameters: + - '$[?(@.nodeUUID == ''{$NODE_ID}'')]' + error_handler: DISCARD_VALUE + lld_macro_paths: + - lld_macro: '{#DRIVE_ID}' + path: $.uuid + - lld_macro: '{#DRIVE_SLOT}' + path: $.slot + - lld_macro: '{#DRIVE_SN}' + path: $.serialNumber item_prototypes: - - uuid: db32f539f3d740eb9591c09786604025 + - uuid: 73412100076e4b968dbda67c993d3362 name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' type: HTTP_AGENT key: 'hypercore.drive.errors[{#DRIVE_ID}]' @@ -245,7 +696,7 @@ zabbix_export: error_handler_params: '0' url: '{$API_URL}/rest/v1/Drive' trigger_prototypes: - - uuid: 0a73b58ea26b46a385b08ad1c563d31e + - uuid: 1e39abd351c84f6bb9de5a7ecefc317b expression: 'last(/Template Scale Computing Node/hypercore.drive.errors[{#DRIVE_ID}])>0' name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is reporting errors' priority: WARNING @@ -253,7 +704,7 @@ zabbix_export: dependencies: - name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' expression: 'last(/Template Scale Computing Node/hypercore.drive.healthy[{#DRIVE_ID}])=0' - - uuid: 358e3812a2294f748bab54868ff0553d + - uuid: a24a2056f27a443d984808bc745c04f3 name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Health Status' type: HTTP_AGENT key: 'hypercore.drive.healthy[{#DRIVE_ID}]' @@ -276,12 +727,12 @@ zabbix_export: - '' url: '{$API_URL}/rest/v1/Drive' trigger_prototypes: - - uuid: 
cff0d98f86b247b495601fa8c3ec2917 + - uuid: 6e2bb055388c4a08a8f8e9db2b50fadc expression: 'last(/Template Scale Computing Node/hypercore.drive.healthy[{#DRIVE_ID}])=0' name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) is unhealthy' priority: HIGH description: 'The ''isHealthy'' status for drive {#DRIVE_SN} is ''false''. The drive might need replacement.' - - uuid: 958ea7dd17d949dea4fc9a34637e47e8 + - uuid: fab6697e075c4b51b1002fac55dbaceb name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Temperature' type: HTTP_AGENT key: 'hypercore.drive.temp[{#DRIVE_ID}]' @@ -299,143 +750,127 @@ zabbix_export: error_handler_params: '0' url: '{$API_URL}/rest/v1/Drive' trigger_prototypes: - - uuid: abc220647a1a42c6b318caa8c2ce1e16 + - uuid: 06fc701dba294712af42d563b6955b9b expression: 'avg(/Template Scale Computing Node/hypercore.drive.temp[{#DRIVE_ID}],5m)>65' name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}) temperature is high' priority: AVERAGE description: 'The temperature of drive {#DRIVE_SN} is {ITEM.VALUE}C, exceeding the threshold (65C).' 
- master_item: - key: 'hypercore.api.get[drives_for_discovery]' - lld_macro_paths: - - lld_macro: '{#DRIVE_ID}' - path: $.uuid - - lld_macro: '{#DRIVE_SLOT}' - path: $.slot - - lld_macro: '{#DRIVE_SN}' - path: $.serialNumber - preprocessing: - - type: JSONPATH - parameters: - - '$[?(@.nodeUUID == ''{$NODE_ID}'')]' - error_handler: DISCARD_VALUE - macros: - - macro: '{$NODE_ID}' + graph_prototypes: + - uuid: 0fa785e238c345ceacb8df13366a138d + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Temperature' + graph_items: + - color: F63100 + item: + host: 'Template Scale Computing Node' + key: 'hypercore.drive.temp[{#DRIVE_ID}]' + - uuid: 48e126dbf8dc4d5a8ad20c7d6623b69c + name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' + graph_items: + - color: C80000 + drawtype: FILLED_REGION + item: + host: 'Template Scale Computing Node' + key: 'hypercore.drive.errors[{#DRIVE_ID}]' + dashboards: + - uuid: 1ab324b1551d4654bc208c971ac56c18 + name: 'Node Performance' + pages: + - widgets: + - type: graph + name: 'Node CPU Utilization' + width: '24' + height: '5' + fields: + - type: GRAPH + name: graphid.0 + value: + host: 'Template Scale Computing Node' + name: 'Node CPU Utilization' + - type: graph + name: 'Node Memory Utilization' + 'y': '5' + width: '24' + height: '5' + fields: + - type: GRAPH + name: graphid.0 + value: + host: 'Template Scale Computing Node' + name: 'Node Memory Utilization' + - type: item + name: 'Node Network Status' + 'y': '10' + width: '12' + height: '5' + fields: + - type: ITEM + name: itemid.0 + value: + host: 'Template Scale Computing Node' + key: hypercore.node.network_status + - type: item + name: 'Node Disposition' + x: '12' + 'y': '10' + width: '12' + height: '5' + fields: + - type: ITEM + name: itemid.0 + value: + host: 'Template Scale Computing Node' + key: hypercore.node.disposition valuemaps: - - uuid: 25a763c474714dbe9c99d4195d9a7fdc + - uuid: 94079248c03945cb944a858278cc31c9 name: 'Zabbix boolean' mappings: - value: '0' newvalue: 'False' 
- value: '1' newvalue: 'True' - - uuid: c5c79d19c8a64716ad0887990f5a845a - template: 'Template Scale Computing VM' - name: 'Scale Computing VM by HTTP' - description: 'Defines items and triggers for a single Scale Computing VM.' - groups: - - name: Templates/Applications - items: - - uuid: 2d938300a35e427296f91a1c6540b32d - name: 'CPU Usage' - type: HTTP_AGENT - key: hypercore.vm.cpu_usage - history: 7d - value_type: FLOAT - units: '%' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - preprocessing: - - type: JSONPATH - parameters: - - '$[?(@.uuid == ''{$VM_ID}'')].cpuUsage.first()' - error_handler: CUSTOM_VALUE - error_handler_params: '0' - url: '{$API_URL}/rest/v1/VirDomainStats' - triggers: - - uuid: bb1a73fceef64faa97bab8c722906511 - expression: 'avg(/Template Scale Computing VM/hypercore.vm.cpu_usage,5m)>90' - name: 'VM CPU utilization is high' - priority: WARNING - description: 'Average CPU usage on VM exceeded 90% for 5 minutes.' - - uuid: 122109e7ba35461ba1ac115c6d3302d6 - name: 'Disk Used Allocation (Bytes)' - type: HTTP_AGENT - key: hypercore.vm.disk.used_allocated - delay: 5m - history: 7d - value_type: FLOAT - units: B - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - preprocessing: - - type: JSONPATH - parameters: - - '$[?(@.uuid == ''{$VM_ID}'')].blockDevs[*].allocation.sum()' - error_handler: CUSTOM_VALUE - error_handler_params: '0' - url: '{$API_URL}/rest/v1/VirDomain' - - uuid: e5976dcb963648159a24d4d3e2ae7a79 - name: 'Disk Allocation Growth Rate (Bps)' - type: DEPENDENT - key: hypercore.vm.disk.used_allocated.rate - delay: '0' - history: 7d - value_type: FLOAT - units: Bps - preprocessing: - - type: CHANGE_PER_SECOND - parameters: - - '' - - type: DISCARD_UNCHANGED_HEARTBEAT - parameters: - - 1h - master_item: - key: hypercore.vm.disk.used_allocated - - uuid: 3be0bd32e89c4012bea215c6bd2ac7f1 - name: 'Guest Agent Status' - type: HTTP_AGENT - key: hypercore.vm.guest_agent - history: 7d - value_type: CHAR - 
trends: '0' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - preprocessing: - - type: JSONPATH - parameters: - - '$[?(@.uuid == ''{$VM_ID}'')].guestAgentState.first()' - error_handler: DISCARD_VALUE - url: '{$API_URL}/rest/v1/VirDomain' - triggers: - - uuid: 08a3435a743e4ec7b29547f563633a30 - expression: 'last(/Template Scale Computing VM/hypercore.vm.guest_agent)<>"AVAILABLE" and last(/Template Scale Computing VM/hypercore.vm.state)="RUNNING"' - name: 'VM Guest Agent is unavailable' - priority: WARNING - description: 'The Guest Agent on the VM is not responding, but the VM is running.' - - uuid: 34a750172eb6415fa15d467d0560eb6b - name: 'VM State' - type: HTTP_AGENT - key: hypercore.vm.state - history: 7d - value_type: CHAR - trends: '0' - authtype: BASIC - username: '{$API_USER}' - password: '{$API_PASS}' - preprocessing: - - type: JSONPATH - parameters: - - '$[?(@.uuid == ''{$VM_ID}'')].state.first()' - error_handler: DISCARD_VALUE - url: '{$API_URL}/rest/v1/VirDomain' - triggers: - - uuid: af3a6ec8959f4571a162454a38ca8f30 - expression: 'last(/Template Scale Computing VM/hypercore.vm.state)<>"RUNNING"' - name: 'VM is not running' - priority: INFO - description: 'VM state is {ITEM.VALUE} (not ''RUNNING'').' 
- macros: - - macro: '{$VM_ID}' \ No newline at end of file + graphs: + - uuid: b613fb46661c4dbcbcaf285d43faba9f + name: 'Node CPU Utilization' + width: '1200' + height: '300' + graph_items: + - color: 199C0D + item: + host: 'Template Scale Computing Node' + key: hypercore.node.cpu_usage + - uuid: 3ebbb21e2c054e4a9b67ea58c69e370a + name: 'Node Memory Utilization' + width: '1200' + height: '300' + graph_items: + - color: F63100 + item: + host: 'Template Scale Computing Node' + key: hypercore.node.mem_usage_pct + - uuid: 3397891d7bac4bfd9e9bbece9d7cee23 + name: 'VM CPU Utilization' + width: '1200' + height: '300' + graph_items: + - color: 199C0D + item: + host: 'Template Scale Computing VM' + key: hypercore.vm.cpu_usage + - uuid: fd3565a1af614ef2a6b972718fddb592 + name: 'VM Disk Allocation' + width: '1200' + height: '300' + graph_items: + - color: 0090FF + item: + host: 'Template Scale Computing VM' + key: hypercore.vm.disk.used_allocated.total + - uuid: 0206f10792f64e5a8702e6a46909f519 + name: 'VM Disk Growth Rate' + width: '1200' + height: '300' + graph_items: + - color: 00C7FF + item: + host: 'Template Scale Computing VM' + key: hypercore.vm.disk.used_allocated.rate.total \ No newline at end of file From 49f26ee505a13b28f50db73a6e7e2a5c5aac5b03 Mon Sep 17 00:00:00 2001 From: wvcollenburg <50706527+wvcollenburg@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:52:22 +0100 Subject: [PATCH 4/4] lowered delays to make template poll more often --- .../ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml index a307edf..3e7b985 100644 --- a/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml +++ b/specific_task/ZabbixPlugin/Scale_Computing_Hypercore_Zabbix.yaml @@ -33,7 +33,7 @@ zabbix_export: name: 'HyperCore API: Get All Nodes (for LLD)' type: HTTP_AGENT
key: 'hypercore.api.get[nodes_for_discovery]' - delay: 5m + delay: 2m history: 1h trends: '0' value_type: TEXT @@ -46,7 +46,7 @@ zabbix_export: name: 'HyperCore API: Get All VMs (for LLD)' type: HTTP_AGENT key: 'hypercore.api.get[vms_for_discovery]' - delay: 5m + delay: 2m history: 1h trends: '0' value_type: TEXT @@ -647,7 +647,7 @@ zabbix_export: name: 'HyperCore API: Get All Drives (for LLD)' type: HTTP_AGENT key: 'hypercore.api.get[drives_for_discovery]' - delay: 5m + delay: 2m history: 1h value_type: TEXT trends: '0' @@ -681,7 +681,7 @@ zabbix_export: name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Error Count' type: HTTP_AGENT key: 'hypercore.drive.errors[{#DRIVE_ID}]' - delay: 5m + delay: 2m history: 7d value_type: FLOAT units: errors @@ -708,7 +708,7 @@ zabbix_export: name: 'Drive {#DRIVE_SN} (Slot {#DRIVE_SLOT}): Health Status' type: HTTP_AGENT key: 'hypercore.drive.healthy[{#DRIVE_ID}]' - delay: 5m + delay: 2m history: 7d value_type: FLOAT authtype: BASIC