textfile example script rework (#1074)

* textfile smartmon.sh: Added functions to also parse megaraid disks. Added parsing to also detect the grown_defects counters. * textfile storcli.py: Reworked the example file to export much more information about megaraid-attached controllers, VDs and PDs. Signed-off-by: Christopher Blum <christopher.blum@profitbricks.com>
2025-03-05 21:00:12 -08:00 · 2018-09-18 22:43:20 +02:00 · 2018-09-18 22:43:20 +02:00 · 6aa5cfba6c
parent 1c9ea46cca
commit 6aa5cfba6c
2 changed files with 220 additions and 136 deletions
--- a/text_collector_examples/smartmon.sh
+++ b/text_collector_examples/smartmon.sh
@ -7,7 +7,11 @@
 #       data in them than you'd think.
 #       http://arstechnica.com/civis/viewtopic.php?p=22062211
-parse_smartctl_attributes_awk="$(cat << 'SMARTCTLAWK'
+# Formatting done via shfmt -i 2
 # https://github.com/mvdan/sh
 parse_smartctl_attributes_awk="$(
  cat <<'SMARTCTLAWK'
 $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
  gsub(/-/, "_");
  printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
@ -18,7 +22,8 @@ $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
 SMARTCTLAWK
 )"
-smartmon_attrs="$(cat << 'SMARTMONATTRS'
+smartmon_attrs="$(
  cat <<'SMARTMONATTRS'
 airflow_temperature_cel
 command_timeout
 current_pending_sector
@ -64,10 +69,10 @@ parse_smartctl_attributes() {
  local disk_type="$2"
  local labels="disk=\"${disk}\",type=\"${disk_type}\""
  local vars="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')"
-  sed 's/^ \+//g' \
+  sed 's/^ \+//g' |
-    | awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \
+    awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
-    | tr A-Z a-z \
+    tr A-Z a-z |
-    | grep -E "(${smartmon_attrs})"
+    grep -E "(${smartmon_attrs})"
 }
 parse_smartctl_scsi_attributes() {
@ -82,12 +87,14 @@ parse_smartctl_scsi_attributes() {
    Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
    Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
    esac
  done
-    echo "power_on_hours_raw_value{"${labels}",smart_id=\"9\"} ${power_on}"
+  [ ! -z "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
-    echo "temperature_celsius_raw_value{"${labels}",smart_id=\"194\"} ${temp_cel}"
+  [ ! -z "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
-    echo "total_lbas_read_raw_value{"${labels}",smart_id=\"242\"} ${lbas_read}"
+  [ ! -z "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
-    echo "power_cycle_count_raw_value{"${labels}",smart_id=\"12\"} ${power_cycle}"
+  [ ! -z "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
  [ ! -z "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}"
 }
 parse_smartctl_info() {
@ -130,7 +137,8 @@ parse_smartctl_info() {
  echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
 }
-output_format_awk="$(cat << 'OUTPUTAWK'
+output_format_awk="$(
  cat <<'OUTPUTAWK'
 BEGIN { v = "" }
 v != $1 {
  print "# HELP smartmon_" $1 " SMART metric " $1;
@ -142,8 +150,8 @@ OUTPUTAWK
 )"
 format_output() {
-  sort \
+  sort |
-  | awk -F'{' "${output_format_awk}"
+    awk -F'{' "${output_format_awk}"
 }
 smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"
@ -159,13 +167,18 @@ device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')
 for device in ${device_list}; do
  disk="$(echo ${device} | cut -f1 -d'|')"
  type="$(echo ${device} | cut -f2 -d'|')"
-  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" $(TZ=UTC date '+%s')
+  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case ${type} in
  sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
  sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" ;;
  scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
-    *) echo "disk type is not sat or scsi, ${type}"; exit ;;
+  megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" ;;
  *)
    echo "disk type is not sat, scsi or megaraid but ${type}"
    exit
    ;;
  esac
 done | format_output
--- a/text_collector_examples/storcli.py
+++ b/text_collector_examples/storcli.py
@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 """
 Script to parse StorCLI's JSON output and expose
 MegaRAID health as Prometheus metrics.
@ -19,110 +19,181 @@ import argparse
 import json
 import os
 import subprocess
 import shlex
 from dateutil.parser import parse
 import collections
 from enum import IntEnum
 DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
    Prometheus metrics."""
-VERSION = '0.0.1'
+VERSION = '0.0.2'
 storcli_path = ''
 metric_prefix = 'megaraid_'
 metric_list = {}
 metric_list = collections.defaultdict(list)
 class VD_State(IntEnum):
    """Integer codes for the abbreviated virtual drive state names.

    Members are looked up by name (``VD_State[<state string>]``), so the
    member names must stay spelled exactly as the abbreviations are.
    """

    # Optimal
    Optl = 0
    # Degraded
    Dgrd = 1
    # Partially Degraded
    Pdgd = 2
    # Offline
    OfLn = 3
    # Recovery
    Rec = 4
    # CacheCade
    Cac = 5
 def main(args):
    """ main """
    global storcli_path
    storcli_path = args.storcli_path
    data = json.loads(get_storcli_json('/cALL show all J'))
-    # exporter variables
+    # All the information is collected underneath the Controllers key
-    metric_prefix = 'megaraid_'
+    data = data['Controllers']
    metric_controller_labels = '{{controller="{}", model="{}"}}'
-    data = json.loads(get_storcli_json(args.storcli_path))
+    # try:
    #     overview = status['Response Data']['System Overview']
    # except KeyError:
    #     pass
-    # It appears that the data we need will always be present in the first
+    for controller in data:
-    # item in the Controllers array
+        response = controller['Response Data']
-    status = data['Controllers'][0]
+        if response['Version']['Driver Name'] == 'megaraid_sas':
            handle_megaraid_controller(response)
        elif response['Version']['Driver Name'] == 'mpt3sas':
            handle_sas_controller(response)
-    metrics = {
+    # print_dict_to_exporter({'controller_info': [1]}, controller_info_list)
-        'status_code': status['Command Status']['Status Code'],
+    # print_dict_to_exporter({'virtual_disk_info': [1]}, vd_info_list)
-        'controllers': status['Response Data']['Number of Controllers'],
+    # print_dict_to_exporter({'physical_disk_info': [1]}, pd_info_list)
-    }
+    # print_all_metrics(vd_metric_list)
    print_all_metrics(metric_list)
    for name, value in metrics.iteritems():
        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' ')))
        print('# TYPE {}{} gauge'.format(metric_prefix, name))
        print("{}{} {}".format(metric_prefix, name, value))
-    controller_info = []
+def handle_sas_controller(response):
    controller_metrics = {}
    overview = []
    try:
        overview = status['Response Data']['System Overview']
    except KeyError:
    pass
    for controller in overview:
        controller_index = controller['Ctl']
        model = controller['Model']
        controller_info.append(metric_controller_labels.format(controller_index, model))
-        controller_metrics = {
+def handle_megaraid_controller(response):
-            # FIXME: Parse dimmer switch options
+    controller_index = response['Basics']['Controller']
-            # 'dimmer_switch':          controller['DS'],
+    baselabel = 'controller="{}"'.format(controller_index)
-            'battery_backup_healthy':   int(controller['BBU'] == 'Opt'),
+    controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
-            'degraded':                 int(controller['Hlth'] == 'Dgd'),
+        response['Basics']['Model'],
-            'drive_groups':             controller['DGs'],
+        response['Basics']['Serial Number'],
-            'emergency_hot_spare':      int(controller['EHS'] == 'Y'),
+        response['Version']['Firmware Version'],
-            'failed':                   int(controller['Hlth'] == 'Fld'),
+    )
-            'healthy':                  int(controller['Hlth'] == 'Opt'),
+    add_metric('controller_info', controller_info_label, 1)
            'physical_drives':          controller['PDs'],
            'ports':                    controller['Ports'],
            'scheduled_patrol_read':    int(controller['sPR'] == 'On'),
            'virtual_drives':           controller['VDs'],
-            # Reverse StorCLI's logic to make metrics consistent
+    add_metric('battery_backup_healthy', baselabel, int(response['Status']['BBU Status'] == 0))
-            'drive_groups_optimal':     int(controller['DNOpt'] == 0),
+    add_metric('degraded', baselabel, int(response['Status']['Controller Status'] == 'Degraded'))
-            'virtual_drives_optimal':   int(controller['VNOpt'] == 0),
+    add_metric('failed', baselabel, int(response['Status']['Controller Status'] == 'Failed'))
-            }
+    add_metric('healthy', baselabel, int(response['Status']['Controller Status'] == 'Optimal'))
    add_metric('drive_groups', baselabel, response['Drive Groups'])
    add_metric('virtual_drives', baselabel, response['Virtual Drives'])
    add_metric('physical_drives', baselabel, response['Physical Drives'])
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))
-    for name, value in controller_metrics.iteritems():
+    time_difference_seconds = -1
-        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, name, name.replace('_', ' ')))
+    system_time = parse(response['Basics'].get('Current System Date/time'))
-        print('# TYPE {}{} gauge'.format(metric_prefix, name))
+    controller_time = parse(response['Basics'].get('Current Controller Date/Time'))
-        print('{}{}{{controller="{}"}} {}'.format(metric_prefix, name,
+    if system_time and controller_time:
-                                                  controller_index, value))
+        time_difference_seconds = abs(system_time - controller_time).seconds
        add_metric('time_difference', baselabel, time_difference_seconds)
-    if controller_info:
+    for virtual_drive in response['VD LIST']:
-        print('# HELP {}{} MegaRAID controller info'.format(metric_prefix, 'controller_info'))
+        vd_position = virtual_drive.get('DG/VD')
-        print('# TYPE {}{} gauge'.format(metric_prefix, 'controller_info'))
+        drive_group, volume_group = -1, -1
-    for labels in controller_info:
+        if vd_position:
-        print('{}{}{} {}'.format(metric_prefix, 'controller_info', labels, 1))
+            drive_group = vd_position.split('/')[0]
            volume_group = vd_position.split('/')[1]
        vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(controller_index, drive_group,
                                                                volume_group)
        vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format(
            virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE'))
        add_metric('vd_info', vd_info_label, 1)
        add_metric('vd_status', vd_baselabel, int(VD_State[virtual_drive.get('State')]))
    if response['Physical Drives'] > 0:
        data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J'))
        drive_info = data['Controllers'][controller_index]['Response Data']
    for physical_drive in response['PD LIST']:
        enclosure = physical_drive.get('EID:Slt').split(':')[0]
        slot = physical_drive.get('EID:Slt').split(':')[1]
        pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(
            controller_index, enclosure, slot)
        pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format(
            physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'),
            physical_drive.get('Model').strip())
        drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
            slot)
        try:
            info = drive_info[drive_identifier + ' - Detailed Information']
            state = info[drive_identifier + ' State']
            attributes = info[drive_identifier + ' Device attributes']
            settings = info[drive_identifier + ' Policies/Settings']
            add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
            add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count'])
            add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count'])
            add_metric('pd_predictive_errors_total', pd_baselabel,
                       state['Predictive Failure Count'])
            add_metric('pd_smart_alerted', pd_baselabel,
                       int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
            add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
            add_metric('pd_device_speed_gbps', pd_baselabel,
                       attributes['Device Speed'].split('.')[0])
            add_metric('pd_commissioned_spare', pd_baselabel,
                       int(settings['Commissioned Spare'] == 'Yes'))
            add_metric('pd_emergency_spare', pd_baselabel,
                       int(settings['Emergency Spare'] == 'Yes'))
            pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'])
        except KeyError:
            pass
        add_metric('pd_info', pd_info_label, 1)
-def get_storcli_json(storcli_path):
+def add_metric(name, labels, value):
    global metric_list
    metric_list[name].append({
        'labels': labels,
        'value': value,
    })
 def print_all_metrics(metrics):
    """Print every collected metric to stdout.

    ``metrics`` maps a metric name to a list of measurements; each
    measurement is a dict with a 'labels' string and a 'value'. For every
    name a ``# HELP`` and a ``# TYPE ... gauge`` header is printed, followed
    by one sample line per measurement. All names are prefixed with the
    module-level ``metric_prefix``.
    """
    for name, samples in metrics.items():
        # Prefixed name shared by the header lines and every sample line.
        full_name = '{}{}'.format(metric_prefix, name)
        # The help text is simply the metric name with underscores spaced out.
        readable = name.replace('_', ' ')
        print('# HELP {} MegaRAID {}'.format(full_name, readable))
        print('# TYPE {} gauge'.format(full_name))
        for sample in samples:
            label_block = '{' + sample['labels'] + '}'
            print('{}{} {}'.format(full_name, label_block, sample['value']))
 def get_storcli_json(storcli_args):
    """Get storcli output in JSON format."""
-
+    # Check if storcli is installed and executable
-    # Check if storcli is installed
+    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
-    if os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK):
+        SystemExit(1)
-        storcli_cmd = [storcli_path, 'show', 'all', 'J']
+    storcli_cmd = shlex.split(storcli_path + ' ' + storcli_args)
-        proc = subprocess.Popen(storcli_cmd, shell=False,
+    proc = subprocess.Popen(
-                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    else:
        # Create an empty dummy-JSON where storcli not installed.
        dummy_json = {"Controllers":[{
            "Command Status": {"Status Code": 0, "Status": "Success",
                               "Description": "None"},
            "Response Data": {"Number of Controllers": 0}}]}
        output_json = json.dumps(dummy_json)
-    return output_json
+    return output_json.decode("utf-8")
 if __name__ == "__main__":
-    PARSER = argparse.ArgumentParser(description=DESCRIPTION,
+    PARSER = argparse.ArgumentParser(
-                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+        description=DESCRIPTION, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    PARSER.add_argument('--storcli_path',
+    PARSER.add_argument(
-                        default='/opt/MegaRAID/storcli/storcli64',
+        '--storcli_path', default='/opt/MegaRAID/storcli/storcli64', help='path to StorCLi binary')
-                        help='path to StorCLi binary')
+    PARSER.add_argument('--version', action='version', version='%(prog)s {}'.format(VERSION))
    PARSER.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(VERSION))
    ARGS = PARSER.parse_args()
    main(ARGS)