textfile example script rework (#1074)

* textfile smartmon.sh

Added functions to also parse megaraid disks.
Added parsing to also detect the grown_defects counters.

* textfile storcli.py

Reworked the example file to export lots more information about
megaraid attached controllers, VDs and PDs.

Signed-off-by: Christopher Blum <christopher.blum@profitbricks.com>
This commit is contained in:
Christopher Blum 2018-09-18 22:43:20 +02:00 committed by Ben Kochie
parent 1c9ea46cca
commit 6aa5cfba6c
2 changed files with 220 additions and 136 deletions

View file

@ -7,7 +7,11 @@
# data in them than you'd think. # data in them than you'd think.
# http://arstechnica.com/civis/viewtopic.php?p=22062211 # http://arstechnica.com/civis/viewtopic.php?p=22062211
parse_smartctl_attributes_awk="$(cat << 'SMARTCTLAWK' # Formatting done via shfmt -i 2
# https://github.com/mvdan/sh
parse_smartctl_attributes_awk="$(
cat <<'SMARTCTLAWK'
$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
gsub(/-/, "_"); gsub(/-/, "_");
printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4
@ -18,7 +22,8 @@ $1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ {
SMARTCTLAWK SMARTCTLAWK
)" )"
smartmon_attrs="$(cat << 'SMARTMONATTRS' smartmon_attrs="$(
cat <<'SMARTMONATTRS'
airflow_temperature_cel airflow_temperature_cel
command_timeout command_timeout
current_pending_sector current_pending_sector
@ -64,10 +69,10 @@ parse_smartctl_attributes() {
local disk_type="$2" local disk_type="$2"
local labels="disk=\"${disk}\",type=\"${disk_type}\"" local labels="disk=\"${disk}\",type=\"${disk_type}\""
local vars="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" local vars="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')"
sed 's/^ \+//g' \ sed 's/^ \+//g' |
| awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null \ awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null |
| tr A-Z a-z \ tr A-Z a-z |
| grep -E "(${smartmon_attrs})" grep -E "(${smartmon_attrs})"
} }
parse_smartctl_scsi_attributes() { parse_smartctl_scsi_attributes() {
@ -82,12 +87,14 @@ parse_smartctl_scsi_attributes() {
Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; Current_Drive_Temperature) temp_cel="$(echo ${attr_value} | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;;
Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; Blocks_read_from_cache_and_sent_to_initiator_) lbas_read="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;; Accumulated_start-stop_cycles) power_cycle="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
Elements_in_grown_defect_list) grown_defects="$(echo ${attr_value} | awk '{ printf "%e\n", $1 }')" ;;
esac esac
done done
echo "power_on_hours_raw_value{"${labels}",smart_id=\"9\"} ${power_on}" [ ! -z "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}"
echo "temperature_celsius_raw_value{"${labels}",smart_id=\"194\"} ${temp_cel}" [ ! -z "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}"
echo "total_lbas_read_raw_value{"${labels}",smart_id=\"242\"} ${lbas_read}" [ ! -z "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}"
echo "power_cycle_count_raw_value{"${labels}",smart_id=\"12\"} ${power_cycle}" [ ! -z "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}"
[ ! -z "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"12\"} ${grown_defects}"
} }
parse_smartctl_info() { parse_smartctl_info() {
@ -130,7 +137,8 @@ parse_smartctl_info() {
echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}" echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\"} ${smart_healthy}"
} }
output_format_awk="$(cat << 'OUTPUTAWK' output_format_awk="$(
cat <<'OUTPUTAWK'
BEGIN { v = "" } BEGIN { v = "" }
v != $1 { v != $1 {
print "# HELP smartmon_" $1 " SMART metric " $1; print "# HELP smartmon_" $1 " SMART metric " $1;
@ -142,8 +150,8 @@ OUTPUTAWK
)" )"
# Read metric lines on stdin, sort them, and pass the sorted stream through
# the awk program held in ${output_format_awk}, using '{' as the field
# separator so that $1 is the metric name.
format_output() {
  sort | awk -F'{' "${output_format_awk}"
}
smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')"
@ -159,13 +167,18 @@ device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')
# Emit metrics for every device in ${device_list}. Each entry has the form
# "<device>|<type>"; the list is intentionally left unquoted so that it is
# split into one word per device.
for device in ${device_list}; do
  disk="$(printf '%s\n' "${device}" | cut -f1 -d'|')"
  type="$(printf '%s\n' "${device}" | cut -f2 -d'|')"
  echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')"
  # Get the SMART information and health
  /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"
  # Get the SMART attributes
  case "${type}" in
  sat | sat+megaraid*)
    /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}"
    ;;
  scsi | megaraid*)
    /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}"
    ;;
  *)
    # Everything written to stdout inside this loop is piped into
    # format_output and would be emitted as a (malformed) metric line, so
    # the diagnostic has to go to stderr.
    echo "disk type is not sat, scsi or megaraid but ${type}" >&2
    exit
    ;;
  esac
done | format_output

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
""" """
Script to parse StorCLI's JSON output and expose Script to parse StorCLI's JSON output and expose
MegaRAID health as Prometheus metrics. MegaRAID health as Prometheus metrics.
@ -19,110 +19,181 @@ import argparse
import json import json
import os import os
import subprocess import subprocess
import shlex
from dateutil.parser import parse
import collections
from enum import IntEnum
DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as DESCRIPTION = """Parses StorCLI's JSON output and exposes MegaRAID health as
Prometheus metrics.""" Prometheus metrics."""
VERSION = '0.0.1' VERSION = '0.0.2'
# Path of the StorCLI binary; main() overwrites it from the --storcli_path
# command-line argument before any StorCLI call is made.
storcli_path = ''
# Prefix put in front of every exported metric name.
metric_prefix = 'megaraid_'
# Collected samples: metric name -> list of {'labels': ..., 'value': ...}.
# A defaultdict lets add_metric() append without creating the list first.
metric_list = collections.defaultdict(list)
class VD_State(IntEnum):
    """Integer codes for the virtual-drive state abbreviations.

    The MegaRAID handler looks a member up by name (``VD_State[...]``) using
    the virtual drive's 'State' field and exports the member's integer value
    as the ``vd_status`` metric.
    """
    Optl = 0  # Optimal
    Dgrd = 1  # Degraded
    Pdgd = 2  # Partially Degraded
    OfLn = 3  # Offline
    Rec = 4  # Recovery
    Cac = 5  # CacheCade
def main(args):
    """Collect metrics for every controller StorCLI reports and print them.

    Args:
        args: parsed command-line arguments; only ``args.storcli_path`` is
            read. It is stored in the module-level ``storcli_path`` so that
            get_storcli_json() can find the binary.
    """
    global storcli_path
    storcli_path = args.storcli_path
    data = json.loads(get_storcli_json('/cALL show all J'))

    # All the information is collected underneath the Controllers key
    for controller in data['Controllers']:
        # An entry without response data or version details has nothing to
        # export; skip it instead of aborting the whole run with a KeyError.
        response = controller.get('Response Data')
        if not isinstance(response, dict):
            continue
        driver_name = response.get('Version', {}).get('Driver Name')
        if driver_name == 'megaraid_sas':
            handle_megaraid_controller(response)
        elif driver_name == 'mpt3sas':
            handle_sas_controller(response)

    print_all_metrics(metric_list)
def handle_sas_controller(response):
    """Placeholder for 'mpt3sas' controllers: records no metrics.

    Args:
        response: the controller's 'Response Data' dict (currently unused).
    """
    return None
def handle_megaraid_controller(response):
    """Record controller, virtual-drive and physical-drive metrics.

    Args:
        response: the 'Response Data' dict of one 'megaraid_sas' controller
            as returned by ``storcli /cALL show all J``.

    Every sample is registered through add_metric(); nothing is returned.
    When the controller has physical drives, StorCLI is invoked a second time
    to fetch the per-drive details.
    """
    basics = response['Basics']
    status = response['Status']
    controller_index = basics['Controller']
    baselabel = 'controller="{}"'.format(controller_index)

    controller_info_label = baselabel + ',model="{}",serial="{}",fwversion="{}"'.format(
        basics['Model'],
        basics['Serial Number'],
        response['Version']['Firmware Version'],
    )
    add_metric('controller_info', controller_info_label, 1)

    add_metric('battery_backup_healthy', baselabel, int(status['BBU Status'] == 0))
    add_metric('degraded', baselabel, int(status['Controller Status'] == 'Degraded'))
    add_metric('failed', baselabel, int(status['Controller Status'] == 'Failed'))
    add_metric('healthy', baselabel, int(status['Controller Status'] == 'Optimal'))
    add_metric('drive_groups', baselabel, response['Drive Groups'])
    add_metric('virtual_drives', baselabel, response['Virtual Drives'])
    add_metric('physical_drives', baselabel, response['Physical Drives'])
    add_metric('ports', baselabel, response['HwCfg']['Backend Port Count'])
    add_metric('scheduled_patrol_read', baselabel,
               int('hrs' in response['Scheduled Tasks']['Patrol Read Reoccurrence']))

    # Only parse the timestamps when both are present: the date parser
    # rejects None, so parsing first would crash before the check is reached.
    system_time_text = basics.get('Current System Date/time')
    controller_time_text = basics.get('Current Controller Date/Time')
    if system_time_text and controller_time_text:
        skew = abs(parse(system_time_text) - parse(controller_time_text))
        # total_seconds() includes whole days; the .seconds attribute alone
        # would wrap around for a skew of 24 hours or more.
        add_metric('time_difference', baselabel, int(skew.total_seconds()))

    for virtual_drive in response.get('VD LIST', []):
        vd_position = virtual_drive.get('DG/VD')
        drive_group, volume_group = -1, -1
        if vd_position:
            drive_group = vd_position.split('/')[0]
            volume_group = vd_position.split('/')[1]
        vd_baselabel = 'controller="{}",DG="{}",VG="{}"'.format(
            controller_index, drive_group, volume_group)
        vd_info_label = vd_baselabel + ',name="{}",cache="{}",type="{}"'.format(
            virtual_drive.get('Name'), virtual_drive.get('Cache'), virtual_drive.get('TYPE'))
        add_metric('vd_info', vd_info_label, 1)
        # A state that VD_State does not know is skipped rather than aborting
        # the whole export with a KeyError.
        vd_state = virtual_drive.get('State')
        if vd_state in VD_State.__members__:
            add_metric('vd_status', vd_baselabel, int(VD_State[vd_state]))

    # Detailed per-drive data comes from a second StorCLI call; without
    # physical drives it stays empty and only pd_info is exported.
    drive_info = {}
    if response['Physical Drives'] > 0:
        data = json.loads(get_storcli_json('/cALL/eALL/sALL show all J'))
        drive_info = data['Controllers'][controller_index].get('Response Data', {})

    for physical_drive in response.get('PD LIST', []):
        enclosure = physical_drive.get('EID:Slt').split(':')[0]
        slot = physical_drive.get('EID:Slt').split(':')[1]

        pd_baselabel = 'controller="{}",enclosure="{}",slot="{}"'.format(
            controller_index, enclosure, slot)
        pd_info_label = pd_baselabel + ',disk_id="{}",interface="{}",media="{}",model="{}"'.format(
            physical_drive.get('DID'), physical_drive.get('Intf'), physical_drive.get('Med'),
            physical_drive.get('Model').strip())

        drive_identifier = 'Drive /c' + str(controller_index) + '/e' + str(enclosure) + '/s' + str(
            slot)
        try:
            info = drive_info[drive_identifier + ' - Detailed Information']
            state = info[drive_identifier + ' State']
            attributes = info[drive_identifier + ' Device attributes']
            settings = info[drive_identifier + ' Policies/Settings']

            add_metric('pd_shield_counter', pd_baselabel, state['Shield Counter'])
            add_metric('pd_media_errors_total', pd_baselabel, state['Media Error Count'])
            add_metric('pd_other_errors_total', pd_baselabel, state['Other Error Count'])
            add_metric('pd_predictive_errors_total', pd_baselabel,
                       state['Predictive Failure Count'])
            add_metric('pd_smart_alerted', pd_baselabel,
                       int(state['S.M.A.R.T alert flagged by drive'] == 'Yes'))
            add_metric('pd_link_speed_gbps', pd_baselabel, attributes['Link Speed'].split('.')[0])
            add_metric('pd_device_speed_gbps', pd_baselabel,
                       attributes['Device Speed'].split('.')[0])
            add_metric('pd_commissioned_spare', pd_baselabel,
                       int(settings['Commissioned Spare'] == 'Yes'))
            add_metric('pd_emergency_spare', pd_baselabel,
                       int(settings['Emergency Spare'] == 'Yes'))
            pd_info_label += ',firmware="{}"'.format(attributes['Firmware Revision'])
        except KeyError:
            # Best effort: a drive without (complete) detailed information
            # still gets its pd_info sample below.
            pass
        add_metric('pd_info', pd_info_label, 1)
def add_metric(name, labels, value):
    """Append one sample to the module-level ``metric_list``.

    Args:
        name: metric name without the metric prefix.
        labels: label pairs already rendered as text, without the braces.
        value: sample value.
    """
    # metric_list is only mutated, never rebound, so no ``global`` statement
    # is required here.
    metric_list[name].append({
        'labels': labels,
        'value': value,
    })
def print_all_metrics(metrics):
    """Print every collected metric to stdout.

    Args:
        metrics: mapping of metric name to a list of samples, each sample a
            dict with a 'labels' string and a 'value'.

    For each metric a '# HELP' and a '# TYPE ... gauge' line are printed,
    followed by one line per sample; names are prefixed with the
    module-level ``metric_prefix``.
    """
    for metric_name, samples in metrics.items():
        description = metric_name.replace('_', ' ')
        print('# HELP {}{} MegaRAID {}'.format(metric_prefix, metric_name, description))
        print('# TYPE {}{} gauge'.format(metric_prefix, metric_name))
        for sample in samples:
            label_block = '{' + sample['labels'] + '}'
            print('{}{}{} {}'.format(metric_prefix, metric_name, label_block, sample['value']))
def get_storcli_json(storcli_args):
    """Run StorCLI with the given arguments and return its stdout as text.

    Args:
        storcli_args: argument string such as '/cALL show all J'; it is split
            into individual arguments with shlex.

    Returns:
        The command's standard output decoded as UTF-8.

    Raises:
        SystemExit: with code 1 when the module-level ``storcli_path`` is not
            an executable file.
    """
    # Check if storcli is installed and executable
    if not (os.path.isfile(storcli_path) and os.access(storcli_path, os.X_OK)):
        raise SystemExit(1)
    # Keep the binary path as a single argv element so that a path containing
    # spaces is not torn apart by shlex.
    storcli_cmd = [storcli_path] + shlex.split(storcli_args)
    proc = subprocess.Popen(
        storcli_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output_json = proc.communicate()[0]
    return output_json.decode("utf-8")
if __name__ == "__main__":
    # Build the command-line interface, then hand the parsed arguments to
    # main().
    PARSER = argparse.ArgumentParser(
        description=DESCRIPTION,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    PARSER.add_argument(
        '--storcli_path',
        default='/opt/MegaRAID/storcli/storcli64',
        help='path to StorCLi binary',
    )
    PARSER.add_argument(
        '--version',
        action='version',
        version='%(prog)s {}'.format(VERSION),
    )
    ARGS = PARSER.parse_args()
    main(ARGS)