Add MegaCLI collector

This collector exports the following metrics:

- raid_drive_temperature_celsius: drive temperature in degrees Celsius
- raid_drive_count: drive error and event counters
- raid_adapter_disk_presence: disk presence per adapter
This commit is contained in:
Johannes 'fish' Ziemke 2014-07-08 16:24:29 +02:00
parent eb17b5fc9d
commit f47abc5d06
6 changed files with 770 additions and 2 deletions

View file

@ -18,5 +18,6 @@ type Collector interface {
// time.)
type Config struct {
// Config maps to the "config" JSON key and carries collector settings as
// string key/value pairs (the MegaCLI collector reads "megacli_command").
Config map[string]string `json:"config"`
// Attributes maps to the "attributes" JSON key.
// NOTE(review): nothing in the visible code reads it — confirm how it is consumed.
Attributes map[string]string `json:"attributes"`
}

View file

@ -0,0 +1,280 @@
Adapter #0
==============================================================================
Versions
================
Product Name : PERC 6/i Integrated
Serial No : 1234567890123456
FW Package Build: 6.3.3.0002
Mfg. Data
================
Mfg. Date : 06/24/08
Rework Date : 06/24/08
Revision No :
Battery FRU : N/A
Image Versions in Flash:
================
FW Version : 1.22.52-1909
BIOS Version : 2.04.00
WebBIOS Version : 1.1-46-e_15-Rel
Ctrl-R Version : 1.02-015B
Preboot CLI Version: 01.00-022:#%00005
Boot Block Version : 1.00.00.01-0011
Pending Images in Flash
================
None
PCI Info
================
Controller Id : 0000
Vendor Id : 1000
Device Id : 0060
SubVendorId : 1028
SubDeviceId : 1f0c
Host Interface : PCIE
Link Speed : 0
Number of Frontend Port: 0
Device Interface : PCIE
Number of Backend Port: 8
Port : Address
0 5000c50028f2083d
1 5000c50023cb3f39
2 5000c50023cea805
3 5000c50029124491
4 0000000000000000
5 0000000000000000
6 0000000000000000
7 0000000000000000
HW Configuration
================
SAS Address : 5a4badb01e219100
BBU : Present
Alarm : Absent
NVRAM : Present
Serial Debugger : Present
Memory : Present
Flash : Present
Memory Size : 256MB
TPM : Absent
On board Expander: Absent
Upgrade Key : Absent
Temperature sensor for ROC : Absent
Temperature sensor for controller : Absent
Settings
================
Current Time : 14:55:23 7/4, 2014
Predictive Fail Poll Interval : 300sec
Interrupt Throttle Active Count : 16
Interrupt Throttle Completion : 50us
Rebuild Rate : 30%
PR Rate : 30%
BGI Rate : 30%
Check Consistency Rate : 30%
Reconstruction Rate : 30%
Cache Flush Interval : 4s
Max Drives to Spinup at One Time : 2
Delay Among Spinup Groups : 12s
Physical Drive Coercion Mode : 128MB
Cluster Mode : Disabled
Alarm : Disabled
Auto Rebuild : Enabled
Battery Warning : Enabled
Ecc Bucket Size : 15
Ecc Bucket Leak Rate : 1440 Minutes
Restore HotSpare on Insertion : Disabled
Expose Enclosure Devices : Disabled
Maintain PD Fail History : Disabled
Host Request Reordering : Enabled
Auto Detect BackPlane Enabled : SGPIO/i2c SEP
Load Balance Mode : Auto
Use FDE Only : No
Security Key Assigned : No
Security Key Failed : No
Security Key Not Backedup : No
Default LD PowerSave Policy : Controller Defined
Maximum number of direct attached drives to spin up in 1 min : 0
Auto Enhanced Import : No
Any Offline VD Cache Preserved : No
Allow Boot with Preserved Cache : No
Disable Online Controller Reset : No
PFK in NVRAM : No
Use disk activity for locate : No
POST delay : 90 seconds
BIOS Error Handling : Stop On Errors
Current Boot Mode :Normal
Capabilities
================
RAID Level Supported : RAID0, RAID1, RAID5, RAID6, RAID00, RAID10, RAID50, RAID60, PRL 11, PRL 11 with spanning, SRL 3 supported, PRL11-RLQ0 DDF layout with no span, PRL11-RLQ0 DDF layout with span
Supported Drives : SAS, SATA
Allowed Mixing:
Mix in Enclosure Allowed
Status
================
ECC Bucket Count : 0
Limitations
================
Max Arms Per VD : 32
Max Spans Per VD : 8
Max Arrays : 128
Max Number of VDs : 64
Max Parallel Commands : 1008
Max SGE Count : 80
Max Data Transfer Size : 8192 sectors
Max Strips PerIO : 42
Max LD per array : 16
Min Strip Size : 8 KB
Max Strip Size : 1.0 MB
Max Configurable CacheCade Size: 0 GB
Current Size of CacheCade : 0 GB
Current Size of FW Cache : 0 MB
Device Present
================
Virtual Drives : 1
Degraded : 0
Offline : 0
Physical Devices : 5
Disks : 4
Critical Disks : 0
Failed Disks : 0
Supported Adapter Operations
================
Rebuild Rate : Yes
CC Rate : Yes
BGI Rate : Yes
Reconstruct Rate : Yes
Patrol Read Rate : Yes
Alarm Control : Yes
Cluster Support : No
BBU : Yes
Spanning : Yes
Dedicated Hot Spare : Yes
Revertible Hot Spares : Yes
Foreign Config Import : Yes
Self Diagnostic : Yes
Allow Mixed Redundancy on Array : No
Global Hot Spares : Yes
Deny SCSI Passthrough : No
Deny SMP Passthrough : No
Deny STP Passthrough : No
Support Security : No
Snapshot Enabled : No
Support the OCE without adding drives : No
Support PFK : No
Support PI : No
Support Boot Time PFK Change : No
Disable Online PFK Change : No
Support Shield State : No
Block SSD Write Disk Cache Change: No
Supported VD Operations
================
Read Policy : Yes
Write Policy : Yes
IO Policy : Yes
Access Policy : Yes
Disk Cache Policy : Yes
Reconstruction : Yes
Deny Locate : No
Deny CC : No
Allow Ctrl Encryption: No
Enable LDBBM : No
Support Breakmirror : No
Power Savings : No
Supported PD Operations
================
Force Online : Yes
Force Offline : Yes
Force Rebuild : Yes
Deny Force Failed : No
Deny Force Good/Bad : No
Deny Missing Replace : No
Deny Clear : No
Deny Locate : No
Support Temperature : No
NCQ : No
Disable Copyback : No
Enable JBOD : No
Enable Copyback on SMART : No
Enable Copyback to SSD on SMART Error : No
Enable SSD Patrol Read : No
PR Correct Unconfigured Areas : Yes
Error Counters
================
Memory Correctable Errors : 0
Memory Uncorrectable Errors : 0
Cluster Information
================
Cluster Permitted : No
Cluster Active : No
Default Settings
================
Phy Polarity : 0
Phy PolaritySplit : 0
Background Rate : 30
Strip Size : 64kB
Flush Time : 4 seconds
Write Policy : WB
Read Policy : None
Cache When BBU Bad : Disabled
Cached IO : No
SMART Mode : Mode 6
Alarm Disable : No
Coercion Mode : 128MB
ZCR Config : Unknown
Dirty LED Shows Drive Activity : No
BIOS Continue on Error : 0
Spin Down Mode : None
Allowed Device Type : SAS/SATA Mix
Allow Mix in Enclosure : Yes
Allow HDD SAS/SATA Mix in VD : No
Allow SSD SAS/SATA Mix in VD : No
Allow HDD/SSD Mix in VD : No
Allow SATA in Cluster : No
Max Chained Enclosures : 1
Disable Ctrl-R : No
Enable Web BIOS : No
Direct PD Mapping : Yes
BIOS Enumerate VDs : Yes
Restore Hot Spare on Insertion : No
Expose Enclosure Devices : No
Maintain PD Fail History : No
Disable Puncturing : No
Zero Based Enclosure Enumeration : Yes
PreBoot CLI Enabled : No
LED Show Drive Activity : No
Cluster Disable : Yes
SAS Disable : No
Auto Detect BackPlane Enable : SGPIO/i2c SEP
Use FDE Only : No
Enable Led Header : No
Delay during POST : 0
EnableCrashDump : No
Disable Online Controller Reset : No
EnableLDBBM : No
Un-Certified Hard Disk Drives : Block
Treat Single span R1E as R10 : No
Max LD per array : 16
Power Saving option : All power saving options are enabled
Default spin down time in minutes: 0
Enable JBOD : No
Time taken to detect CME : 60s
Exit Code: 0x00

View file

@ -0,0 +1,197 @@
Adapter #0
Enclosure Device ID: 32
Slot Number: 0
Drive's position: DiskGroup: 0, Span: 0, Arm: 0
Enclosure position: N/A
Device Id: 0
WWN:
Sequence Number: 2
Media Error Count: 0
Other Error Count: 0
Predictive Failure Count: 0
Last Predictive Failure Event Seq Number: 0
PD Type: SAS
Raw Size: 419.186 GB [0x3465f870 Sectors]
Non Coerced Size: 418.686 GB [0x3455f870 Sectors]
Coerced Size: 418.625 GB [0x34540000 Sectors]
Sector Size: 0
Firmware state: Online, Spun Up
Device Firmware Level: ES64
Shield Counter: 0
Successful diagnostics completion on : N/A
SAS Address(0): 0x5000c50028f2083d
SAS Address(1): 0x0
Connected Port Number: 0(path0)
Inquiry Data: SEAGATE ST3450857SS ES643SK26856
FDE Capable: Not Capable
FDE Enable: Disable
Secured: Unsecured
Locked: Unlocked
Needs EKM Attention: No
Foreign State: None
Device Speed: Unknown
Link Speed: Unknown
Media Type: Hard Disk Device
Drive Temperature :37C (98.60 F)
PI Eligibility: No
Drive is formatted for PI information: No
PI: No PI
Port-0 :
Port status: Active
Port's Linkspeed: Unknown
Port-1 :
Port status: Active
Port's Linkspeed: Unknown
Drive has flagged a S.M.A.R.T alert : No
Enclosure Device ID: 32
Slot Number: 1
Drive's position: DiskGroup: 0, Span: 0, Arm: 1
Enclosure position: N/A
Device Id: 1
WWN:
Sequence Number: 2
Media Error Count: 0
Other Error Count: 0
Predictive Failure Count: 0
Last Predictive Failure Event Seq Number: 0
PD Type: SAS
Raw Size: 419.186 GB [0x3465f870 Sectors]
Non Coerced Size: 418.686 GB [0x3455f870 Sectors]
Coerced Size: 418.625 GB [0x34540000 Sectors]
Sector Size: 0
Firmware state: Online, Spun Up
Device Firmware Level: ES62
Shield Counter: 0
Successful diagnostics completion on : N/A
SAS Address(0): 0x5000c50023cb3f39
SAS Address(1): 0x0
Connected Port Number: 1(path0)
Inquiry Data: SEAGATE ST3450857SS ES623SK16HLC
FDE Capable: Not Capable
FDE Enable: Disable
Secured: Unsecured
Locked: Unlocked
Needs EKM Attention: No
Foreign State: None
Device Speed: Unknown
Link Speed: Unknown
Media Type: Hard Disk Device
Drive Temperature :37C (98.60 F)
PI Eligibility: No
Drive is formatted for PI information: No
PI: No PI
Port-0 :
Port status: Active
Port's Linkspeed: Unknown
Port-1 :
Port status: Active
Port's Linkspeed: Unknown
Drive has flagged a S.M.A.R.T alert : No
Enclosure Device ID: 32
Slot Number: 2
Drive's position: DiskGroup: 0, Span: 1, Arm: 0
Enclosure position: N/A
Device Id: 2
WWN:
Sequence Number: 2
Media Error Count: 0
Other Error Count: 0
Predictive Failure Count: 0
Last Predictive Failure Event Seq Number: 0
PD Type: SAS
Raw Size: 419.186 GB [0x3465f870 Sectors]
Non Coerced Size: 418.686 GB [0x3455f870 Sectors]
Coerced Size: 418.625 GB [0x34540000 Sectors]
Sector Size: 0
Firmware state: Online, Spun Up
Device Firmware Level: ES62
Shield Counter: 0
Successful diagnostics completion on : N/A
SAS Address(0): 0x5000c50023cea805
SAS Address(1): 0x0
Connected Port Number: 2(path0)
Inquiry Data: SEAGATE ST3450857SS ES623SK189BR
FDE Capable: Not Capable
FDE Enable: Disable
Secured: Unsecured
Locked: Unlocked
Needs EKM Attention: No
Foreign State: None
Device Speed: Unknown
Link Speed: Unknown
Media Type: Hard Disk Device
Drive Temperature :39C (102.20 F)
PI Eligibility: No
Drive is formatted for PI information: No
PI: No PI
Port-0 :
Port status: Active
Port's Linkspeed: Unknown
Port-1 :
Port status: Active
Port's Linkspeed: Unknown
Drive has flagged a S.M.A.R.T alert : No
Enclosure Device ID: 32
Slot Number: 3
Drive's position: DiskGroup: 0, Span: 1, Arm: 1
Enclosure position: N/A
Device Id: 3
WWN:
Sequence Number: 2
Media Error Count: 0
Other Error Count: 0
Predictive Failure Count: 23
Last Predictive Failure Event Seq Number: 0
PD Type: SAS
Raw Size: 419.186 GB [0x3465f870 Sectors]
Non Coerced Size: 418.686 GB [0x3455f870 Sectors]
Coerced Size: 418.625 GB [0x34540000 Sectors]
Sector Size: 0
Firmware state: Online, Spun Up
Device Firmware Level: ES64
Shield Counter: 0
Successful diagnostics completion on : N/A
SAS Address(0): 0x5000c50029124491
SAS Address(1): 0x0
Connected Port Number: 3(path0)
Inquiry Data: SEAGATE ST3450857SS ES643SK27GQ9
FDE Capable: Not Capable
FDE Enable: Disable
Secured: Unsecured
Locked: Unlocked
Needs EKM Attention: No
Foreign State: None
Device Speed: Unknown
Link Speed: Unknown
Media Type: Hard Disk Device
Drive Temperature :38C (100.40 F)
PI Eligibility: No
Drive is formatted for PI information: No
PI: No PI
Port-0 :
Port status: Active
Port's Linkspeed: Unknown
Port-1 :
Port status: Active
Port's Linkspeed: Unknown
Drive has flagged a S.M.A.R.T alert : No
Exit Code: 0x00

233
collector/megacli.go Normal file
View file

@ -0,0 +1,233 @@
// +build megacli
package collector
import (
"bufio"
"io"
"os/exec"
"strconv"
"strings"
"github.com/prometheus/client_golang/prometheus"
)
// Defaults and markers used when invoking MegaCLI and parsing its output.
const (
	// adapterHeaderSep is the underline printed below each section title of
	// the adapter report; the line preceding it is taken as the section name.
	adapterHeaderSep = "================"

	// defaultMegaCli is the command executed when the configuration does not
	// provide a "megacli_command" entry.
	defaultMegaCli = "megacli"
)
// Metric vectors exported by the MegaCLI collector, together with the list of
// per-drive report fields that feed driveCounters.
var (
	// counters names the per-drive report fields exported through
	// driveCounters; each name is used verbatim as the "type" label value.
	counters = []string{
		"Media Error Count",
		"Other Error Count",
		"Predictive Failure Count",
	}

	// driveTemperature holds one gauge per drive, labelled by enclosure and slot.
	driveTemperature = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Name:      "raid_drive_temperature_celsius",
			Help:      "megacli: drive temperature",
		},
		[]string{"enclosure", "slot"},
	)

	// driveCounters holds one value per drive and counter type, labelled by
	// enclosure, slot and type.
	driveCounters = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Name:      "raid_drive_count",
			Help:      "megacli: drive error and event counters",
		},
		[]string{"enclosure", "slot", "type"},
	)

	// drivePresence holds one gauge per entry of the adapter report's
	// "Device Present" section, labelled by the entry name.
	drivePresence = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: Namespace,
			Name:      "raid_adapter_disk_presence",
			Help:      "megacli: disk presence per adapter",
		},
		[]string{"type"},
	)
)
// init registers the MegaCLI collector constructor in Factories under the
// name "megacli".
func init() {
Factories["megacli"] = NewMegaCliCollector
}
// parseMegaCliDisks parses a MegaCLI physical drive listing read from r and
// closes r before returning.
//
// The result maps enclosure device ID -> slot number -> field name -> raw
// field value. Lines are split at the first ':'; lines without one (such as
// "Adapter #X") are ignored, as are fields seen before both an
// "Enclosure Device ID" and a "Slot Number" line have been read.
//
// An error is returned when an enclosure ID or slot number is not an integer,
// or when reading from r fails; in both cases the returned map is nil.
func parseMegaCliDisks(r io.ReadCloser) (map[int]map[int]map[string]string, error) {
	defer r.Close()

	stats := map[int]map[int]map[string]string{}
	scanner := bufio.NewScanner(r)
	curEnc := -1
	curSlot := -1
	for scanner.Scan() {
		var err error
		text := strings.TrimSpace(scanner.Text())
		parts := strings.SplitN(text, ":", 2)
		if len(parts) != 2 { // Adapter #X
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])

		switch {
		case key == "Enclosure Device ID":
			curEnc, err = strconv.Atoi(value)
			if err != nil {
				return nil, err
			}
		case key == "Slot Number":
			curSlot, err = strconv.Atoi(value)
			if err != nil {
				return nil, err
			}
		case curSlot != -1 && curEnc != -1:
			if _, ok := stats[curEnc]; !ok {
				stats[curEnc] = map[int]map[string]string{}
			}
			if _, ok := stats[curEnc][curSlot]; !ok {
				stats[curEnc][curSlot] = map[string]string{}
			}
			stats[curEnc][curSlot][key] = value
		}
	}
	// Scan returns false on read errors as well as on EOF; without this check
	// a truncated listing would be reported as a complete one.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return stats, nil
}
// parseMegaCliAdapter parses a MegaCLI adapter report read from r and closes
// r before returning.
//
// The report is a sequence of sections, each introduced by a title line
// followed by an adapterHeaderSep line. The result maps section title ->
// field name -> raw field value, with lines split at the first ':'. Text that
// precedes the first section and lines without a ':' are ignored. When a
// section title occurs more than once, only the fields of its last occurrence
// are kept.
//
// An error is returned when reading from r fails; the returned map is then nil.
func parseMegaCliAdapter(r io.ReadCloser) (map[string]map[string]string, error) {
	defer r.Close()

	raidStats := map[string]map[string]string{}
	scanner := bufio.NewScanner(r)
	header := ""
	last := ""
	for scanner.Scan() {
		text := strings.TrimSpace(scanner.Text())
		if text == adapterHeaderSep {
			// The line just before the separator is the section title.
			header = last
			raidStats[header] = map[string]string{}
			continue
		}
		last = text
		if header == "" { // skip Adapter #X and separator
			continue
		}
		parts := strings.SplitN(text, ":", 2)
		if len(parts) != 2 { // these lines never include anything we are interested in
			continue
		}
		key := strings.TrimSpace(parts[0])
		value := strings.TrimSpace(parts[1])
		raidStats[header][key] = value
	}
	// Scan returns false on read errors as well as on EOF; without this check
	// a truncated report would be reported as a complete one.
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return raidStats, nil
}
// megaCliCollector gathers RAID metrics by running the MegaCLI command line
// tool and parsing its output.
type megaCliCollector struct {
// config is the configuration the collector was created with.
config Config
// cli is the MegaCLI command to execute.
cli string
}
// NewMegaCliCollector returns a Collector exposing RAID status gathered
// through MegaCLI.
//
// The command to run is taken from config.Config["megacli_command"] and falls
// back to defaultMegaCli when that entry is missing or empty. The drive
// temperature, drive counter and disk presence metric vectors are registered
// through prometheus.RegisterOrGet; the first registration error is returned.
func NewMegaCliCollector(config Config) (Collector, error) {
	command := config.Config["megacli_command"]
	if command == "" {
		command = defaultMegaCli
	}

	metrics := []prometheus.Collector{driveTemperature, driveCounters, drivePresence}
	for _, metric := range metrics {
		if _, err := prometheus.RegisterOrGet(metric); err != nil {
			return nil, err
		}
	}

	return &megaCliCollector{config: config, cli: command}, nil
}
// Update refreshes the adapter metrics and then the per-drive metrics.
//
// It returns the total number of metric values set. When the adapter step
// fails, the drive step is skipped and the adapter's count and error are
// returned; a failure of the drive step is returned together with the
// combined count.
func (c *megaCliCollector) Update() (updates int, err error) {
	updates, err = c.updateAdapter()
	if err != nil {
		return updates, err
	}

	var diskUpdates int
	diskUpdates, err = c.updateDisks()
	return updates + diskUpdates, err
}
// updateAdapter runs the configured MegaCLI command with "-AdpAllInfo -aALL",
// parses the report and sets one drivePresence gauge per entry of its
// "Device Present" section, labelled by the entry name.
//
// It returns the number of gauges set. An error is returned when the command
// cannot be started, its output cannot be parsed, it exits unsuccessfully, or
// a "Device Present" value is not numeric.
func (c *megaCliCollector) updateAdapter() (int, error) {
	cmd := exec.Command(c.cli, "-AdpAllInfo", "-aALL")
	pipe, err := cmd.StdoutPipe()
	if err != nil {
		return 0, err
	}
	if err := cmd.Start(); err != nil {
		return 0, err
	}

	stats, err := parseMegaCliAdapter(pipe)
	if err != nil {
		// The output is unusable, but the child still has to be stopped and
		// reaped or it lingers as a zombie. The parse error is the one worth
		// reporting, so the results of Kill and Wait are deliberately dropped.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
		return 0, err
	}
	if err := cmd.Wait(); err != nil {
		return 0, err
	}

	updates := 0
	for k, v := range stats["Device Present"] {
		value, err := strconv.ParseFloat(v, 64)
		if err != nil {
			return updates, err
		}
		drivePresence.WithLabelValues(k).Set(value)
		updates++
	}
	return updates, nil
}
// updateDisks runs the configured MegaCLI command with "-PDList -aALL", parses
// the listing and, for every drive, sets its driveTemperature gauge and one
// driveCounters value per entry of counters. Metrics are labelled with the
// drive's enclosure and slot (and the counter name for driveCounters).
//
// It returns the number of metric values set. An error is returned when the
// command cannot be started, its output cannot be parsed, it exits
// unsuccessfully, or a temperature or counter field is missing or not numeric.
func (c *megaCliCollector) updateDisks() (int, error) {
	cmd := exec.Command(c.cli, "-PDList", "-aALL")
	pipe, err := cmd.StdoutPipe()
	if err != nil {
		return 0, err
	}
	if err := cmd.Start(); err != nil {
		return 0, err
	}

	stats, err := parseMegaCliDisks(pipe)
	if err != nil {
		// The output is unusable, but the child still has to be stopped and
		// reaped or it lingers as a zombie. The parse error is the one worth
		// reporting, so the results of Kill and Wait are deliberately dropped.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
		return 0, err
	}
	if err := cmd.Wait(); err != nil {
		return 0, err
	}

	updates := 0
	for enc, encStats := range stats {
		for slot, slotStats := range encStats {
			// The field looks like "37C (98.60 F)"; keep the number before
			// the unit. When there is no "C" (field missing or in another
			// form) the text is passed on unchanged so ParseFloat reports an
			// error instead of the slice expression panicking on index -1.
			tStr := slotStats["Drive Temperature"]
			if i := strings.Index(tStr, "C"); i >= 0 {
				tStr = tStr[:i]
			}
			t, err := strconv.ParseFloat(tStr, 64)
			if err != nil {
				return updates, err
			}

			encStr := strconv.Itoa(enc)
			slotStr := strconv.Itoa(slot)
			driveTemperature.WithLabelValues(encStr, slotStr).Set(t)
			updates++

			for _, name := range counters {
				counter, err := strconv.ParseFloat(slotStats[name], 64)
				if err != nil {
					return updates, err
				}
				driveCounters.WithLabelValues(encStr, slotStr, name).Set(counter)
				updates++
			}
		}
	}
	return updates, nil
}

54
collector/megacli_test.go Normal file
View file

@ -0,0 +1,54 @@
// +build megacli
package collector
import (
"os"
"testing"
)
// Fixture locations and the values the parser tests expect to find in them.
const (
	// Paths of the recorded MegaCLI outputs used as test input.
	testMegaCliDisks   = "fixtures/megacli_disks.txt"
	testMegaCliAdapter = "fixtures/megacli_adapter.txt"

	// Expected raw values from the adapter fixture's "Device Present" section.
	virtualDevicesDegraded  = "0"
	physicalDevicesExpected = "5"
)
// TestMegaCliAdapter parses the adapter fixture and checks two values of its
// "Device Present" section: the physical device count and the number of
// degraded virtual drives.
func TestMegaCliAdapter(t *testing.T) {
	data, err := os.Open(testMegaCliAdapter)
	if err != nil {
		t.Fatal(err)
	}
	stats, err := parseMegaCliAdapter(data)
	if err != nil {
		t.Fatal(err)
	}

	// The parser keeps values as raw strings, so they are compared and
	// printed as strings (the previous %d verbs did not match the arguments).
	present := stats["Device Present"]
	if got := present["Physical Devices"]; got != physicalDevicesExpected {
		t.Fatalf("Unexpected device count: %q != %q", got, physicalDevicesExpected)
	}
	if got := present["Degraded"]; got != virtualDevicesDegraded {
		t.Fatalf("Unexpected degraded virtual drive count: %q != %q", got, virtualDevicesDegraded)
	}
}
// TestMegaCliDisks parses the drive listing fixture and checks the raw
// temperature of the drive in enclosure 32, slot 0 and the predictive failure
// count of the drive in enclosure 32, slot 3.
func TestMegaCliDisks(t *testing.T) {
	data, err := os.Open(testMegaCliDisks)
	if err != nil {
		t.Fatal(err)
	}
	stats, err := parseMegaCliDisks(data)
	if err != nil {
		t.Fatal(err)
	}

	if got := stats[32][0]["Drive Temperature"]; got != "37C (98.60 F)" {
		t.Fatalf("Unexpected drive temperature: %s", got)
	}
	// Report what was found so that a failure is diagnosable from the log.
	if got := stats[32][3]["Predictive Failure Count"]; got != "23" {
		t.Fatalf("Unexpected predictive failure count: %s", got)
	}
}

View file

@ -1,7 +1,10 @@
{
"attributes" : {
"default" : "1",
"web_server" : "1",
"zone" : "a",
"default" : "1"
"zone" : "a"
},
"config" : {
"megacli_command" : "megacli.sh"
}
}