prometheus/storage/local/instrumentation.go
beorn7 0ea5801e47 Handle errors caused by data corruption more gracefully
This requires all the panic calls upon unexpected data to be converted
into errors returned. This pollute the function signatures quite
lot. Well, this is Go...

The ideas behind this are the following:

- panic only if it's a programming error. Data corruptions happen, and
  they are not programming errors.

- If we detect a data corruption, we "quarantine" the series,
  essentially removing it from the database and putting its data into
  a separate directory for forensics.

- Failure during writing to a series file is not considered corruption
  automatically. It will call setDirty, though, so that a
  crashrecovery upon the next restart will commence and check for
  that.

- Series quarantining and setDirty calls are logged and counted in
  metrics, but are hidden from the user of the interfaces in
  interface.go, whith the notable exception of Append(). The reasoning
  is that we treat corruption by removing the corrupted series, i.e. a
  query for it will return no results on its next call anyway, so
  return no results right now. In the case of Append(), we want to
  tell the user that no data has been appended, though.

Minor side effects:

- Now consistently using filepath.* instead of path.*.

- Introduced structured logging where I touched it. This makes things
  less consistent, but a complete change to structured logging would
  be out of scope for this PR.
2016-03-02 23:02:34 +01:00

107 lines
3.3 KiB
Go

// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package local
import "github.com/prometheus/client_golang/prometheus"
// Usually, a separate file for instrumentation is frowned upon. Metrics should
// be close to where they are used. However, the metrics below are set all over
// the place, so we go for a separate instrumentation file in this case.
var (
chunkOps = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "chunk_ops_total",
Help: "The total number of chunk operations by their type.",
},
[]string{opTypeLabel},
)
chunkDescOps = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "chunkdesc_ops_total",
Help: "The total number of chunk descriptor operations by their type.",
},
[]string{opTypeLabel},
)
numMemChunkDescs = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "memory_chunkdescs",
Help: "The current number of chunk descriptors in memory.",
})
)
const (
namespace = "prometheus"
subsystem = "local_storage"
opTypeLabel = "type"
// Op-types for seriesOps.
create = "create"
archive = "archive"
unarchive = "unarchive"
memoryPurge = "purge_from_memory"
archivePurge = "purge_from_archive"
requestedPurge = "purge_on_request"
memoryMaintenance = "maintenance_in_memory"
archiveMaintenance = "maintenance_in_archive"
completedQurantine = "quarantine_completed"
droppedQuarantine = "quarantine_dropped"
failedQuarantine = "quarantine_failed"
// Op-types for chunkOps.
createAndPin = "create" // A chunkDesc creation with refCount=1.
persistAndUnpin = "persist"
pin = "pin" // Excluding the pin on creation.
unpin = "unpin" // Excluding the unpin on persisting.
clone = "clone"
transcode = "transcode"
drop = "drop"
// Op-types for chunkOps and chunkDescOps.
evict = "evict"
load = "load"
seriesLocationLabel = "location"
// Maintenance types for maintainSeriesDuration.
maintainInMemory = "memory"
maintainArchived = "archived"
)
func init() {
prometheus.MustRegister(chunkOps)
prometheus.MustRegister(chunkDescOps)
prometheus.MustRegister(numMemChunkDescs)
}
var (
// Global counter, also used internally, so not implemented as
// metrics. Collected in memorySeriesStorage.Collect.
// TODO(beorn7): As it is used internally, it is actually very bad style
// to have it as a global variable.
numMemChunks int64
// Metric descriptors for the above.
numMemChunksDesc = prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "memory_chunks"),
"The current number of chunks in memory, excluding cloned chunks (i.e. chunks without a descriptor).",
nil, nil,
)
)