This repository was archived by the owner on Apr 2, 2024. It is now read-only.

Commit 231904a

Add alerts for database check metrics.
Signed-off-by: Harkishen-Singh <[email protected]>
1 parent e7eae31 commit 231904a

6 files changed: 164 additions, 94 deletions


CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ We use the following categories for changes:
 ## [Unreleased]
 
 ## Added
-- Alerting rules for Promscale. You can find them at [here](docs/promscale_alerting.md) [#1181]
+- Alerting rules for Promscale. You can find them [here](docs/promscale_alerting.md) [#1181, #1185]
+- Add database status and request metrics [#1185]
 
 ### Fixed
 - Register `promscale_ingest_channel_len_bucket` metric and make it a gauge [#1177]

docs/mixin/alerts/alerts.yaml

Lines changed: 44 additions & 5 deletions
@@ -1,3 +1,5 @@
+# Note: Alert thresholds are experimental. Feel free to change them or suggest back at
+# Promscale channel in TimescaleDB slack.
 groups:
 - name: promscale-general
   rules:
@@ -61,7 +63,7 @@ groups:
       severity: warning
     annotations:
       summary: Slow Promscale ingestion
-      description: "90% of ingestion batch took {{ $value }} seconds to ingest."
+      description: "Slowest 10% of ingestion batch took more than {{ $value }} seconds to ingest."
   - alert: PromscaleIngestHighLatency
     expr: |
       (
@@ -81,7 +83,7 @@ groups:
       severity: critical
     annotations:
       summary: Slow Promscale ingestion
-      description: "90% of ingestion batch took {{ $value }} seconds to ingest."
+      description: "Slowest 10% of ingestion batch took more than {{ $value }} seconds to ingest."
 - name: promscale-query
   rules:
   - alert: PromscaleQueryHighErrorRate
@@ -135,7 +137,7 @@ groups:
       severity: warning
     annotations:
       summary: Slow Promscale querying
-      description: "90% of the queries took {{ $value }} seconds to evaluate."
+      description: "Slowest 10% of the queries took more than {{ $value }} seconds to evaluate."
   - alert: PromscaleQueryHighLatency
     expr: |
       (
@@ -155,7 +157,7 @@ groups:
       severity: critical
    annotations:
       summary: Slow Promscale querying
-      description: "90% of the queries took {{ $value }} seconds to evaluate."
+      description: "Slowest 10% of the queries took {{ $value }} seconds to evaluate."
 - name: promscale-cache
   rules:
   - alert: PromscaleCacheHighNumberOfEvictions
@@ -189,4 +191,41 @@ groups:
       severity: warning
     annotations:
       summary: High cache eviction in Promscale
-      description: "Promscale {{ $labels.name }} has a hit ratio of {{ $value }}."
+      description: "Promscale {{ $labels.name }} has a hit ratio of {{ $value | humanizePercentage }}."
+- name: promscale-database-connection
+  rules:
+  - alert: PromscaleStorageHighErrorRate
+    expr: |
+      (
+        sum by (job) (
+          # Error counter exists for query, query_row & exec, and not for send_batch.
+          rate(promscale_database_request_errors_total{method=~"query.*|exec"}[5m])
+        )
+        /
+        sum by (job) (
+          rate(promscale_database_requests_total{method=~"query.*|exec"}[5m])
+        )
+      ) > 0.05
+    labels:
+      severity: warning
+    annotations:
+      summary: Promscale experiences a high error rate when connecting to the database
+      description: "Promscale connection with the database has an error of {{ $value | humanizePercentage }}."
+  - alert: PromscaleStorageHighLatency
+    expr: |
+      (
+        histogram_quantile(0.9,
+          sum by (le, job, type) (
+            rate(promscale_database_requests_duration_seconds_bucket[5m])
+          )
+        ) > 5
+        and
+        sum by (job, type) (
+          rate(promscale_database_requests_duration_seconds_count[5m])
+        ) > 0
+      )
+    labels:
+      severity: warning
+    annotations:
+      summary: Slow database response
+      description: "Slowest 10% of database requests are taking more than {{ $value }} seconds to respond."

pkg/pgclient/client.go

Lines changed: 0 additions & 52 deletions
@@ -8,13 +8,10 @@ import (
 	"context"
 	"fmt"
 	"net/url"
-	"os"
-	"time"
 
 	"github.com/google/uuid"
 	"github.com/jackc/pgx/v4"
 	"github.com/jackc/pgx/v4/pgxpool"
-	"github.com/prometheus/client_golang/prometheus"
 	"go.opentelemetry.io/collector/model/pdata"
 
 	"github.com/timescale/promscale/pkg/ha"
@@ -30,7 +27,6 @@ import (
 	"github.com/timescale/promscale/pkg/query"
 	"github.com/timescale/promscale/pkg/telemetry"
 	"github.com/timescale/promscale/pkg/tenancy"
-	"github.com/timescale/promscale/pkg/util"
 )
 
 var PromscaleID uuid.UUID
@@ -60,7 +56,6 @@ type Client struct {
 	sigClose          chan struct{}
 	haService         *ha.Service
 	TelemetryEngine   telemetry.Engine
-	stopHealthChecker context.CancelFunc
 }
 
 // NewClient creates a new PostgreSQL client
@@ -185,9 +180,6 @@ func NewClientWithPool(cfg *Config, numCopiers int, connPool *pgxpool.Pool, mt t
 		}
 	}
 
-	healthCheckerCtx, stopHealthChecker := context.WithCancel(context.Background())
-	healthCheckRoutine(healthCheckerCtx, dbConn)
-
 	client := &Client{
 		Connection:        dbConn,
 		QuerierConnection: dbQuerierConn,
@@ -200,7 +192,6 @@ func NewClientWithPool(cfg *Config, numCopiers int, connPool *pgxpool.Pool, mt t
 		seriesCache:       seriesCache,
 		sigClose:          sigClose,
 		TelemetryEngine:   telemetryEngine,
-		stopHealthChecker: stopHealthChecker,
 	}
 
 	InitClientMetrics(client)
@@ -210,9 +201,6 @@ func NewClientWithPool(cfg *Config, numCopiers int, connPool *pgxpool.Pool, mt t
 // Close closes the client and performs cleanup
 func (c *Client) Close() {
 	log.Info("msg", "Shutting down Client")
-	if c.stopHealthChecker != nil {
-		c.stopHealthChecker()
-	}
 	if c.TelemetryEngine != nil {
 		c.TelemetryEngine.Stop()
 	}
@@ -309,43 +297,3 @@ func observeStatementCacheState(conn *pgx.Conn) bool {
 	statementCacheLen.Observe(float64(statementCacheSize))
 	return true
 }
-
-func healthCheckRoutine(ctx context.Context, conn pgxconn.PgxConn) {
-	r := prometheus.DefaultRegisterer
-	if env := os.Getenv("IS_TEST"); env == "true" {
-		r = prometheus.NewRegistry()
-	}
-	dbHealthChecks := prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Namespace: util.PromNamespace,
-			Subsystem: "database",
-			Name:      "health_checks_total",
-			Help:      "Total number of database health checks performed.",
-		},
-	)
-	dbHealthErrors := prometheus.NewCounter(
-		prometheus.CounterOpts{
-			Namespace: util.PromNamespace,
-			Subsystem: "database",
-			Name:      "health_check_errors_total",
-			Help:      "Total number of database health check errors.",
-		},
-	)
-	r.MustRegister(dbHealthChecks, dbHealthErrors)
-	go func() {
-		check := time.NewTicker(time.Minute)
-		defer check.Stop()
-		connection := health.NewHealthChecker(conn)
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-check.C:
-			}
-			dbHealthChecks.Inc()
-			if err := connection(); err != nil {
-				dbHealthErrors.Inc()
-			}
-		}
-	}()
-}

pkg/pgxconn/implement.go

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+package pgxconn
+
+import (
+	"time"
+
+	"github.com/jackc/pgconn"
+	"github.com/jackc/pgproto3/v2"
+	"github.com/jackc/pgx/v4"
+)
+
+// rowWithTelemetry wraps the row returned by QueryRow() for metrics telemetry.
+type rowWithTelemetry struct {
+	row pgx.Row
+}
+
+func (w rowWithTelemetry) Scan(dest ...interface{}) error {
+	err := w.row.Scan(dest...)
+	if err != nil && err != pgx.ErrNoRows {
+		errorsTotal.With(promMethodLabel("query_row")).Inc()
+	}
+	return err
+}
+
+// rowsWithDuration wraps the Query() function with duration metric taken to complete the entire execution.
+type rowsWithDuration struct {
+	rows  pgx.Rows
+	start time.Time
+}
+
+func newRowsWithDuration(rows pgx.Rows, start time.Time) rowsWithDuration {
+	return rowsWithDuration{rows, start}
+}
+
+func (r rowsWithDuration) Next() bool {
+	return r.rows.Next()
+}
+
+func (r rowsWithDuration) Scan(dest ...interface{}) error {
+	return r.rows.Scan(dest...)
+}
+
+func (r rowsWithDuration) Err() error {
+	return r.rows.Err()
+}
+
+func (r rowsWithDuration) Close() {
+	defer func() { duration.With(promMethodLabel("query")).Observe(time.Since(r.start).Seconds()) }()
+	r.rows.Close()
+}
+
+func (r rowsWithDuration) CommandTag() pgconn.CommandTag {
+	return r.rows.CommandTag()
+}
+
+func (r rowsWithDuration) FieldDescriptions() []pgproto3.FieldDescription {
+	return r.rows.FieldDescriptions()
+}
+
+func (r rowsWithDuration) Values() ([]interface{}, error) {
+	return r.rows.Values()
+}
+
+func (r rowsWithDuration) RawValues() [][]byte {
+	return r.rows.RawValues()
+}
+
+// batchResultsWithDuration wraps the SendBatch and records the duration of the batch till it closed.
+type batchResultsWithDuration struct {
+	batch pgx.BatchResults
+	start time.Time
+}
+
+func newBatchResultsWithDuration(b pgx.BatchResults, start time.Time) batchResultsWithDuration {
+	return batchResultsWithDuration{
+		batch: b,
+		start: start,
+	}
+}
+
+func (w batchResultsWithDuration) Close() error {
+	defer func() { duration.With(promMethodLabel("send_batch")).Observe(time.Since(w.start).Seconds()) }()
+	return w.batch.Close()
+}
+
+func (w batchResultsWithDuration) Exec() (pgconn.CommandTag, error) {
+	return w.batch.Exec()
+}
+
+func (w batchResultsWithDuration) Query() (pgx.Rows, error) {
+	return w.batch.Query()
+}
+
+func (w batchResultsWithDuration) QueryRow() pgx.Row {
+	return rowWithTelemetry{w.batch.QueryRow()}
+}
+
+func (w batchResultsWithDuration) QueryFunc(scans []interface{}, f func(pgx.QueryFuncRow) error) (pgconn.CommandTag, error) {
+	return w.batch.QueryFunc(scans, f)
+}
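
This file only defines the wrappers; the connection methods that apply them, together with the errorsTotal, duration and promMethodLabel identifiers they reference, live in pgxconn code that belongs to this commit but is not shown in this excerpt. The snippet below is a hypothetical sketch rather than the commit's actual connection code: connImpl, its pool field and requestsTotal are assumed names, and it presumes metric variables like those sketched after the alerting rules above.

package pgxconn

import (
	"context"
	"time"

	"github.com/jackc/pgx/v4"
	"github.com/jackc/pgx/v4/pgxpool"
)

// connImpl is an assumed name for the pgx-backed connection type; it stands in for
// whatever type in this package actually implements PgxConn.
type connImpl struct {
	pool *pgxpool.Pool
}

// Query counts the request, forwards it to the pool, counts a failure if one occurs,
// and hands the rows back wrapped so the duration is observed when they are closed.
func (p *connImpl) Query(ctx context.Context, sql string, args ...interface{}) (pgx.Rows, error) {
	start := time.Now()
	requestsTotal.With(promMethodLabel("query")).Inc()
	rows, err := p.pool.Query(ctx, sql, args...)
	if err != nil {
		errorsTotal.With(promMethodLabel("query")).Inc()
		return nil, err
	}
	return newRowsWithDuration(rows, start), nil
}

// SendBatch has no error counter (see the comment in the error-rate alert expression);
// its duration is observed when the returned batch results are closed.
func (p *connImpl) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults {
	requestsTotal.With(promMethodLabel("send_batch")).Inc()
	return newBatchResultsWithDuration(p.pool.SendBatch(ctx, b), time.Now())
}

Recording the duration in Close(), as the wrappers above do, means the histogram covers the full round trip including the time the caller spends consuming rows, which is the latency the PromscaleStorageHighLatency alert reports on.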
