Add Prometheus metrics for errors when getting certificates through ACME (typically from Let's Encrypt)

Also add an alerting rule for these errors.
We certainly want a heads-up when there are issues with the certificates.
This commit is contained in:
Mechiel Lukkien 2025-02-06 15:12:36 +01:00
parent 1277d78cb1
commit e5e15a3965
No known key found for this signature in database
2 changed files with 29 additions and 2 deletions

View File

@ -42,6 +42,24 @@ import (
) )
var ( var (
metricMissingServerName = promauto.NewCounter(
prometheus.CounterOpts{
Name: "mox_autotls_missing_servername_total",
Help: "Number of failed TLS connection attempts with missing SNI where no fallback hostname was configured.",
},
)
metricUnknownServerName = promauto.NewCounter(
prometheus.CounterOpts{
Name: "mox_autotls_unknown_servername_total",
Help: "Number of failed TLS connection attempts with an unrecognized SNI name where no fallback hostname was configured.",
},
)
metricCertRequestErrors = promauto.NewCounter(
prometheus.CounterOpts{
Name: "mox_autotls_cert_request_errors_total",
Help: "Number of errors trying to retrieve a certificate for a hostname, possibly ACME verification errors.",
},
)
metricCertput = promauto.NewCounter( metricCertput = promauto.NewCounter(
prometheus.CounterOpts{ prometheus.CounterOpts{
Name: "mox_autotls_certput_total", Name: "mox_autotls_certput_total",
@ -171,7 +189,7 @@ func Load(name, acmeDir, contactEmail, directoryURL string, eabKeyID string, eab
return a, nil return a, nil
} }
// logigngGetCertificate is a helper to implement crypto/tls.Config.GetCertificate, // loggingGetCertificate is a helper to implement crypto/tls.Config.GetCertificate,
// optionally falling back to a certificate for fallbackHostname in case SNI is // optionally falling back to a certificate for fallbackHostname in case SNI is
// absent or for an unknown hostname. // absent or for an unknown hostname.
func (m *Manager) loggingGetCertificate(hello *tls.ClientHelloInfo, fallbackHostname dns.Domain, fallbackNoSNI, fallbackUnknownSNI bool) (*tls.Certificate, error) { func (m *Manager) loggingGetCertificate(hello *tls.ClientHelloInfo, fallbackHostname dns.Domain, fallbackNoSNI, fallbackUnknownSNI bool) (*tls.Certificate, error) {
@ -188,6 +206,7 @@ func (m *Manager) loggingGetCertificate(hello *tls.ClientHelloInfo, fallbackHost
// Handle missing SNI to prevent logging an error below. // Handle missing SNI to prevent logging an error below.
if hello.ServerName == "" { if hello.ServerName == "" {
metricMissingServerName.Inc()
log.Debug("tls request without sni servername, rejecting", slog.Any("localaddr", hello.Conn.LocalAddr()), slog.Any("supportedprotos", hello.SupportedProtos)) log.Debug("tls request without sni servername, rejecting", slog.Any("localaddr", hello.Conn.LocalAddr()), slog.Any("supportedprotos", hello.SupportedProtos))
return nil, nil return nil, nil
} }
@ -195,6 +214,7 @@ func (m *Manager) loggingGetCertificate(hello *tls.ClientHelloInfo, fallbackHost
cert, err := m.Manager.GetCertificate(hello) cert, err := m.Manager.GetCertificate(hello)
if err != nil && errors.Is(err, errHostNotAllowed) { if err != nil && errors.Is(err, errHostNotAllowed) {
if !fallbackUnknownSNI { if !fallbackUnknownSNI {
metricUnknownServerName.Inc()
log.Debugx("requesting certificate", err, slog.String("host", hello.ServerName)) log.Debugx("requesting certificate", err, slog.String("host", hello.ServerName))
return nil, nil return nil, nil
} }
@ -203,12 +223,14 @@ func (m *Manager) loggingGetCertificate(hello *tls.ClientHelloInfo, fallbackHost
hello.ServerName = fallbackHostname.ASCII hello.ServerName = fallbackHostname.ASCII
cert, err = m.Manager.GetCertificate(hello) cert, err = m.Manager.GetCertificate(hello)
if err != nil { if err != nil {
metricCertRequestErrors.Inc()
log.Errorx("requesting certificate for fallback hostname", err, slog.String("host", hello.ServerName)) log.Errorx("requesting certificate for fallback hostname", err, slog.String("host", hello.ServerName))
} else { } else {
log.Debugx("requesting certificate for fallback hostname", err, slog.String("host", hello.ServerName)) log.Debug("using certificate for fallback hostname", slog.String("host", hello.ServerName))
} }
return cert, err return cert, err
} else if err != nil { } else if err != nil {
metricCertRequestErrors.Inc()
log.Errorx("requesting certificate", err, slog.String("host", hello.ServerName)) log.Errorx("requesting certificate", err, slog.String("host", hello.ServerName))
} }
return cert, err return cert, err

View File

@ -8,6 +8,11 @@ groups:
annotations: annotations:
summary: unhandled panic summary: unhandled panic
- alert: mox-acme-request-cert-errors
expr: increase(mox_autotls_cert_request_errors_total[1h]) > 0
annotations:
summary: errors requesting tls certificates with acme
- alert: mox-ip-on-dns-blocklist - alert: mox-ip-on-dns-blocklist
expr: mox_dnsbl_ips_success < 1 expr: mox_dnsbl_ips_success < 1
annotations: annotations: