mirror of https://github.com/mjl-/mox.git (synced 2025-07-10 07:54:40 +03:00)
better handling of outgoing tls reports to recipient domains vs hosts
based on discussion on the uta mailing list. it seems the intention of tlsrpt is to send reports only to recipient domains. but the tlsrpt rfc can also be read as sending reports to mx hosts (the "policy domain"), which makes sense given that DANE works per MX host, not per recipient domain. this change brings outgoing reports to recipient domains more in line with what most folks expect of tls reporting (i.e. per-mx-host tlsa policies are also included in the report). it also keeps reports to mx hosts working, and makes them more useful by including the recipient domains of the affected deliveries.
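The core of the change is bucketing stored TLS results into reports per recipient domain versus per MX host. A minimal, self-contained sketch of that bucketing (not the mox code itself; the result type is a hypothetical simplification of tlsrptdb.TLSResult, domain names invented):

package main

import "fmt"

// result is a hypothetical simplification of tlsrptdb.TLSResult.
type result struct {
	PolicyDomain    string
	RecipientDomain string
	DayUTC          string
	SendReport      bool
}

func main() {
	type key struct{ policyDomain, dayUTC string }

	results := []result{
		{"example.org", "example.org", "20240101", true},       // recipient domain itself (e.g. sts policy)
		{"mx1.example.net", "example.org", "20240101", true},   // MX host serving example.org (e.g. dane policy)
		{"mx1.example.net", "other.example", "20240101", true}, // same MX host, another recipient domain
	}

	rcptDoms := map[key]bool{}    // policy domain equals recipient domain
	nonRcptDoms := map[key]bool{} // MX hosts that aren't also recipient domains
	for _, e := range results {
		doms := rcptDoms
		if e.PolicyDomain != e.RecipientDomain {
			doms = nonRcptDoms
		}
		k := key{e.PolicyDomain, e.DayUTC}
		doms[k] = doms[k] || e.SendReport
	}

	fmt.Println("recipient-domain reports:", len(rcptDoms)) // 1: example.org
	fmt.Println("mx-host reports:", len(nonRcptDoms))       // 1: mx1.example.net, covering both recipient domains
}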
@@ -29,6 +29,8 @@ import (
 	"sync"
 	"time"
 
+	"golang.org/x/exp/slices"
+
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 
@@ -162,27 +164,31 @@ func sendReports(ctx context.Context, log *mlog.Log, resolver dns.Resolver, db *
 		policyDomain string
 		dayUTC       string
 	}
-	destDomains := map[key]bool{}
+
+	// Gather all policy domains we plan to send to.
+	rcptDoms := map[key]bool{}    // Results where recipient domain is equal to policy domain, regardless of IsHost.
+	nonRcptDoms := map[key]bool{} // MX domains (without those that are also recipient domains).
 	var nsend int
 	q := bstore.QueryDB[tlsrptdb.TLSResult](ctx, db)
 	q.FilterLessEqual("DayUTC", dayUTC)
 	q.SortAsc("PolicyDomain", "DayUTC", "RecipientDomain") // Sort for testability.
 	err := q.ForEach(func(e tlsrptdb.TLSResult) error {
-		k := key{e.PolicyDomain, dayUTC}
-		if e.SendReport && !destDomains[k] {
+		doms := rcptDoms
+		if e.PolicyDomain != e.RecipientDomain {
+			doms = nonRcptDoms
+		}
+		k := key{e.PolicyDomain, e.DayUTC}
+		if e.SendReport && !doms[k] {
 			nsend++
 		}
-		destDomains[k] = destDomains[k] || e.SendReport
+		doms[k] = doms[k] || e.SendReport
 		return nil
 	})
 	if err != nil {
 		return fmt.Errorf("looking for domains to send tls reports to: %v", err)
 	}
 
-	// Send report to each domain. We stretch sending over 4 hours, but only if there
-	// are quite a few message. ../rfc/8460:479
+	// Stretch sending reports over max 4 hours, but only if there are quite a few
+	// messages. ../rfc/8460:479
 	between := 4 * time.Hour
 	if nsend > 0 {
 		between = between / time.Duration(nsend)
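The pacing arithmetic above divides the four-hour window by the number of reports to send, so consecutive sends are 4h/nsend apart. A standalone check of the values (not mox code):

package main

import (
	"fmt"
	"time"
)

func main() {
	// Spread nsend reports over at most 4 hours: the gap between sends is 4h/nsend.
	for _, nsend := range []int{1, 10, 240} {
		between := 4 * time.Hour
		if nsend > 0 {
			between = between / time.Duration(nsend)
		}
		fmt.Printf("nsend=%d -> gap between sends %v\n", nsend, between)
	}
	// Output: 4h0m0s, 24m0s, 1m0s.
}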
@@ -194,61 +200,86 @@ func sendReports(ctx context.Context, log *mlog.Log, resolver dns.Resolver, db *
 	var wg sync.WaitGroup
 
 	var n int
-	for k, send := range destDomains {
-		// Cleanup results for domain that doesn't need to get a report (e.g. for TLS
-		// connections that were the result of delivering TLSRPT messages).
-		if !send {
-			removeResults(ctx, log, db, k.policyDomain, k.dayUTC)
-			continue
-		}
-
-		if n > 0 {
-			ok := sleepBetween(ctx, between)
-			if !ok {
-				return nil
-			}
-		}
-		n++
-
-		// In goroutine, so our timing stays independent of how fast we process.
-		wg.Add(1)
-		go func(policyDomain string, dayUTC string) {
-			defer func() {
-				// In case of panic don't take the whole program down.
-				x := recover()
-				if x != nil {
-					log.Error("unhandled panic in tlsrptsend sendReports", mlog.Field("panic", x))
-					debug.PrintStack()
-					metrics.PanicInc(metrics.Tlsrptdb)
-				}
-			}()
-			defer wg.Done()
-
-			rlog := log.WithCid(mox.Cid()).Fields(mlog.Field("policydomain", policyDomain), mlog.Field("daytutc", dayUTC))
-			rlog.Info("sending tls report")
-			if _, err := sendReportDomain(ctx, rlog, resolver, db, endTimeUTC, policyDomain, dayUTC); err != nil {
-				rlog.Errorx("sending tls report to domain", err)
-				metricReportError.Inc()
-			}
-		}(k.policyDomain, k.dayUTC)
-	}
-	wg.Wait()
+	remove := map[key]struct{}{}
+	var removeMutex sync.Mutex
+
+	sendDomains := func(isRcptDom bool, doms map[key]bool) {
+		for k, send := range doms {
+			// Cleanup results for domain that doesn't need to get a report (e.g. for TLS
+			// connections that were the result of delivering TLSRPT messages).
+			if !send {
+				removeMutex.Lock()
+				remove[k] = struct{}{}
+				removeMutex.Unlock()
+				continue
+			}
+
+			if n > 0 {
+				ok := sleepBetween(ctx, between)
+				if !ok {
+					return
+				}
+			}
+			n++
+
+			// In goroutine, so our timing stays independent of how fast we process.
+			wg.Add(1)
+			go func(k key) {
+				defer func() {
+					// In case of panic don't take the whole program down.
+					x := recover()
+					if x != nil {
+						log.Error("unhandled panic in tlsrptsend sendReports", mlog.Field("panic", x))
+						debug.PrintStack()
+						metrics.PanicInc(metrics.Tlsrptdb)
+					}
+				}()
+				defer wg.Done()
+
+				rlog := log.WithCid(mox.Cid()).Fields(mlog.Field("policydomain", k.policyDomain), mlog.Field("daytutc", k.dayUTC), mlog.Field("isrcptdom", isRcptDom))
+				rlog.Info("looking to send tls report for domain")
+				cleanup, err := sendReportDomain(ctx, rlog, resolver, db, endTimeUTC, isRcptDom, k.policyDomain, k.dayUTC)
+				if err != nil {
+					rlog.Errorx("sending tls report to domain", err)
+					metricReportError.Inc()
+				}
+				if cleanup {
+					removeMutex.Lock()
+					defer removeMutex.Unlock()
+					remove[k] = struct{}{}
+				}
+			}(k)
+		}
+	}
+
+	// We send to recipient domains first. That will store the reporting addresses for
+	// the recipient domains, which are used when sending to nonRcptDoms to potentially
+	// skip sending a duplicate report.
+	sendDomains(true, rcptDoms)
+	wg.Wait()
+	sendDomains(false, nonRcptDoms)
+	wg.Wait()
+
+	// Remove all records that have been processed.
+	err = db.Write(ctx, func(tx *bstore.Tx) error {
+		for k := range remove {
+			q := bstore.QueryTx[tlsrptdb.TLSResult](tx)
+			q.FilterNonzero(tlsrptdb.TLSResult{PolicyDomain: k.policyDomain, DayUTC: k.dayUTC})
+			_, err := q.Delete()
+			if err != nil {
+				return err
+			}
+		}
+		return nil
+	})
+	log.Check(err, "cleaning up tls results in database")
 
 	return nil
 }
 
-func removeResults(ctx context.Context, log *mlog.Log, db *bstore.DB, policyDomain string, dayUTC string) {
-	q := bstore.QueryDB[tlsrptdb.TLSResult](ctx, db)
-	q.FilterNonzero(tlsrptdb.TLSResult{PolicyDomain: policyDomain, DayUTC: dayUTC})
-	_, err := q.Delete()
-	log.Check(err, "removing tls results from database")
-}
-
 // replaceable for testing.
 var queueAdd = queue.Add
 
-func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver, db *bstore.DB, endUTC time.Time, policyDomain, dayUTC string) (cleanup bool, rerr error) {
+func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver, db *bstore.DB, endUTC time.Time, isRcptDom bool, policyDomain, dayUTC string) (cleanup bool, rerr error) {
 	polDom, err := dns.ParseDomain(policyDomain)
 	if err != nil {
 		return false, fmt.Errorf("parsing policy domain for sending tls reports: %v", err)
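The restructured loop is a fan-out: goroutines send reports while a mutex-protected set collects the keys whose stored results may be deleted afterwards, and the two sendDomains passes reuse the same WaitGroup. A generic, runnable sketch of that pattern (nothing mox-specific assumed):

package main

import (
	"fmt"
	"sync"
)

func main() {
	type key struct{ domain, day string }
	work := map[key]bool{{"example.org", "20240101"}: true, {"mx1.example.net", "20240101"}: false}

	var wg sync.WaitGroup
	var mu sync.Mutex
	done := map[key]struct{}{} // keys whose stored results can be cleaned up

	for k, send := range work {
		if !send {
			// Nothing to send: mark for cleanup immediately.
			mu.Lock()
			done[k] = struct{}{}
			mu.Unlock()
			continue
		}
		wg.Add(1)
		go func(k key) {
			defer wg.Done()
			// ... send the report for k here ...
			mu.Lock()
			defer mu.Unlock()
			done[k] = struct{}{}
		}(k)
	}
	wg.Wait()
	fmt.Println("cleanup candidates:", len(done))
}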
@@ -287,9 +318,8 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 
 	defer func() {
 		if !cleanup || tempError {
+			cleanup = false
 			log.Debug("not cleaning up results after attempting to send tls report")
-		} else {
-			removeResults(ctx, log, db, policyDomain, dayUTC)
 		}
 	}()
 
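With cleanup now decided by the caller, the named return value plus defer gives a single veto point covering every return site. A small sketch of the idiom (sendOne is a hypothetical function, not mox code):

package main

import "fmt"

// sendOne mimics the pattern above: a named return plus defer centralizes the
// cleanup decision, so a temporary error blocks cleanup at every return site.
func sendOne(tempError bool) (cleanup bool, rerr error) {
	defer func() {
		if !cleanup || tempError {
			cleanup = false
		}
	}()
	// ... attempt delivery here; this sketch just returns success ...
	return true, nil
}

func main() {
	ok, _ := sendOne(true)
	fmt.Println("cleanup after temporary error:", ok) // false: keep results for a retry
}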
@@ -305,6 +335,7 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 	}
 
 	var recipients []message.NameAddress
+	var recipientStrs []string
 
 	for _, l := range record.RUAs {
 		for _, s := range l {
@@ -321,6 +352,7 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 				continue
 			}
 			recipients = append(recipients, message.NameAddress{Address: addr})
+			recipientStrs = append(recipientStrs, string(s))
 		} else if u.Scheme == "https" {
 			// Although "report" is ambiguous and could mean both only the JSON data or an
 			// entire message (including DKIM-Signature) with the JSON data, it appears the
@@ -346,10 +378,12 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 			return true, nil
 		}
 
-	log.Info("sending tlsrpt report")
-
 	q := bstore.QueryDB[tlsrptdb.TLSResult](ctx, db)
-	q.FilterNonzero(tlsrptdb.TLSResult{PolicyDomain: policyDomain, DayUTC: dayUTC})
+	if isRcptDom {
+		q.FilterNonzero(tlsrptdb.TLSResult{RecipientDomain: policyDomain, DayUTC: dayUTC})
+	} else {
+		q.FilterNonzero(tlsrptdb.TLSResult{PolicyDomain: policyDomain, DayUTC: dayUTC})
+	}
 	tlsResults, err := q.List()
 	if err != nil {
 		return true, fmt.Errorf("get tls results from database: %v", err)
@@ -360,6 +394,13 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 		return true, fmt.Errorf("no tls results found")
 	}
 
+	// Stop if we already sent a report for this destination.
+	for _, r := range tlsResults {
+		if r.PolicyDomain == r.RecipientDomain && (isRcptDom && r.SentToRecipientDomain || !isRcptDom && r.SentToPolicyDomain) {
+			return true, nil
+		}
+	}
+
 	beginUTC := endUTC.Add(-24 * time.Hour)
 
 	report := tlsrpt.Report{
@@ -370,14 +411,67 @@ func sendReportDomain(ctx context.Context, log *mlog.Log, resolver dns.Resolver,
 		},
 		ContactInfo: "postmaster@" + fromDom.ASCII,
 		// todo spec: ../rfc/8460:968 ../rfc/8460:1772 ../rfc/8460:691 subject header assumes a report-id in the form of a msg-id, but example and report-id json field explanation allows free-form report-id's (assuming we're talking about the same report-id here).
-		ReportID: endUTC.Format("20060102") + "." + polDom.ASCII + "@" + fromDom.ASCII,
+		ReportID: endUTC.Add(-12*time.Hour).Format("20060102") + "." + polDom.ASCII + "@" + fromDom.ASCII,
 	}
 
+	rcptDomAddresses := map[string][]string{}
+	for _, tlsResult := range tlsResults {
+		rcptDomAddresses[tlsResult.RecipientDomain] = tlsResult.RecipientDomainReportingAddresses
+	}
+
 	// Merge all results into this report.
-	for _, tlsResult := range tlsResults {
+	// If we are sending to a recipient domain, we include all relevant policy domains,
+	// so possibly multiple MX hosts (with DANE policies). That means we may be sending
+	// multiple "no-policy-found" results (1 for sts and 0 or more for mx hosts). An
+	// explicit no-sts or no-tlsa would make these less ambiguous, but the
+	// policy-domain's will make clear which is the MX and which is the recipient
+	// domain. Only for recipient domains with an MX target equal to the recipient host
+	// could it be confusing.
+	// If we are sending to MX targets (that aren't recipient domains), we mention the
+	// affected recipient domains as policy-domain while keeping the original policy
+	// domain (MX target) in the "mx-host" field. This behaviour isn't in the RFC, but
+	// seems useful to give MX operators insight into the recipient domains affected.
+	// We also won't include results for a recipient domain if its TLSRPT policy has
+	// the same reporting addresses as the MX target TLSRPT policy.
+	for i, tlsResult := range tlsResults {
+		if !isRcptDom {
+			if slices.Equal(rcptDomAddresses[tlsResult.RecipientDomain], recipientStrs) {
+				continue
+			}
+			for j, r := range tlsResult.Results {
+				if tlsResult.IsHost {
+					tlsResults[i].Results[j].Policy.MXHost = []string{r.Policy.Domain}
+				}
+				tlsResults[i].Results[j].Policy.Domain = tlsResult.RecipientDomain
+			}
+		}
+
 		report.Merge(tlsResult.Results...)
 	}
 
+	// We may not have any results left, i.e. when this is an MX target and we already
+	// sent all results in the report to the recipient domain with identical reporting
+	// addresses.
+	if len(report.Policies) == 0 {
+		return true, nil
+	}
+
 	if !mox.Conf.Static.OutgoingTLSReportsForAllSuccess {
 		var haveFailure bool
 		// Check there is at least one failure. If not, we don't send a report.
 		for _, r := range report.Policies {
 			if r.Summary.TotalFailureSessionCount > 0 || len(r.FailureDetails) > 0 {
 				haveFailure = true
 				break
 			}
 		}
 		if !haveFailure {
 			return true, nil
 		}
 	}
 
+	log.Info("sending tls report")
+
 	reportFile, err := store.CreateMessageTemp("tlsreportout")
 	if err != nil {
 		return false, fmt.Errorf("creating temporary file for outgoing tls report: %v", err)
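For a report sent to an MX target, the merged results thus carry the recipient domain as policy-domain and the MX target in mx-host, per the comment in the hunk above. A hypothetical sketch of the resulting RFC 8460 JSON shape (all values invented; mx-host rendered as an array, as the []string in the diff suggests):

package main

import (
	"encoding/json"
	"os"
)

func main() {
	// Hypothetical shape of a report delivered to an MX target after this change:
	// policy-domain names the affected recipient domain, mx-host the MX target
	// (the original policy domain).
	report := map[string]any{
		"organization-name": "mail.example",
		"report-id":         "20240101.mx1.example.net@mail.example",
		"policies": []any{
			map[string]any{
				"policy": map[string]any{
					"policy-type":   "tlsa",
					"policy-domain": "example.org",               // recipient domain of affected deliveries
					"mx-host":       []string{"mx1.example.net"}, // MX target the DANE policy belongs to
				},
				"summary": map[string]any{
					"total-successful-session-count": 10,
					"total-failure-session-count":    2,
				},
			},
		},
	}
	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", "  ")
	_ = enc.Encode(report)
}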
@@ -440,6 +534,41 @@ Period: %s - %s UTC
 	}
 	msgSize := int64(len(msgPrefix)) + msgInfo.Size()
 
+	// Already mark the report as sent. If it won't succeed below, it probably won't
+	// succeed on a later retry either. And if we would fail to mark a report as sent
+	// after sending it, we may send duplicates or even get in some kind of sending
+	// loop.
+	err = db.Write(ctx, func(tx *bstore.Tx) error {
+		if isRcptDom {
+			q := bstore.QueryTx[tlsrptdb.TLSResult](tx)
+			q.FilterNonzero(tlsrptdb.TLSResult{DayUTC: dayUTC, RecipientDomain: policyDomain})
+			_, err := q.UpdateNonzero(tlsrptdb.TLSResult{SentToRecipientDomain: true})
+			if err != nil {
+				return fmt.Errorf("already marking tls results as sent for recipient domain: %v", err)
+			}
+
+			// Also set reporting addresses for the recipient domain results.
+			q = bstore.QueryTx[tlsrptdb.TLSResult](tx)
+			q.FilterNonzero(tlsrptdb.TLSResult{DayUTC: dayUTC, RecipientDomain: policyDomain})
+			_, err = q.UpdateNonzero(tlsrptdb.TLSResult{RecipientDomainReportingAddresses: recipientStrs})
+			if err != nil {
+				return fmt.Errorf("storing recipient domain reporting addresses: %v", err)
+			}
+		} else {
+			q := bstore.QueryTx[tlsrptdb.TLSResult](tx)
+			q.FilterNonzero(tlsrptdb.TLSResult{DayUTC: dayUTC, PolicyDomain: policyDomain})
+			_, err := q.UpdateNonzero(tlsrptdb.TLSResult{SentToPolicyDomain: true})
+			if err != nil {
+				return fmt.Errorf("already marking tls results as sent for policy domain: %v", err)
+			}
+		}
+		return nil
+	})
+	if err != nil {
+		return false, fmt.Errorf("marking tls results as sent: %v", err)
+	}
+
+	var queued bool
 	for _, rcpt := range recipients {
 		// If recipient is on suppression list, we won't queue the reporting message.
 		q := bstore.QueryDB[tlsrptdb.TLSRPTSuppressAddress](ctx, db)
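Marking results as sent before queueing trades a potentially lost report for a guarantee against duplicate or looping reports, as the comment in the transaction explains. A generic sketch of that ordering (markSent and queue are hypothetical stand-ins for the database update and message queueing):

package main

import (
	"errors"
	"fmt"
)

// sendOnce marks first: if queueing fails afterwards, at worst one report is
// lost, instead of risking duplicates (or a send loop) when marking-after-send
// were to fail.
func sendOnce(markSent func() error, queue func() error) error {
	if err := markSent(); err != nil {
		return fmt.Errorf("marking as sent: %w", err)
	}
	return queue()
}

func main() {
	err := sendOnce(
		func() error { return nil },
		func() error { return errors.New("queue unavailable") },
	)
	fmt.Println("result:", err) // report lost this time, but never duplicated
}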
@@ -466,10 +595,12 @@ Period: %s - %s UTC
 
 		err = queueAdd(ctx, log, &qm, msgf)
 		if err != nil {
-			tempError = true
+			tempError = !queued
 			log.Errorx("queueing message with tls report", err)
 			metricReportError.Inc()
 		} else {
+			queued = true
+			tempError = false
 			log.Debug("tls report queued", mlog.Field("recipient", rcpt))
 			metricReport.Inc()
 		}
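The queued/tempError interplay means a queueing failure only counts as temporary, keeping the results around for a retry, if no copy was queued to any reporting address yet. A standalone sketch of those semantics (attempt outcomes invented):

package main

import (
	"errors"
	"fmt"
)

func main() {
	// Outcomes of queueing the report to each reporting address, in order.
	attempts := []error{nil, errors.New("second address failed")}

	var queued, tempError bool
	for _, err := range attempts {
		if err != nil {
			tempError = !queued // only temporary if nothing was queued yet
		} else {
			queued = true
			tempError = false
		}
	}
	fmt.Println("queued:", queued, "tempError:", tempError) // queued: true tempError: false
}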