implement message threading in backend and webmail

we match messages to their parents based on the "references" and "in-reply-to"
headers (requiring the same base subject), and in absence of those headers we
also match by base subject only (against messages received max 4 weeks ago).

we store a threadid with messages. all messages in a thread have the same
threadid.  messages also have "thread parent ids", which holds all id's of
parent messages up to the thread root.  then there is "thread missing link",
which is set when a referenced immediate parent wasn't found (but possibly
earlier ancestors can still be found and will be in "thread parent ids").

threads can be muted: newly delivered messages are automatically marked as
read/seen.  threads can be marked as collapsed: if set, the webmail collapses
the thread to a single item in the basic threading view (default is to expand
threads).  the muted and collapsed fields are copied from their parent on
message delivery.

the threading is implemented in the webmail. the non-threading mode still works
as before. the new default threading mode "unread" automatically expands only
the threads with at least one unread (not seen) message. the basic threading
mode "on" expands all threads except when explicitly collapsed (as saved in the
thread collapsed field). new shortcuts for navigating and interacting with
threads have been added, e.g. go to previous/next thread root, toggle
collapse/expand of thread (or double click), toggle mute of thread. some
previous shortcuts have changed, see the help for details.

message threading is added with an explicit account upgrade step,
automatically started when an account is opened. the upgrade is done in the
background because it will take too long for large mailboxes to block account
operations. the upgrade takes two steps: 1. updating all message records in the
database to add a normalized message-id and thread base subject (with "re:",
"fwd:" and several other schemes stripped). 2. going through all messages in
the database again, reading the "references" and "in-reply-to" headers from
disk, and matching against their parents. this second step is also done at the
end of each import of mbox/maildir mailboxes. new deliveries are matched
immediately against other existing messages, currently no attempt is made to
rematch previously delivered messages (which could be useful for related
messages being delivered out of order).

the threading is not yet exposed over imap.
This commit is contained in:
Mechiel Lukkien
2023-09-13 08:51:50 +02:00
parent b754b5f9ac
commit 3fb41ff073
44 changed files with 5930 additions and 821 deletions

53
message/messageid.go Normal file
View File

@ -0,0 +1,53 @@
package message
import (
"errors"
"fmt"
"strings"
"github.com/mjl-/mox/moxvar"
"github.com/mjl-/mox/smtp"
)
var errBadMessageID = errors.New("not a message-id")
// MessageIDCanonical turns a Message-ID header value into the canonical form
// used for thread matching against References/In-Reply-To: lower-cased, with the
// surrounding <> removed and without unneeded quoting of the localpart.
//
// An error wrapping errBadMessageID is returned when the value does not start
// with "<", has no closing ">", has unacceptable text after the ">", or is empty
// between the brackets.
//
// The bool result ("raw") is true when the id could not be parsed as an address
// (localpart "@" domain). In that case the lower-cased value between the <> is
// returned as is. Message-id's that don't follow the localpart @ domain syntax
// are quite common.
func MessageIDCanonical(s string) (string, bool, error) {
	// ../rfc/5322:1383
	trimmed := strings.TrimSpace(s)
	if len(trimmed) == 0 || trimmed[0] != '<' {
		return "", false, fmt.Errorf("%w: missing <", errBadMessageID)
	}

	// Seen in practice: Message-ID: <valid@valid.example> (added by postmaster@some.example)
	// Doesn't seem valid, but we allow it, unless in pedantic mode. The trailing
	// text must be separated from the ">" by a space.
	id, trailer, closed := strings.Cut(trimmed[1:], ">")
	switch {
	case !closed:
		return "", false, fmt.Errorf("%w: missing >", errBadMessageID)
	case trailer != "" && (moxvar.Pedantic || !strings.HasPrefix(trailer, " ")):
		return "", false, fmt.Errorf("%w: missing >", errBadMessageID)
	}

	// Canonical form is lower-case.
	id = strings.ToLower(id)
	if id == "" {
		return "", false, fmt.Errorf("%w: empty message-id", errBadMessageID)
	}

	addr, err := smtp.ParseAddress(id)
	if err != nil {
		// Common reasons for not being an address:
		// 1. underscore in hostname.
		// 2. ip literal instead of domain.
		// 3. two @'s, perhaps intended as time-separator
		// 4. no @'s, so no domain/host
		return id, true, nil
	}

	// Only the localpart is taken from the parsed address (dropping unneeded
	// quoting). The domain is copied from the input as is, preserving its
	// unicode-ness.
	domain := id[strings.LastIndexByte(id, '@')+1:]
	return addr.Localpart.String() + "@" + domain, false, nil
}

29
message/messageid_test.go Normal file
View File

@ -0,0 +1,29 @@
package message
import (
"errors"
"testing"
)
func TestMessageIDCanonical(t *testing.T) {
check := func(s string, expID string, expRaw bool, expErr error) {
t.Helper()
id, raw, err := MessageIDCanonical(s)
if id != expID || raw != expRaw || (expErr == nil) != (err == nil) || err != nil && !errors.Is(err, expErr) {
t.Fatalf("got message-id %q, raw %v, err %v, expected %q %v %v, for message-id %q", id, raw, err, expID, expRaw, expErr, s)
}
}
check("bogus", "", false, errBadMessageID)
check("<bogus@host", "", false, errBadMessageID)
check("bogus@host>", "", false, errBadMessageID)
check("<>", "", false, errBadMessageID)
check("<user@domain>", "user@domain", false, nil)
check("<USER@DOMAIN>", "user@domain", false, nil)
check("<user@[10.0.0.1]>", "user@[10.0.0.1]", true, nil)
check("<user@domain> (added by postmaster@isp.example)", "user@domain", false, nil)
check("<user@domain> other", "user@domain", false, nil)
check("<User@Domain@Time>", "user@domain@time", true, nil)
check("<User>", "user", true, nil)
}

View File

@ -0,0 +1,77 @@
package message
import (
"bytes"
"fmt"
"net/mail"
"net/textproto"
)
// ParseHeaderFields parses only the header fields named in "fields" from the
// complete header buffer "header". The buffer "scratch" is used as temporary
// space (its contents are overwritten), preventing lots of unneeded allocations
// when only a few headers are needed.
//
// Field names are compared case-insensitively. Lines of other fields, lines
// without a colon and lines with whitespace directly before the colon are
// skipped, including their continuation lines.
//
// If none of the requested fields is present, a nil header and nil error are
// returned. If the gathered lines cannot be parsed, the returned error wraps the
// parse error.
func ParseHeaderFields(header []byte, scratch []byte, fields [][]byte) (textproto.MIMEHeader, error) {
	// todo: should not use mail.ReadMessage, it allocates a bufio.Reader. should implement header parsing ourselves.

	// Gather the raw lines for the fields, with continuations, without the other
	// headers. Put them in a byte slice and only parse those headers. For now, use
	// mail.ReadMessage without letting it do allocations for all headers.

	// lineLen returns the length of the first line in b, including its newline
	// if present.
	lineLen := func(b []byte) int {
		if i := bytes.IndexByte(b, '\n'); i >= 0 {
			return i + 1
		}
		return len(b)
	}

	scratch = scratch[:0]
	var keepcontinuation bool
	for len(header) > 0 {
		if header[0] == ' ' || header[0] == '\t' {
			// Continuation, belongs to the preceding header line: keep it only if that
			// line was kept.
			n := lineLen(header)
			if keepcontinuation {
				scratch = append(scratch, header[:n]...)
			}
			header = header[n:]
			continue
		}

		i := bytes.IndexByte(header, ':')
		if i < 0 || i > 0 && (header[i-1] == ' ' || header[i-1] == '\t') {
			// Not a usable header line, skip it and any continuation lines.
			header = header[lineLen(header):]
			keepcontinuation = false
			continue
		}

		k := header[:i]
		keepcontinuation = false
		for _, f := range fields {
			if bytes.EqualFold(k, f) {
				keepcontinuation = true
				break
			}
		}
		n := lineLen(header)
		if keepcontinuation {
			scratch = append(scratch, header[:n]...)
		}
		header = header[n:]
	}

	if len(scratch) == 0 {
		return nil, nil
	}

	// The last gathered line may be missing its line ending (header data without
	// final newline). Terminate it, so the empty line added below really marks the
	// end of the headers.
	if scratch[len(scratch)-1] != '\n' {
		scratch = append(scratch, "\r\n"...)
	}
	scratch = append(scratch, "\r\n"...)

	msg, err := mail.ReadMessage(bytes.NewReader(scratch))
	if err != nil {
		return nil, fmt.Errorf("reading message header: %w", err)
	}
	return textproto.MIMEHeader(msg.Header), nil
}

View File

@ -0,0 +1,40 @@
package message
import (
"net/textproto"
"reflect"
"strings"
"testing"
)
// TestParseHeaderFields verifies that only requested header fields are parsed,
// with both an empty and a pre-sized scratch buffer. Line endings in the test
// headers are converted to CRLF before parsing.
func TestParseHeaderFields(t *testing.T) {
	type testcase struct {
		headers string
		fields  []string
		expHdrs textproto.MIMEHeader
		expErr  error
	}

	tests := []testcase{
		{"", []string{"subject"}, textproto.MIMEHeader(nil), nil},
		{"Subject: test\n", []string{"subject"}, textproto.MIMEHeader{"Subject": []string{"test"}}, nil},
		{"References: <id@host>\nOther: ignored\nSubject: first\nSubject: test\n\tcontinuation\n", []string{"subject", "REFERENCES"}, textproto.MIMEHeader{"References": []string{"<id@host>"}, "Subject": []string{"first", "test continuation"}}, nil},
		{":\n", []string{"subject"}, textproto.MIMEHeader(nil), nil},
		{"bad\n", []string{"subject"}, textproto.MIMEHeader(nil), nil},
		{"subject: test\n continuation without end\n", []string{"subject"}, textproto.MIMEHeader{"Subject": []string{"test continuation without end"}}, nil},
		{"subject: test\n", []string{"subject"}, textproto.MIMEHeader{"Subject": []string{"test"}}, nil},
		// Note: In go1.20, this would be interpreted as valid "Subject" header. Not in go1.21.
		{"subject \t: test\n", []string{"subject"}, textproto.MIMEHeader(nil), nil},
		// note: in go1.20, missing end of line would cause it to be ignored, in go1.21 it is used.
	}

	for _, tc := range tests {
		wanted := make([][]byte, len(tc.fields))
		for i, name := range tc.fields {
			wanted[i] = []byte(name)
		}

		// Parsing must not depend on the initial length/capacity of the scratch buffer.
		for _, scratch := range [][]byte{make([]byte, 0), make([]byte, 4*1024)} {
			input := []byte(strings.ReplaceAll(tc.headers, "\n", "\r\n"))
			hdrs, err := ParseHeaderFields(input, scratch, wanted)
			if reflect.DeepEqual(hdrs, tc.expHdrs) && reflect.DeepEqual(err, tc.expErr) {
				continue
			}
			t.Fatalf("got %v %v, expected %v %v", hdrs, err, tc.expHdrs, tc.expErr)
		}
	}
}

View File

@ -88,7 +88,7 @@ type Part struct {
// Envelope holds the basic/common message headers as used in IMAP4.
type Envelope struct {
Date time.Time
Subject string
Subject string // Q/B-word-decoded.
From []Address
Sender []Address
ReplyTo []Address

74
message/referencedids.go Normal file
View File

@ -0,0 +1,74 @@
package message
import (
"strings"
"github.com/mjl-/mox/smtp"
)
// ReferencedIDs returns the Message-IDs referenced from the References header(s),
// with a fallback to the In-Reply-To header(s). The ids are canonicalized for
// thread-matching, like with MessageIDCanonical. Empty message-id's are skipped.
//
// The In-Reply-To header(s) are only used if the References header(s) did not
// result in any id. In that case only the first message-id found is returned.
// Text before that message-id is skipped, since In-Reply-To values may contain
// free-form text in addition to a message-id.
//
// The returned error is currently always nil.
func ReferencedIDs(references []string, inReplyTo []string) ([]string, error) {
	var refids []string // In thread-canonical form.

	// parse1 parses and adds 0 or 1 reference to refids, returning the remaining
	// refs string for a next attempt. For non-empty input the returned string is
	// always shorter than the input, so callers can loop until it is empty.
	parse1 := func(refs string) string {
		refs = strings.TrimLeft(refs, " \t\r\n")
		if !strings.HasPrefix(refs, "<") {
			// To make progress, we skip to the next whitespace or >. We stop at the same
			// whitespace characters as trimmed above, so a message-id following a
			// tab/newline is not skipped along with the bogus token.
			i := strings.IndexAny(refs, " \t\r\n>")
			if i < 0 {
				return ""
			}
			return refs[i+1:]
		}
		refs = refs[1:]

		// Look for the ending > or next <. If < is before >, this entry is truncated.
		i := strings.IndexAny(refs, "<>")
		if i < 0 {
			return ""
		}
		if refs[i] == '<' {
			// Truncated entry, we ignore it.
			return refs[i:]
		}

		ref := strings.ToLower(refs[:i])
		// Some MUAs wrap References line in the middle of message-id's, and others
		// recombine them. Take out bare WSP in message-id's.
		ref = strings.ReplaceAll(ref, " ", "")
		ref = strings.ReplaceAll(ref, "\t", "")
		refs = refs[i+1:]

		// Canonicalize the quotedness of the message-id. If the reference cannot be
		// parsed as address, the raw (lower-cased) value is used.
		addr, err := smtp.ParseAddress(ref)
		if err == nil {
			// Leave the hostname form intact.
			t := strings.Split(ref, "@")
			ref = addr.Localpart.String() + "@" + t[len(t)-1]
		}
		if ref != "" {
			refids = append(refids, ref)
		}
		return refs
	}

	// References is the modern way (for a long time already) to reference ancestors.
	// The direct parent is typically at the end of the list.
	for _, refs := range references {
		for refs != "" {
			refs = parse1(refs)
		}
	}

	// We only look at the In-Reply-To header if we didn't find any References. We
	// use the first message-id we can find, skipping any text before it.
	if len(refids) == 0 {
		for _, s := range inReplyTo {
			for s != "" && len(refids) == 0 {
				s = parse1(s)
			}
			if len(refids) > 0 {
				break
			}
		}
	}

	return refids, nil
}

View File

@ -0,0 +1,35 @@
package message
import (
"strings"
"testing"
)
// TestReferencedIDs parses small messages and checks the canonicalized ids found
// in the References header, and the fallback to In-Reply-To when References has
// no usable ids.
func TestReferencedIDs(t *testing.T) {
	tests := []struct {
		msg     string
		expRefs []string
	}{
		{"References: bogus\r\n", nil},
		{"References: <User@host>\r\n", []string{"user@host"}},
		{"References: <User@tést.example>\r\n", []string{"user@tést.example"}},
		{"References: <User@xn--tst-bma.example>\r\n", []string{"user@xn--tst-bma.example"}},
		{"References: <User@bad_label.domain>\r\n", []string{"user@bad_label.domain"}},
		{"References: <truncated@hos <user@host>\r\n", []string{"user@host"}},
		{"References: <previously wrapped@host>\r\n", []string{"previouslywrapped@host"}},
		{"References: <user1@host> <user2@other.example>\r\n", []string{"user1@host", "user2@other.example"}},
		{"References: <missinghost>\r\n", []string{"missinghost"}},
		{"References: <user@host@time>\r\n", []string{"user@host@time"}},
		{"References: bogus bad <user@host>\r\n", []string{"user@host"}},
		{"In-Reply-To: <user@host> more stuff\r\nReferences: bogus bad\r\n", []string{"user@host"}},
	}

	for _, tc := range tests {
		part, err := Parse(xlog, true, strings.NewReader(tc.msg))
		tcheck(t, err, "parsing message")

		hdr, err := part.Header()
		tcheck(t, err, "parsing header")

		got, err := ReferencedIDs(hdr["References"], hdr["In-Reply-To"])
		tcheck(t, err, "parsing references/in-reply-to")
		tcompare(t, got, tc.expRefs)
	}
}

124
message/threadsubject.go Normal file
View File

@ -0,0 +1,124 @@
package message
import (
"strings"
)
// ThreadSubject returns the base subject to use for matching against other
// messages, to see if they belong to the same thread. A matching subject is
// always required to match to an existing thread, both if
// References/In-Reply-To header(s) are present, and if not.
//
// Subject should already be q/b-word-decoded.
//
// The returned base subject is lower-cased, has whitespace collapsed to single
// spaces, and has mailing list tags ("[...]"), reply/forward prefixes ("re:",
// "fw:", "fwd:"), trailing "(fwd)" and a "[fwd: ...]" wrapper removed.
// isResponse is true if a reply/forward prefix, trailing "(fwd)" or "[fwd: ...]"
// wrapper was removed.
//
// If allowNull is true, base subjects with a \0 can be returned. If not set,
// an empty string is returned if a base subject would have a \0.
func ThreadSubject(subject string, allowNull bool) (threadSubject string, isResponse bool) {
	subject = strings.ToLower(subject)

	// ../rfc/5256:101, Step 1. Drop carriage returns, and replace each run of
	// space, tab and newline with a single space.
	var b strings.Builder
	b.Grow(len(subject))
	lastSpace := false
	for _, c := range subject {
		switch c {
		case '\r':
			continue
		case ' ', '\n', '\t':
			if !lastSpace {
				b.WriteByte(' ')
				lastSpace = true
			}
		default:
			b.WriteRune(c)
			lastSpace = false
		}
	}
	s := b.String()

	// ../rfc/5256:107 ../rfc/5256:811, removing mailing list tag "[...]" and reply/forward "re"/"fwd" prefix.

	// removeBlob removes a leading subj-blob: "[", characters other than "[" and
	// "]", then "]", followed by optional whitespace. If s does not start with a
	// blob, s is returned unchanged.
	removeBlob := func(s string) string {
		if !strings.HasPrefix(s, "[") {
			return s
		}
		i := strings.IndexAny(s[1:], "[]")
		if i < 0 || s[1+i] == '[' {
			return s
		}
		// Past [...], and past the *WSP that follows the blob.
		return strings.TrimLeft(s[1+i+1:], " \t")
	}

	// ../rfc/5256:107 ../rfc/5256:811
	// removeLeader removes one leading whitespace character, and then a
	// reply/forward leader: zero or more blobs, "re"/"fw"/"fwd", optional
	// whitespace, an optional blob and a colon. If there is no such leader, only
	// the whitespace character is removed.
	removeLeader := func(s string) string {
		if strings.HasPrefix(s, " ") || strings.HasPrefix(s, "\t") {
			s = s[1:] // WSP
		}

		orig := s

		// Remove zero or more subj-blob
		for {
			prevs := s
			s = removeBlob(s)
			if prevs == s {
				break
			}
		}

		switch {
		case strings.HasPrefix(s, "re"):
			s = s[2:]
		case strings.HasPrefix(s, "fwd"):
			s = s[3:]
		case strings.HasPrefix(s, "fw"):
			s = s[2:]
		default:
			return orig
		}
		s = strings.TrimLeft(s, " \t") // *WSP
		s = removeBlob(s)
		if !strings.HasPrefix(s, ":") {
			return orig
		}
		s = s[1:]
		isResponse = true
		return s
	}

	for {
		// ../rfc/5256:104 ../rfc/5256:817, remove trailing "(fwd)" or WSP, Step 2.
		for {
			prevs := s
			s = strings.TrimRight(s, " \t")
			if strings.HasSuffix(s, "(fwd)") {
				s = strings.TrimSuffix(s, "(fwd)")
				isResponse = true
			}
			if s == prevs {
				break
			}
		}

		for {
			prevs := s
			s = removeLeader(s) // Step 3.
			// Only remove a leading blob if that leaves a non-empty base subject.
			if ns := removeBlob(s); ns != "" {
				s = ns // Step 4.
			}
			// Step 5, ../rfc/5256:123
			if s == prevs {
				break
			}
		}

		// Step 6. ../rfc/5256:128 ../rfc/5256:805
		if strings.HasPrefix(s, "[fwd:") && strings.HasSuffix(s, "]") {
			s = s[len("[fwd:") : len(s)-1]
			isResponse = true
			continue // From step 2 again.
		}
		break
	}

	if !allowNull && strings.ContainsRune(s, 0) {
		s = ""
	}

	return s, isResponse
}

View File

@ -0,0 +1,35 @@
package message
import (
"testing"
)
// TestThreadSubject checks the base subject and response flag for various
// subjects, with base subjects containing a \0 not allowed.
func TestThreadSubject(t *testing.T) {
	tests := []struct {
		subject string
		expBase string
		expResp bool
	}{
		{"test", "test", false},
		{" a b\tc\r\n d\t", "a b c d", false},
		{"test (fwd) (fwd) ", "test", true},
		{"re: test", "test", true},
		{"fw: test", "test", true},
		{"fwd: test", "test", true},
		{"fwd [tag] Test", "fwd [tag] test", false},
		{"[list] re: a b c\t", "a b c", true},
		{"[list] fw: a b c", "a b c", true},
		{"[tag1][tag2] [tag3]\t re: a b c", "a b c", true},
		{"[tag1][tag2] [tag3]\t re: a \u0000b c", "", true},
		{"[list] fw:[tag] a b c", "a b c", true},
		{"[list] re: [list] fwd: a b c\t", "a b c", true},
		{"[fwd: a b c]", "a b c", true},
		{"[fwd: [fwd: a b c]]", "a b c", true},
		{"[fwd: [list] re: a b c]", "a b c", true},
		{"[nonlist]", "[nonlist]", false},
		{"fwd [list]:", "", true},
	}

	for _, tc := range tests {
		base, isResp := ThreadSubject(tc.subject, false)
		if base == tc.expBase && isResp == tc.expResp {
			continue
		}
		t.Fatalf("got base %q, resp %v, expected %q %v for subject %q", base, isResp, tc.expBase, tc.expResp, tc.subject)
	}
}