imapserver: implement PREVIEW extension (RFC 8970), and store previews in message database

We were already generating previews of plain text parts for the webmail interface, but we didn't store them, so we were regenerating the previews each time messages were listed. Now we store previews in the database for faster handling. We also generate previews for html parts if needed, using the first part that has textual content. For IMAP, the previews can be requested by an IMAP client. When we get the "LAZY" variant, which doesn't require us to generate a preview, we generate it anyway, because it should be fast enough. That way clients don't have to first ask for "PREVIEW (LAZY)" and then send another request for "PREVIEW". We now also generate a preview when a message is added to the account, except for imports: generating previews would slow imports down, the previews aren't urgent, and they will be generated on demand at first request.
2025-07-14 12:14:38 +03:00 · 2025-03-28 16:57:44 +01:00
parent 8b418a9ca2
commit aa631c604c
23 changed files with 735 additions and 187 deletions
--- a/message/preview.go
+++ b/message/preview.go
@ -0,0 +1,350 @@
+package message
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"regexp"
+	"slices"
+	"strings"
+
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+
+	"github.com/mjl-/mox/mlog"
+	"github.com/mjl-/mox/moxio"
+)
+
+// Preview produces a preview text for a message part. It uses the first
+// text/plain or text/html part, searching sub parts in order, that yields textual
+// content. The result holds at most 256 characters (possibly more bytes). Callers
+// may want to truncate and trim trailing whitespace before using the preview.
+//
+// Problems with invalid messages are logged at debug level. An error is only
+// returned for serious errors, like i/o errors.
+func (p Part) Preview(log mlog.Log) (string, error) {
+	// ../rfc/8970:190
+
+	// Parts marked as attachment through Content-Disposition are never used. A
+	// disposition that cannot be parsed is logged, and the part is still considered.
+	if disposition, _, err := p.DispositionFilename(); err != nil {
+		log.Debugx("parsing disposition/filename", err)
+	} else if strings.EqualFold(disposition, "attachment") {
+		return "", nil
+	}
+
+	mediaType := p.MediaType + "/" + p.MediaSubType
+
+	// No preview is made for encrypted messages.
+	if mediaType == "MULTIPART/ENCRYPTED" {
+		return "", nil
+	}
+
+	// Plain text, also when both media type and subtype are empty ("/").
+	if mediaType == "TEXT/PLAIN" || mediaType == "/" {
+		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
+		preview, err := previewText(limited)
+		if err != nil {
+			return "", fmt.Errorf("making preview from text part: %v", err)
+		}
+		return preview, nil
+	}
+
+	if mediaType == "TEXT/HTML" {
+		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
+
+		// The HTML is first turned into text. Failure to do so is only logged.
+		text, err := previewHTML(limited)
+		if err != nil {
+			log.Debugx("parsing html part for preview (ignored)", err)
+			return "", nil
+		}
+
+		// The text is then turned into a preview like a plain text body.
+		preview, err := previewText(strings.NewReader(text))
+		if err != nil {
+			return "", fmt.Errorf("making preview from text from html: %v", err)
+		}
+		return preview, nil
+	}
+
+	// For any other type, try the sub parts in order until one returns a preview or
+	// an error. Of a multipart/signed, only the first sub part is considered.
+	candidates := p.Parts
+	if mediaType == "MULTIPART/SIGNED" && len(candidates) > 1 {
+		candidates = candidates[:1]
+	}
+	for _, sub := range candidates {
+		preview, err := sub.Preview(log)
+		if err != nil || preview != "" {
+			return preview, err
+		}
+	}
+	return "", nil
+}
+
+// previewText returns text the client can display next to the subject line in a
+// mailbox, made from the lines read from r. Quoted text (lines starting with ">"),
+// and any "On ... wrote:" line leading up to it, is replaced with "[...]" so only
+// new and useful information is displayed. Empty lines are dropped, and a trailing
+// signature is not included. Each line kept ends with a newline in the result,
+// which is limited to 256 characters.
+//
+// Lines can be long, e.g. text extracted from HTML without line breaks. The
+// scanner buffer is allowed to grow well beyond the default limit of 64KB, so
+// such input results in a (truncated) preview instead of bufio.ErrTooLong.
+//
+// An error is returned only if reading from r fails.
+func previewText(r io.Reader) (string, error) {
+	// Preview reads at most 1MB of message data for a part. Allow lines to be larger
+	// than that, so a part consisting of a single long line can still be handled.
+	const maxLineSize = 2 * 1024 * 1024
+
+	// We look quite a bit of lines ahead for trailing signatures with trailing empty lines.
+	var lines []string
+	scanner := bufio.NewScanner(r)
+	scanner.Buffer(nil, maxLineSize)
+	ensureLines := func() {
+		for len(lines) < 10 && scanner.Scan() {
+			lines = append(lines, strings.TrimSpace(scanner.Text()))
+		}
+	}
+	ensureLines()
+
+	// isSnipped returns whether a line is a marker for removed text, as written by
+	// an author or as generated by us.
+	isSnipped := func(s string) bool {
+		return s == "[...]" || s == "[…]" || s == "..."
+	}
+
+	// nextLineQuoted returns whether the line after lines[i], skipping one optional
+	// empty line, is quoted text or a snip marker.
+	nextLineQuoted := func(i int) bool {
+		if i+1 < len(lines) && lines[i+1] == "" {
+			i++
+		}
+		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
+	}
+
+	// Remainder is signature if we see a line with only and minimum 2 dashes, and
+	// there are no more empty lines, and there aren't more than 5 lines left.
+	isSignature := func() bool {
+		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
+			return false
+		}
+		l := lines[1:]
+		for len(l) > 0 && l[len(l)-1] == "" {
+			l = l[:len(l)-1]
+		}
+		if len(l) >= 5 {
+			return false
+		}
+		return !slices.Contains(l, "")
+	}
+
+	result := ""
+
+	// resultSnipped returns whether result currently ends with a snip marker. Lines
+	// are always added to result with a trailing newline, so both markers must be
+	// matched including that newline.
+	resultSnipped := func() bool {
+		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
+	}
+
+	// Quick check for initial wrapped "On ... wrote:" line.
+	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
+		result = "[...]\n"
+		lines = lines[3:]
+		ensureLines()
+	}
+
+	for ; len(lines) > 0 && !isSignature(); ensureLines() {
+		line := lines[0]
+		if strings.HasPrefix(line, ">") {
+			if !resultSnipped() {
+				result += "[...]\n"
+			}
+			lines = lines[1:]
+			continue
+		}
+		if line == "" {
+			lines = lines[1:]
+			continue
+		}
+		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
+		// line, with an optional empty line in between. If we don't have any text yet, we
+		// don't require the digits.
+		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
+			if !resultSnipped() {
+				result += "[...]\n"
+			}
+			lines = lines[1:]
+			continue
+		}
+		// Skip possibly duplicate snipping by author.
+		if !isSnipped(line) || !resultSnipped() {
+			result += line + "\n"
+		}
+		lines = lines[1:]
+		// We have enough for a preview, no need to read further.
+		if len(result) > 250 {
+			break
+		}
+	}
+
+	// Limit number of characters (not bytes). ../rfc/8970:200
+	// To 256 characters. ../rfc/8970:211
+	var o, n int
+	for o = range result {
+		n++
+		if n > 256 {
+			// o is the byte offset of the 257th character.
+			result = result[:o]
+			break
+		}
+	}
+
+	return result, scanner.Err()
+}
+
+var (
+	// ignoreAtoms holds the html elements of which the text, including text in
+	// nested elements, is left out of the preview.
+	ignoreAtoms = atomMap(
+		atom.Dialog,
+		atom.Head,
+		atom.Map,
+		atom.Math,
+		atom.Script,
+		atom.Style,
+		atom.Svg,
+		atom.Template,
+	)
+
+	// inlineAtoms holds the inline html elements. Unlike for other elements, no
+	// newline is forced at the beginning and end of their text.
+	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
+	inlineAtoms = atomMap(
+		atom.A,
+		atom.Abbr,
+		atom.B,
+		atom.Bdi,
+		atom.Bdo,
+		atom.Cite,
+		atom.Code,
+		atom.Data,
+		atom.Dfn,
+		atom.Em,
+		atom.I,
+		atom.Kbd,
+		atom.Mark,
+		atom.Q,
+		atom.Rp,
+		atom.Rt,
+		atom.Ruby,
+		atom.S,
+		atom.Samp,
+		atom.Small,
+		atom.Span,
+		atom.Strong,
+		atom.Sub,
+		atom.Sup,
+		atom.Time,
+		atom.U,
+		atom.Var,
+		atom.Wbr,
+
+		atom.Del,
+		atom.Ins,
+
+		// Table cells are handled specially: a space is added after them instead of a
+		// newline.
+		atom.Td,
+		atom.Th,
+	)
+)
+
+// atomMap returns a map with the value true for each of the atoms in l, for use
+// as a set.
+func atomMap(l ...atom.Atom) map[atom.Atom]bool {
+	set := make(map[atom.Atom]bool, len(l))
+	for i := range l {
+		set[l[i]] = true
+	}
+	return set
+}
+
+// Patterns for cleaning up text gathered from html.
+var (
+	// One or more spaces and/or tabs, replaced with a single space.
+	regexpSpace = regexp.MustCompile(`[ \t]+`)
+
+	// Three or more consecutive newlines, replaced with a single newline.
+	regexpNewline = regexp.MustCompile(`\n\n\n+`)
+
+	// Two or more consecutive non-breaking spaces and/or zero-width characters
+	// (zero-width space, non-joiner, joiner). These are removed: such combinations
+	// don't make sense and are presumably generated.
+	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
+)
+
+// previewHTML parses r as HTML and returns the text from its text nodes, for
+// further processing by previewText. Text inside elements in ignoreAtoms is
+// skipped. A newline is added after elements that are not in inlineAtoms, a space
+// after table cells (td, th). Text inside blockquote elements gets a "> " prefix
+// per quote level on each line, marking it as quoted text. Walking the document
+// stops once at least 4k bytes of text have been gathered.
+//
+// An error is returned if parsing fails, or if an error node is encountered.
+func previewHTML(r io.Reader) (string, error) {
+	// Stack/state, based on elements.
+	var ignores []bool
+	var inlines []bool
+
+	var text string // Collecting text.
+	var err error   // Set when walking DOM.
+	var quoteLevel int
+
+	// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
+	// whether we are in an inline or block element, and building up the text. We stop
+	// when we have enough data, returning false in that case.
+	var walk func(n *html.Node) bool
+	walk = func(n *html.Node) bool {
+		switch n.Type {
+		case html.ErrorNode:
+			err = fmt.Errorf("unexpected error node")
+			return false
+
+		case html.ElementNode:
+			ignores = append(ignores, ignoreAtoms[n.DataAtom])
+			inline := inlineAtoms[n.DataAtom]
+			inlines = append(inlines, inline)
+			if n.DataAtom == atom.Blockquote {
+				quoteLevel++
+			}
+			// Restore state and add the separator after the element when done with this
+			// element and its children, also when stopping early.
+			defer func() {
+				if n.DataAtom == atom.Blockquote {
+					quoteLevel--
+				}
+				if !inline && !strings.HasSuffix(text, "\n\n") {
+					text += "\n"
+				} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
+					text += " "
+				}
+
+				ignores = ignores[:len(ignores)-1]
+				inlines = inlines[:len(inlines)-1]
+			}()
+
+		case html.TextNode:
+			if slices.Contains(ignores, true) {
+				return true
+			}
+			// Drop carriage returns and turn tabs into spaces. Newlines are kept.
+			s := strings.Map(func(c rune) rune {
+				switch c {
+				case '\r':
+					return -1
+				case '\t':
+					return ' '
+				}
+				return c
+			}, n.Data)
+			s = regexpSpace.ReplaceAllString(s, " ")
+			s = regexpNewline.ReplaceAllString(s, "\n")
+			s = regexpZeroWidth.ReplaceAllString(s, "")
+
+			inline := len(inlines) > 0 && inlines[len(inlines)-1]
+			ts := strings.TrimSpace(s)
+			if !inline && ts == "" {
+				// Whitespace-only text directly in a block element is not added.
+				break
+			}
+			if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
+				if quoteLevel > 0 {
+					q := strings.Repeat("> ", quoteLevel)
+					var sb strings.Builder
+					for line := range strings.Lines(s) {
+						sb.WriteString(q)
+						sb.WriteString(line)
+					}
+					s = sb.String()
+				}
+				text += s
+			}
+			// We need to generate at most 256 characters of preview. The text we're gathering
+			// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
+			// 4k bytes is enough to read.
+			if len(text) >= 4*1024 {
+				return false
+			}
+		}
+		// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
+
+		for cn := range n.ChildNodes() {
+			if !walk(cn) {
+				// Propagate the request to stop to our caller, so the walk of the entire
+				// document ends, not only the walk of the remaining children of this node.
+				return false
+			}
+		}
+
+		return true
+	}
+
+	node, err := html.Parse(r)
+	if err != nil {
+		return "", fmt.Errorf("parsing html: %v", err)
+	}
+
+	// Build text.
+	walk(node)
+
+	text = strings.TrimSpace(text)
+	text = regexpSpace.ReplaceAllString(text, " ")
+	return text, err
+}
--- a/message/preview_test.go
+++ b/message/preview_test.go
@ -0,0 +1,159 @@
+package message
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"log/slog"
+	"mime/multipart"
+	"net/textproto"
+	"strings"
+	"testing"
+
+	"github.com/mjl-/mox/mlog"
+)
+
+// TestPreviewText checks the preview text generated for plain text bodies:
+// replacing quoted text and "On ... wrote:" lines with a snip marker, and leaving
+// out signatures.
+func TestPreviewText(t *testing.T) {
+	// Each case is a body with the preview expected for it.
+	cases := []struct {
+		body string
+		exp  string
+	}{
+		{"", ""},
+		{"single line", "single line\n"},
+		{"single line\n", "single line\n"},
+		{"> quoted\n", "[...]\n"},
+		{"> quoted\nresponse\n", "[...]\nresponse\n"},
+		{"> quoted\n[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
+		{"[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
+		{"[…]\nresponse after author snip\n", "[…]\nresponse after author snip\n"},
+		{">> quoted0\n> quoted1\n>quoted2\n[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
+		{">quoted\n\n>quoted\ncoalesce line-separated quotes\n", "[...]\ncoalesce line-separated quotes\n"},
+		{"On <date> <user> wrote:\n> hi\nresponse", "[...]\nresponse\n"},
+		{"On <longdate>\n<user> wrote:\n> hi\nresponse", "[...]\nresponse\n"},
+		{"> quote\nresponse\n--\nsignature\n", "[...]\nresponse\n"},
+		{"> quote\nline1\nline2\nline3\n", "[...]\nline1\nline2\nline3\n"},
+	}
+
+	for _, tc := range cases {
+		got, err := previewText(strings.NewReader(tc.body))
+		tcompare(t, err, nil)
+		if got != tc.exp {
+			t.Fatalf("got %q, expected %q, for body %q", got, tc.exp, tc.body)
+		}
+	}
+}
+
+// tcompose composes a message for use in tests. The typeContents are pairs of a
+// type and content, both passed to the composer's TextPart (e.g. "plain", "the
+// text"). With more than one pair, the parts are written in a
+// multipart/alternative. The composed message is returned as a reader.
+func tcompose(t *testing.T, typeContents ...string) *bytes.Reader {
+	var msg bytes.Buffer
+
+	xc := NewComposer(&msg, 100*1024, true)
+	xc.Header("MIME-Version", "1.0")
+
+	// cur is the multipart that parts are currently added to, nil while at the top
+	// level of the message.
+	var cur, alt *multipart.Writer
+
+	// xcreateMultipart starts a new multipart, at the top level or as part of the
+	// current multipart, and makes it the current multipart.
+	xcreateMultipart := func(subtype string) *multipart.Writer {
+		mp := multipart.NewWriter(xc)
+		if cur != nil {
+			_, err := cur.CreatePart(textproto.MIMEHeader{"Content-Type": []string{fmt.Sprintf(`multipart/%s; boundary="%s"`, subtype, mp.Boundary())}})
+			tcheck(t, err, "adding multipart")
+		} else {
+			xc.Header("Content-Type", fmt.Sprintf(`multipart/%s; boundary="%s"`, subtype, mp.Boundary()))
+			xc.Line()
+		}
+		cur = mp
+		return mp
+	}
+
+	// xcreatePart returns a writer for the body of a new part with the given
+	// headers: in the current multipart if any, otherwise the message itself.
+	xcreatePart := func(header textproto.MIMEHeader) io.Writer {
+		if cur != nil {
+			part, err := cur.CreatePart(header)
+			tcheck(t, err, "adding part")
+			return part
+		}
+		for key, values := range header {
+			for _, value := range values {
+				xc.Header(key, value)
+			}
+		}
+		xc.Line()
+		return xc
+	}
+
+	npairs := len(typeContents) / 2
+	if npairs > 1 {
+		alt = xcreateMultipart("alternative")
+	}
+	for i := 0; i < len(typeContents); i += 2 {
+		typ, content := typeContents[i], typeContents[i+1]
+		body, ct, cte := xc.TextPart(typ, content)
+		w := xcreatePart(textproto.MIMEHeader{"Content-Type": []string{ct}, "Content-Transfer-Encoding": []string{cte}})
+		_, err := w.Write([]byte(body))
+		tcheck(t, err, "write part")
+	}
+	if alt != nil {
+		err := alt.Close()
+		tcheck(t, err, "close multipart")
+	}
+	xc.Flush()
+
+	return bytes.NewReader(msg.Bytes())
+}
+
+// TestPreviewHTML checks the previews generated for composed messages with
+// plain text and/or html parts.
+func TestPreviewHTML(t *testing.T) {
+	// HTML with ignored elements, inline elements and tables.
+	const moreHTML = `<!doctype html>
+<html>
+	<head>
+		<title>title</title>
+		<style>head style</style>
+		<script>head script</script>
+	</head>
+<body>
+<script>body script</script>
+<style>body style</style>
+<div>line1</div>
+<div>line2</div>
+<div><a href="about:blank">link1   </a> text <span>word</span><span>word2</span>.</div>
+<table><tr><td>col1</td><th>col2</th></tr><tr><td>row2</td></tr></table>
+</body></html>
+`
+	const moreExp = `line1
+line2
+link1 text wordword2.
+col1 col2
+row2
+`
+
+	cases := []struct {
+		typeContents []string // Passed to tcompose.
+		exp          string   // Expected preview.
+	}{
+		// We use the first part for the preview.
+		{[]string{"plain", "the text", "html", "<html><body>the html</body></html>"}, "the text\n"},
+
+		// HTML before text.
+		{[]string{"html", "<body>the html</body>", "plain", "the text"}, "the html\n"},
+
+		// Only text.
+		{[]string{"plain", "the text"}, "the text\n"},
+
+		// Only html.
+		{[]string{"html", "<body>the html</body>"}, "the html\n"},
+
+		// No preview
+		{[]string{"other", "other text"}, ""},
+
+		// HTML with quoted text.
+		{[]string{"html", "<html><div>On ... someone wrote:</div><blockquote>something worth replying</blockquote><div>agreed</div></body>"}, "[...]\nagreed\n"},
+
+		// HTML with ignored elements, inline elements and tables.
+		{[]string{"html", moreHTML}, moreExp},
+	}
+
+	for _, tc := range cases {
+		m := tcompose(t, tc.typeContents...)
+
+		p, err := Parse(slog.Default(), false, m)
+		tcheck(t, err, "parse")
+		err = p.Walk(slog.Default(), nil)
+		tcheck(t, err, "walk")
+		log := mlog.New("message", nil)
+		got, err := p.Preview(log)
+		tcheck(t, err, "preview")
+		tcompare(t, got, tc.exp)
+	}
+}