imapserver: implement PREVIEW extension (RFC 8970), and store previews in message database

We were already generating previews of plain text parts for the webmail
interface, but we didn't store them, so we were regenerating the previews each
time messages were listed.

Now we store previews in the database for faster handling. And we also generate
previews for html parts if needed. We use the first part that has textual
content.

For IMAP, the previews can be requested by an IMAP client. When we get the
"LAZY" variant, which doesn't require us to generate a preview, we generate it
anyway, because it should be fast enough. That way, clients don't have to first
ask for "PREVIEW (LAZY)" and then send another request for "PREVIEW".

We now also generate a preview when a message is added to the account, except
for imports: generating previews there would slow the import down, the previews
aren't urgent, and they will be generated on demand at first request.
This commit is contained in:
Mechiel Lukkien
2025-03-28 16:57:44 +01:00
parent 8b418a9ca2
commit aa631c604c
23 changed files with 735 additions and 187 deletions

350
message/preview.go Normal file
View File

@ -0,0 +1,350 @@
package message
import (
"bufio"
"fmt"
"io"
"regexp"
"slices"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/mjl-/mox/mlog"
"github.com/mjl-/mox/moxio"
)
// Preview produces a short preview text for a message. It uses the first
// text/plain or text/html part (searching sub parts in order) that yields
// non-empty text. At most 256 characters are returned, which can be more than
// 256 bytes. Callers may want to truncate further and trim trailing whitespace.
//
// Parts with Content-Disposition "attachment" and multipart/encrypted parts
// never contribute to the preview. For multipart/signed, only the first sub part
// is considered.
//
// Problems with invalid messages are logged at debug level and result in an
// empty preview. An error is returned only for serious problems, like i/o errors.
func (p Part) Preview(log mlog.Log) (string, error) {
	// ../rfc/8970:190

	// Attachments are not used for previews. A disposition that cannot be parsed
	// is logged, after which we continue as if it wasn't an attachment.
	disposition, _, dispErr := p.DispositionFilename()
	switch {
	case dispErr != nil:
		log.Debugx("parsing disposition/filename", dispErr)
	case strings.EqualFold(disposition, "attachment"):
		return "", nil
	}

	mediaType := p.MediaType + "/" + p.MediaSubType

	if mediaType == "MULTIPART/ENCRYPTED" {
		return "", nil
	}

	// A part without media type ("/") is handled like text/plain.
	if mediaType == "TEXT/PLAIN" || mediaType == "/" {
		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
		preview, err := previewText(limited)
		if err != nil {
			return "", fmt.Errorf("making preview from text part: %v", err)
		}
		return preview, nil
	}

	if mediaType == "TEXT/HTML" {
		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}

		// First turn the HTML into text. Failure to do so is not a serious error, we
		// just don't have a preview.
		plain, err := previewHTML(limited)
		if err != nil {
			log.Debugx("parsing html part for preview (ignored)", err)
			return "", nil
		}

		// Then turn the text into a preview, like for a text/plain part.
		preview, err := previewText(strings.NewReader(plain))
		if err != nil {
			return "", fmt.Errorf("making preview from text from html: %v", err)
		}
		return preview, nil
	}

	// Any other type: try the sub parts in order, the first non-empty preview or
	// error wins.
	for i := range p.Parts {
		if i >= 1 && mediaType == "MULTIPART/SIGNED" {
			break
		}
		preview, err := p.Parts[i].Preview(log)
		if err != nil || preview != "" {
			return preview, err
		}
	}
	return "", nil
}
// previewText returns text the client can display next to the subject line in a
// mailbox. Quoted text (lines starting with ">"), and any "On ... wrote:" line
// introducing it, is replaced with a single "[...]" line, so only new and useful
// information is displayed. Empty lines are dropped, and a trailing signature is
// not included.
//
// Each line in the result is terminated with a newline. The result is limited to
// 256 characters (not bytes). The returned error is the error from reading r, if
// any; the text gathered so far is returned along with it.
func previewText(r io.Reader) (string, error) {
	// We look quite a bit of lines ahead for trailing signatures with trailing empty
	// lines. Lines are stored with surrounding whitespace removed.
	var lines []string
	scanner := bufio.NewScanner(r)
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped returns whether the line is a marker, written by the author, for
	// removed quoted text.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted returns whether the line after line i, optionally with one empty
	// line in between, is quoted text or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is signature if we see a line with only and minimum 2 dashes, and
	// there are no more empty lines (other than trailing ones), and there are fewer
	// than 5 lines left.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped returns whether the result currently ends with a snip marker
	// line. Lines in result always end with a newline, so both variants of the marker
	// are checked including that newline.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]

		// Quoted text, consecutive quoted lines result in a single marker.
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		if line == "" {
			lines = lines[1:]
			continue
		}

		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]

		// We have enough text for a preview, no need to read more lines.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			// o is the byte offset of character 257, keep everything before it.
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}
// ignoreAtoms is the set of html elements for which any text inside them
// (recursively, so including text in nested elements) is ignored when turning
// HTML into text.
var ignoreAtoms = atomMap(
	atom.Dialog,
	atom.Head,
	atom.Map,
	atom.Math,
	atom.Script,
	atom.Style,
	atom.Svg,
	atom.Template,
)
// inlineAtoms is the set of html elements treated as inline when turning HTML
// into text: no newline is added to the text when such an element ends, and
// whitespace-only text directly inside them is not dropped up front like it is for
// other elements.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
var inlineAtoms = atomMap(
	atom.A,
	atom.Abbr,
	atom.B,
	atom.Bdi,
	atom.Bdo,
	atom.Cite,
	atom.Code,
	atom.Data,
	atom.Dfn,
	atom.Em,
	atom.I,
	atom.Kbd,
	atom.Mark,
	atom.Q,
	atom.Rp,
	atom.Rt,
	atom.Ruby,
	atom.S,
	atom.Samp,
	atom.Small,
	atom.Span,
	atom.Strong,
	atom.Sub,
	atom.Sup,
	atom.Time,
	atom.U,
	atom.Var,
	atom.Wbr,
	atom.Del,
	atom.Ins,
	// Table cells are not inline text elements, but we treat these specially,
	// inserting a space after them instead of a newline.
	atom.Td,
	atom.Th,
)
// atomMap returns a set, as a map with value true for each member, containing
// the given atoms. Lookups of atoms that are not in the set yield false.
func atomMap(l ...atom.Atom) map[atom.Atom]bool {
	set := make(map[atom.Atom]bool, len(l))
	for i := range l {
		set[l[i]] = true
	}
	return set
}
// regexpSpace matches a run of one or more spaces and tabs. Matches are replaced
// with a single space.
var regexpSpace = regexp.MustCompile(`[ \t]+`)

// regexpNewline matches a run of three or more newlines. Matches are replaced
// with a single newline.
var regexpNewline = regexp.MustCompile(`\n\n\n+`)

// regexpZeroWidth matches a run of two or more characters that each are a
// no-break space (U+00A0), zero width space (U+200B), zero width non-joiner
// (U+200C) or zero width joiner (U+200D). Matches are removed: such combinations
// don't make sense and are typically generated.
var regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
// previewHTML parses the HTML read from r and returns its text as plain text,
// suitable as input for previewText.
//
// Text inside elements from ignoreAtoms is skipped. A newline is added after
// each element that is not in inlineAtoms, and a space after table cells. Each
// line of text inside a blockquote is prefixed with "> " for each level of
// nesting, so it is recognized as quoted text. Gathering text stops once about
// 4k bytes have been collected.
//
// An error is returned if the HTML cannot be parsed, or if an error node is
// encountered in the parsed document.
func previewHTML(r io.Reader) (string, error) {
	// Stack/state, based on elements.
	var ignores []bool
	var inlines []bool

	var text string // Collecting text.
	var err error   // Set when walking DOM.
	var quoteLevel int

	// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
	// whether we are in an inline or block element, and building up the text. We stop
	// when we have enough data, returning false in that case.
	var walk func(n *html.Node) bool
	walk = func(n *html.Node) bool {
		switch n.Type {
		case html.ErrorNode:
			err = fmt.Errorf("unexpected error node")
			return false

		case html.ElementNode:
			ignores = append(ignores, ignoreAtoms[n.DataAtom])
			inline := inlineAtoms[n.DataAtom]
			inlines = append(inlines, inline)
			if n.DataAtom == atom.Blockquote {
				quoteLevel++
			}
			// When done with this element (after its children have been walked, or when
			// stopping early), restore state and separate from text that follows.
			defer func() {
				if n.DataAtom == atom.Blockquote {
					quoteLevel--
				}
				if !inline && !strings.HasSuffix(text, "\n\n") {
					text += "\n"
				} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
					text += " "
				}

				ignores = ignores[:len(ignores)-1]
				inlines = inlines[:len(inlines)-1]
			}()

		case html.TextNode:
			// Text is ignored if any of the enclosing elements is ignored.
			if slices.Contains(ignores, true) {
				return true
			}
			// Drop carriage returns and turn tabs into spaces. Newlines are kept.
			var collapsed strings.Builder
			collapsed.Grow(len(n.Data))
			for _, c := range n.Data {
				switch c {
				case '\r':
					// Dropped.
				case '\t':
					collapsed.WriteByte(' ')
				default:
					collapsed.WriteRune(c)
				}
			}
			s := collapsed.String()
			s = regexpSpace.ReplaceAllString(s, " ")
			s = regexpNewline.ReplaceAllString(s, "\n")
			s = regexpZeroWidth.ReplaceAllString(s, "")

			// Whitespace-only text directly in a non-inline element is not content, e.g.
			// formatting between block elements.
			inline := len(inlines) > 0 && inlines[len(inlines)-1]
			ts := strings.TrimSpace(s)
			if !inline && ts == "" {
				break
			}
			if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
				if quoteLevel > 0 {
					// Mark each line as quoted, once per blockquote level.
					q := strings.Repeat("> ", quoteLevel)
					var sb strings.Builder
					for line := range strings.Lines(s) {
						sb.WriteString(q)
						sb.WriteString(line)
					}
					s = sb.String()
				}
				text += s
			}
			// We need to generate at most 256 characters of preview. The text we're gathering
			// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
			// 4k bytes is enough to read.
			if len(text) >= 4*1024 {
				return false
			}
		}
		// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode

		for cn := range n.ChildNodes() {
			if !walk(cn) {
				// Propagate the request to stop to our caller, so the walk ends for the
				// entire document, not just for the remaining children of this node.
				return false
			}
		}

		return true
	}

	node, err := html.Parse(r)
	if err != nil {
		return "", fmt.Errorf("parsing html: %v", err)
	}

	// Build text.
	walk(node)

	text = strings.TrimSpace(text)
	text = regexpSpace.ReplaceAllString(text, " ")
	return text, err
}

159
message/preview_test.go Normal file
View File

@ -0,0 +1,159 @@
package message
import (
"bytes"
"fmt"
"io"
"log/slog"
"mime/multipart"
"net/textproto"
"strings"
"testing"
"github.com/mjl-/mox/mlog"
)
// TestPreviewText checks the preview generated by previewText for plain text
// bodies: handling of quoted text, snip markers written by the author, "On ...
// wrote:" lines and signatures.
func TestPreviewText(t *testing.T) {
	cases := []struct {
		body string // Input text.
		exp  string // Expected preview.
	}{
		{"", ""},
		{"single line", "single line\n"},
		{"single line\n", "single line\n"},
		{"> quoted\n", "[...]\n"},
		{"> quoted\nresponse\n", "[...]\nresponse\n"},
		{"> quoted\n[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
		{"[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
		{"[…]\nresponse after author snip\n", "[…]\nresponse after author snip\n"},
		{">> quoted0\n> quoted1\n>quoted2\n[...]\nresponse after author snip\n", "[...]\nresponse after author snip\n"},
		{">quoted\n\n>quoted\ncoalesce line-separated quotes\n", "[...]\ncoalesce line-separated quotes\n"},
		{"On <date> <user> wrote:\n> hi\nresponse", "[...]\nresponse\n"},
		{"On <longdate>\n<user> wrote:\n> hi\nresponse", "[...]\nresponse\n"},
		{"> quote\nresponse\n--\nsignature\n", "[...]\nresponse\n"},
		{"> quote\nline1\nline2\nline3\n", "[...]\nline1\nline2\nline3\n"},
	}

	for _, tc := range cases {
		got, err := previewText(strings.NewReader(tc.body))
		tcompare(t, err, nil)
		if got != tc.exp {
			t.Fatalf("got %q, expected %q, for body %q", got, tc.exp, tc.body)
		}
	}
}
// tcompose composes a message for use in tests and returns a reader for the
// composed bytes. The typeContents are pairs of arguments for the composer's
// TextPart method; the tests in this file pass a subtype like "plain" or "html"
// followed by the text of the part. A single pair becomes the body of the message
// itself. With more than one pair, the parts are added to a
// multipart/alternative. Errors are checked with tcheck.
func tcompose(t *testing.T, typeContents ...string) *bytes.Reader {
	var b bytes.Buffer

	xc := NewComposer(&b, 100*1024, true)
	xc.Header("MIME-Version", "1.0")

	// cur is the multipart new parts are added to, nil while we are still at the
	// top-level of the message. alt is the multipart/alternative, if we created one,
	// to be closed at the end.
	var cur, alt *multipart.Writer

	// xcreateMultipart starts a new multipart of the given subtype and makes it
	// current: at the top-level by writing the message Content-Type header, otherwise
	// by creating a part with that Content-Type in the current multipart.
	xcreateMultipart := func(subtype string) *multipart.Writer {
		mp := multipart.NewWriter(xc)
		if cur == nil {
			xc.Header("Content-Type", fmt.Sprintf(`multipart/%s; boundary="%s"`, subtype, mp.Boundary()))
			xc.Line()
		} else {
			_, err := cur.CreatePart(textproto.MIMEHeader{"Content-Type": []string{fmt.Sprintf(`multipart/%s; boundary="%s"`, subtype, mp.Boundary())}})
			tcheck(t, err, "adding multipart")
		}
		cur = mp
		return mp
	}

	// xcreatePart returns a writer for the body of a new part with the given headers.
	// Without a current multipart, the headers are written as message headers and the
	// body is written to the composer directly.
	xcreatePart := func(header textproto.MIMEHeader) io.Writer {
		if cur == nil {
			for k, vl := range header {
				for _, v := range vl {
					xc.Header(k, v)
				}
			}
			xc.Line()
			return xc
		}
		p, err := cur.CreatePart(header)
		tcheck(t, err, "adding part")
		return p
	}

	// Only with more than one part do we need a multipart.
	if len(typeContents)/2 > 1 {
		alt = xcreateMultipart("alternative")
	}
	for i := 0; i < len(typeContents); i += 2 {
		// TextPart gives us the body to write, along with the content-type and
		// content-transfer-encoding header values to use for the part.
		body, ct, cte := xc.TextPart(typeContents[i], typeContents[i+1])
		tp := xcreatePart(textproto.MIMEHeader{"Content-Type": []string{ct}, "Content-Transfer-Encoding": []string{cte}})
		_, err := tp.Write([]byte(body))
		tcheck(t, err, "write part")
	}
	if alt != nil {
		// Closing writes the final boundary.
		err := alt.Close()
		tcheck(t, err, "close multipart")
	}
	xc.Flush()

	buf := b.Bytes()
	return bytes.NewReader(buf)
}
// TestPreviewHTML checks previews generated by Part.Preview for messages
// composed with tcompose: which part is used for the preview, and how HTML is
// turned into preview text.
func TestPreviewHTML(t *testing.T) {
	// check parses and walks the message in r, and compares its preview with exp.
	check := func(r *bytes.Reader, exp string) {
		t.Helper()

		p, err := Parse(slog.Default(), false, r)
		tcheck(t, err, "parse")
		// NOTE(review): presumably Walk parses all (nested) parts, needed before Preview
		// can read them; confirm against Part.Walk.
		err = p.Walk(slog.Default(), nil)
		tcheck(t, err, "walk")
		log := mlog.New("message", nil)
		s, err := p.Preview(log)
		tcheck(t, err, "preview")
		tcompare(t, s, exp)
	}

	// We use the first part for the preview.
	m := tcompose(t, "plain", "the text", "html", "<html><body>the html</body></html>")
	check(m, "the text\n")

	// HTML before text.
	m = tcompose(t, "html", "<body>the html</body>", "plain", "the text")
	check(m, "the html\n")

	// Only text.
	m = tcompose(t, "plain", "the text")
	check(m, "the text\n")

	// Only html.
	m = tcompose(t, "html", "<body>the html</body>")
	check(m, "the html\n")

	// No preview
	m = tcompose(t, "other", "other text")
	check(m, "")

	// HTML with quoted text. The line introducing the quote and the blockquote itself
	// are replaced by a single "[...]".
	m = tcompose(t, "html", "<html><div>On ... someone wrote:</div><blockquote>something worth replying</blockquote><div>agreed</div></body>")
	check(m, "[...]\nagreed\n")

	// HTML with ignored elements, inline elements and tables.
	const moreHTML = `<!doctype html>
<html>
<head>
<title>title</title>
<style>head style</style>
<script>head script</script>
</head>
<body>
<script>body script</script>
<style>body style</style>
<div>line1</div>
<div>line2</div>
<div><a href="about:blank">link1 </a> text <span>word</span><span>word2</span>.</div>
<table><tr><td>col1</td><th>col2</th></tr><tr><td>row2</td></tr></table>
</body></html>
`
	m = tcompose(t, "html", moreHTML)
	check(m, `line1
line2
link1 text wordword2.
col1 col2
row2
`)
}