mox/message/preview.go
Mechiel Lukkien aa631c604c
imapserver: implement PREVIEW extension (RFC 8970), and store previews in message database
We were already generating previews of plain text parts for the webmail
interface, but we didn't store them, so were generating the previews each time
messages were listed.

Now we store previews in the database for faster handling. And we also generate
previews for html parts if needed. We use the first part that has textual
content.

For IMAP, the previews can be requested by an IMAP client. When we get the
"LAZY" variant, which doesn't require us to generate a preview, we generate it
anyway, because it should be fast enough. So don't make clients first ask for
"PREVIEW (LAZY)" and then again a request for "PREVIEW".

We now also generate a preview when a message is added to the account. Except
for imports. It would slow us down, the previews aren't urgent, and they will
be generated on-demand at first-request.
2025-03-28 17:10:17 +01:00

351 lines
8.7 KiB
Go

package message
import (
"bufio"
"fmt"
"io"
"regexp"
"slices"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/mjl-/mox/mlog"
"github.com/mjl-/mox/moxio"
)
// Preview returns a message preview, based on the first text/plain or text/html
// part of the message that has textual content. Preview returns at most 256
// characters (possibly more bytes). Callers may want to truncate and trim trailing
// whitespace before using the preview.
//
// Parts with Content-Disposition "attachment" and multipart/encrypted parts never
// yield a preview. For other parts that are not text/plain or text/html, the
// subparts are tried in order, and the first non-empty preview is returned. For
// multipart/signed, only the first subpart is tried.
//
// Preview logs at debug level for invalid messages. An error is only returned for
// serious errors, like i/o errors. Returned errors wrap the underlying error, so
// callers can inspect them with errors.Is and errors.As.
func (p Part) Preview(log mlog.Log) (string, error) {
	// ../rfc/8970:190

	// Don't use if Content-Disposition attachment. A disposition that fails to parse
	// is only logged, the part is still considered for a preview.
	disp, _, err := p.DispositionFilename()
	if err != nil {
		log.Debugx("parsing disposition/filename", err)
	} else if strings.EqualFold(disp, "attachment") {
		return "", nil
	}

	// The comparisons below are against upper-case values, presumably MediaType and
	// MediaSubType are stored upper-cased by the parser (verify against Part).
	mt := p.MediaType + "/" + p.MediaSubType
	switch mt {
	case "TEXT/PLAIN", "/":
		// "/" means both media type and subtype are empty, we treat that as plain text.
		r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
		s, err := previewText(r)
		if err != nil {
			return "", fmt.Errorf("making preview from text part: %w", err)
		}
		return s, nil

	case "TEXT/HTML":
		r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}

		// First turn the HTML into text. Failures are treated as a problem with the
		// message: logged, and resulting in an empty preview instead of an error.
		s, err := previewHTML(r)
		if err != nil {
			log.Debugx("parsing html part for preview (ignored)", err)
			return "", nil
		}

		// Turn text body into a preview text.
		s, err = previewText(strings.NewReader(s))
		if err != nil {
			return "", fmt.Errorf("making preview from text from html: %w", err)
		}
		return s, nil

	case "MULTIPART/ENCRYPTED":
		return "", nil
	}

	for i, sp := range p.Parts {
		// For multipart/signed, only the first subpart is considered.
		if mt == "MULTIPART/SIGNED" && i >= 1 {
			break
		}
		s, err := sp.Preview(log)
		if err != nil || s != "" {
			return s, err
		}
	}
	return "", nil
}
// previewText returns text the client can display next to the subject line in a
// mailbox. It replaces quoted text, and any prefixing "On ... wrote:" line, with
// "[...]" so only new and useful information will be displayed. Trailing
// signatures are not included. Lines are whitespace-trimmed, empty lines are
// dropped, and each line in the result ends with a newline.
//
// The result has at most 256 characters (possibly more bytes). Lines of any
// length are accepted. An error is only returned if reading from r fails, in
// which case the text gathered so far is returned with it.
func previewText(r io.Reader) (string, error) {
	// We look quite a few lines ahead, to recognize trailing signatures that are
	// followed by empty lines.
	var lines []string

	// We read lines with a bufio.Reader instead of a bufio.Scanner: a Scanner fails
	// with bufio.ErrTooLong on lines longer than its maximum token size (64 KiB by
	// default), and a single long line is not a reason to fail making a preview.
	br := bufio.NewReader(r)
	var readErr error // Set once reading stops, io.EOF at regular end of input.
	ensureLines := func() {
		for len(lines) < 10 && readErr == nil {
			var s string
			s, readErr = br.ReadString('\n')
			// A final line without newline is returned together with the error, we still
			// use it. An empty string means there was no more data at all.
			if s != "" {
				lines = append(lines, strings.TrimSpace(s))
			}
		}
	}
	ensureLines()

	// isSnipped returns whether s is a marker for removed text, as we or authors write it.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted returns whether the line after index i is quoted or snipped,
	// skipping over at most one empty line in between.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is signature if we see a line with only and minimum 2 dashes, and,
	// ignoring trailing empty lines, fewer than 5 lines follow (of the lines we have
	// looked ahead), none of them empty.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped returns whether the result currently ends with a snip marker.
	// Lines are always added to result with a trailing newline, so both markers must
	// be matched including that newline.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]

		// Quoted text is replaced by a single snip marker.
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		if line == "" {
			lines = lines[1:]
			continue
		}

		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]

		// Enough text gathered (in bytes), the exact limit is applied below.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			// o is the byte offset of the 257th character.
			result = result[:o]
			break
		}
	}

	// Reaching the end of the input is not an error.
	if readErr == io.EOF {
		readErr = nil
	}
	return result, readErr
}
// ignoreAtoms holds the html elements whose text, including text in nested
// elements, is left out when turning html into text.
var ignoreAtoms = atomMap(
	atom.Dialog, atom.Head, atom.Map, atom.Math,
	atom.Script, atom.Style, atom.Svg, atom.Template,
)
// inlineAtoms holds the html elements that are treated as inline: the end of
// such an element does not cause a newline to be added to the text.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
var inlineAtoms = atomMap(
	// Inline text semantics, and edit markers (del, ins).
	atom.A, atom.Abbr, atom.B, atom.Bdi, atom.Bdo,
	atom.Cite, atom.Code, atom.Data, atom.Del, atom.Dfn,
	atom.Em, atom.I, atom.Ins, atom.Kbd, atom.Mark,
	atom.Q, atom.Rp, atom.Rt, atom.Ruby, atom.S,
	atom.Samp, atom.Small, atom.Span, atom.Strong, atom.Sub,
	atom.Sup, atom.Time, atom.U, atom.Var, atom.Wbr,

	// Table cells are handled specially: a space is inserted after them instead of
	// a newline.
	atom.Td, atom.Th,
)
// atomMap returns a map with a true value for each of the given atoms, for use
// as a set. Lookups of atoms not in l yield false.
func atomMap(l ...atom.Atom) map[atom.Atom]bool {
	set := make(map[atom.Atom]bool, len(l))
	for i := range l {
		set[l[i]] = true
	}
	return set
}
// Patterns for cleaning up whitespace in text taken from html.
var (
	// One or more spaces and/or tabs. Replaced with single space.
	regexpSpace = regexp.MustCompile(`[ \t]+`)

	// Three or more consecutive newlines. Replaced with single newline.
	regexpNewline = regexp.MustCompile(`\n\n\n+`)

	// Two or more consecutive characters out of no-break space (U+00A0), zero width
	// space (U+200B), zero width non-joiner (U+200C) and zero width joiner (U+200D).
	// Removed, combinations don't make sense, generated.
	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
)
// previewHTML parses html from r and returns text from it, suitable as input for
// previewText: text in ignored elements (like script and style) is left out, the
// end of a non-inline element adds a newline, table cells are followed by a space,
// and text inside blockquote elements gets a "> " prefix per nesting level.
//
// Gathering text stops once at least 4 KiB has been collected, the remainder of
// the document is not processed. An error is returned if parsing fails or an error
// node is found in the parsed document.
func previewHTML(r io.Reader) (string, error) {
	// Stack/state, based on elements.
	var ignores []bool
	var inlines []bool

	var text string   // Collecting text.
	var walkErr error // Set when walking DOM.
	var quoteLevel int

	// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
	// whether we are in an inline or block element, and building up the text. We stop
	// when we have enough data, returning false in that case.
	var walk func(n *html.Node) bool
	walk = func(n *html.Node) bool {
		switch n.Type {
		case html.ErrorNode:
			walkErr = fmt.Errorf("unexpected error node")
			return false

		case html.ElementNode:
			ignores = append(ignores, ignoreAtoms[n.DataAtom])
			inline := inlineAtoms[n.DataAtom]
			inlines = append(inlines, inline)
			if n.DataAtom == atom.Blockquote {
				quoteLevel++
			}
			// Restore state when done with this element and its children, also when we
			// are stopping early.
			defer func() {
				if n.DataAtom == atom.Blockquote {
					quoteLevel--
				}
				if !inline && !strings.HasSuffix(text, "\n\n") {
					text += "\n"
				} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
					text += " "
				}

				ignores = ignores[:len(ignores)-1]
				inlines = inlines[:len(inlines)-1]
			}()

		case html.TextNode:
			// Text is dropped if any enclosing element is ignored.
			if slices.Contains(ignores, true) {
				return true
			}
			// Drop carriage returns and turn tabs into spaces. We use a builder, text nodes
			// can be large and appending to a string per character takes quadratic time.
			var sb strings.Builder
			sb.Grow(len(n.Data))
			for _, c := range n.Data {
				switch c {
				case '\r':
					// Ignored.
				case '\t':
					sb.WriteByte(' ')
				default:
					sb.WriteRune(c)
				}
			}
			s := sb.String()
			s = regexpSpace.ReplaceAllString(s, " ")
			s = regexpNewline.ReplaceAllString(s, "\n")
			s = regexpZeroWidth.ReplaceAllString(s, "")

			inline := len(inlines) > 0 && inlines[len(inlines)-1]
			ts := strings.TrimSpace(s)
			if !inline && ts == "" {
				// Whitespace-only text directly in a block element is dropped. The break leaves
				// the switch, we continue below.
				break
			}
			// NOTE(review): the suffix checks are on s, the text of this node, not on the
			// text gathered so far. Confirm this is intended.
			if ts != "" || (!strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n")) {
				if quoteLevel > 0 {
					// Prefix each line with quoting, so previewText recognizes it as quoted.
					q := strings.Repeat("> ", quoteLevel)
					var qb strings.Builder
					for line := range strings.Lines(s) {
						qb.WriteString(q)
						qb.WriteString(line)
					}
					s = qb.String()
				}
				text += s
			}
			// We need to generate at most 256 characters of preview. The text we're gathering
			// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
			// 4k bytes is enough to read.
			if len(text) >= 4*1024 {
				return false
			}
		}
		// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode

		for cn := range n.ChildNodes() {
			if !walk(cn) {
				// Propagate the request to stop to our caller, so the entire walk ends instead
				// of only the handling of the remaining siblings.
				return false
			}
		}

		return true
	}

	node, err := html.Parse(r)
	if err != nil {
		return "", fmt.Errorf("parsing html: %w", err)
	}

	// Build text.
	walk(node)

	text = strings.TrimSpace(text)
	text = regexpSpace.ReplaceAllString(text, " ")
	return text, walkErr
}