mirror of
https://github.com/mjl-/mox.git
synced 2025-06-28 01:48:15 +03:00

We were already generating previews of plain text parts for the webmail interface, but we didn't store them, so we were generating the previews each time messages were listed. Now we store previews in the database for faster handling. And we also generate previews for html parts if needed. We use the first part that has textual content. For IMAP, the previews can be requested by an IMAP client. When we get the "LAZY" variant, which doesn't require us to generate a preview, we generate it anyway, because it should be fast enough. That way we don't make clients first ask for "PREVIEW (LAZY)" and then send another request for "PREVIEW". We now also generate a preview when a message is added to the account, except for imports: it would slow us down, the previews aren't urgent, and they will be generated on demand at the first request.
351 lines
8.7 KiB
Go
351 lines
8.7 KiB
Go
package message
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"io"
|
|
"regexp"
|
|
"slices"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
|
|
"github.com/mjl-/mox/mlog"
|
|
"github.com/mjl-/mox/moxio"
|
|
)
|
|
|
|
// Preview returns a message preview, based on the first text/plain or text/html
|
|
// part of the message that has textual content. Preview returns at most 256
|
|
// characters (possibly more bytes). Callers may want to truncate and trim trailing
|
|
// whitespace before using the preview.
|
|
//
|
|
// Preview logs at debug level for invalid messages. An error is only returned for
|
|
// serious errors, like i/o errors.
|
|
func (p Part) Preview(log mlog.Log) (string, error) {
|
|
// ../rfc/8970:190
|
|
|
|
// Don't use if Content-Disposition attachment.
|
|
disp, _, err := p.DispositionFilename()
|
|
if err != nil {
|
|
log.Debugx("parsing disposition/filename", err)
|
|
} else if strings.EqualFold(disp, "attachment") {
|
|
return "", nil
|
|
}
|
|
|
|
mt := p.MediaType + "/" + p.MediaSubType
|
|
switch mt {
|
|
case "TEXT/PLAIN", "/":
|
|
r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
|
|
s, err := previewText(r)
|
|
if err != nil {
|
|
return "", fmt.Errorf("making preview from text part: %v", err)
|
|
}
|
|
return s, nil
|
|
|
|
case "TEXT/HTML":
|
|
r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}
|
|
|
|
// First turn the HTML into text.
|
|
s, err := previewHTML(r)
|
|
if err != nil {
|
|
log.Debugx("parsing html part for preview (ignored)", err)
|
|
return "", nil
|
|
}
|
|
|
|
// Turn text body into a preview text.
|
|
s, err = previewText(strings.NewReader(s))
|
|
if err != nil {
|
|
return "", fmt.Errorf("making preview from text from html: %v", err)
|
|
}
|
|
return s, nil
|
|
|
|
case "MULTIPART/ENCRYPTED":
|
|
return "", nil
|
|
}
|
|
|
|
for i, sp := range p.Parts {
|
|
if mt == "MULTIPART/SIGNED" && i >= 1 {
|
|
break
|
|
}
|
|
s, err := sp.Preview(log)
|
|
if err != nil || s != "" {
|
|
return s, err
|
|
}
|
|
}
|
|
return "", nil
|
|
}
|
|
|
|
// previewText returns text the client can display next to the subject line in a
// mailbox listing. Lines are read from r and whitespace-trimmed. Empty lines are
// dropped. Quoted lines (starting with ">") and an introducing "On ... wrote:"
// line are replaced by a single "[...]" line, so mostly new information is
// shown. A trailing signature (a line of only dashes, at least two, followed by
// fewer than 5 non-empty lines) is not included.
//
// The result consists of newline-terminated lines and is limited to 256
// characters (not bytes). The result is returned together with any error the
// line scanner encountered while reading r.
func previewText(r io.Reader) (string, error) {
	// We keep up to 10 lines buffered, so we can look ahead for quoted text and for
	// a trailing signature with trailing empty lines.
	var lines []string
	scanner := bufio.NewScanner(r)
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped returns whether s is a marker commonly written by an author to
	// indicate text was left out.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted returns whether the line after lines[i], skipping at most one
	// empty line, is quoted text or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is a signature if the current line consists of only dashes, at least
	// 2, and after dropping trailing empty lines fewer than 5 buffered lines follow,
	// none of them empty. Lines are already trimmed when buffered.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(lines[0], "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped returns whether the last line added to result is a snip marker.
	// Each line in result is terminated by a newline, so both markers must be matched
	// including that newline. Otherwise an author-written "[…]" followed by quoted
	// text would get a second, duplicate "[...]" marker.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line, i.e. spread over the
	// first two lines, followed by quoted text.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]

		// Quoted text is collapsed into a single snip marker.
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		if line == "" {
			lines = lines[1:]
			continue
		}

		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]

		// Stop reading once we are near the limit. This counts bytes, the exact limit in
		// characters is applied below.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			// o is the byte offset of character 257, cut right before it.
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}
|
|
|
|
// Any text inside these html elements (recursively) is ignored.
|
|
var ignoreAtoms = atomMap(
|
|
atom.Dialog,
|
|
atom.Head,
|
|
atom.Map,
|
|
atom.Math,
|
|
atom.Script,
|
|
atom.Style,
|
|
atom.Svg,
|
|
atom.Template,
|
|
)
|
|
|
|
// Inline elements don't force newlines at beginning & end of text in this element.
|
|
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
|
|
var inlineAtoms = atomMap(
|
|
atom.A,
|
|
atom.Abbr,
|
|
atom.B,
|
|
atom.Bdi,
|
|
atom.Bdo,
|
|
atom.Cite,
|
|
atom.Code,
|
|
atom.Data,
|
|
atom.Dfn,
|
|
atom.Em,
|
|
atom.I,
|
|
atom.Kbd,
|
|
atom.Mark,
|
|
atom.Q,
|
|
atom.Rp,
|
|
atom.Rt,
|
|
atom.Ruby,
|
|
atom.S,
|
|
atom.Samp,
|
|
atom.Small,
|
|
atom.Span,
|
|
atom.Strong,
|
|
atom.Sub,
|
|
atom.Sup,
|
|
atom.Time,
|
|
atom.U,
|
|
atom.Var,
|
|
atom.Wbr,
|
|
|
|
atom.Del,
|
|
atom.Ins,
|
|
|
|
// We treat these specially, inserting a space after them instead of a newline.
|
|
atom.Td,
|
|
atom.Th,
|
|
)
|
|
|
|
func atomMap(l ...atom.Atom) map[atom.Atom]bool {
|
|
m := map[atom.Atom]bool{}
|
|
for _, a := range l {
|
|
m[a] = true
|
|
}
|
|
return m
|
|
}
|
|
|
|
// Patterns for cleaning up whitespace-like characters in text taken from html.
var (
	// Run of spaces and/or tabs. Replaced with single space.
	regexpSpace = regexp.MustCompile(`[ \t]+`)

	// Three or more consecutive newlines. Replaced with single newline.
	regexpNewline = regexp.MustCompile(`\n\n\n+`)

	// Two or more consecutive characters out of U+00A0, U+200B, U+200C and U+200D.
	// Removed, combinations don't make sense, generated.
	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
)
|
|
|
|
func previewHTML(r io.Reader) (string, error) {
|
|
// Stack/state, based on elements.
|
|
var ignores []bool
|
|
var inlines []bool
|
|
|
|
var text string // Collecting text.
|
|
var err error // Set when walking DOM.
|
|
var quoteLevel int
|
|
|
|
// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
|
|
// whether we are in an inline or block element, and building up the text. We stop
|
|
// when we have enough data, returning false in that case.
|
|
var walk func(n *html.Node) bool
|
|
walk = func(n *html.Node) bool {
|
|
switch n.Type {
|
|
case html.ErrorNode:
|
|
err = fmt.Errorf("unexpected error node")
|
|
return false
|
|
|
|
case html.ElementNode:
|
|
ignores = append(ignores, ignoreAtoms[n.DataAtom])
|
|
inline := inlineAtoms[n.DataAtom]
|
|
inlines = append(inlines, inline)
|
|
if n.DataAtom == atom.Blockquote {
|
|
quoteLevel++
|
|
}
|
|
defer func() {
|
|
if n.DataAtom == atom.Blockquote {
|
|
quoteLevel--
|
|
}
|
|
if !inline && !strings.HasSuffix(text, "\n\n") {
|
|
text += "\n"
|
|
} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
|
|
text += " "
|
|
}
|
|
|
|
ignores = ignores[:len(ignores)-1]
|
|
inlines = inlines[:len(inlines)-1]
|
|
}()
|
|
|
|
case html.TextNode:
|
|
if slices.Contains(ignores, true) {
|
|
return true
|
|
}
|
|
// Collapse all kinds of weird whitespace-like characters into a space, except for newline and ignoring carriage return.
|
|
var s string
|
|
for _, c := range n.Data {
|
|
if c == '\r' {
|
|
continue
|
|
} else if c == '\t' {
|
|
s += " "
|
|
} else {
|
|
s += string(c)
|
|
}
|
|
}
|
|
s = regexpSpace.ReplaceAllString(s, " ")
|
|
s = regexpNewline.ReplaceAllString(s, "\n")
|
|
s = regexpZeroWidth.ReplaceAllString(s, "")
|
|
|
|
inline := len(inlines) > 0 && inlines[len(inlines)-1]
|
|
ts := strings.TrimSpace(s)
|
|
if !inline && ts == "" {
|
|
break
|
|
}
|
|
if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
|
|
if quoteLevel > 0 {
|
|
q := strings.Repeat("> ", quoteLevel)
|
|
var sb strings.Builder
|
|
for line := range strings.Lines(s) {
|
|
sb.WriteString(q)
|
|
sb.WriteString(line)
|
|
}
|
|
s = sb.String()
|
|
}
|
|
text += s
|
|
}
|
|
// We need to generate at most 256 characters of preview. The text we're gathering
|
|
// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
|
|
// 4k bytes is enough to read.
|
|
if len(text) >= 4*1024 {
|
|
return false
|
|
}
|
|
}
|
|
// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode
|
|
|
|
for cn := range n.ChildNodes() {
|
|
if !walk(cn) {
|
|
break
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
node, err := html.Parse(r)
|
|
if err != nil {
|
|
return "", fmt.Errorf("parsing html: %v", err)
|
|
}
|
|
|
|
// Build text.
|
|
walk(node)
|
|
|
|
text = strings.TrimSpace(text)
|
|
text = regexpSpace.ReplaceAllString(text, " ")
|
|
return text, err
|
|
}
|