mox/message/preview.go

358 lines
8.8 KiB
Go

package message
import (
"bufio"
"fmt"
"io"
"regexp"
"slices"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
"github.com/mjl-/mox/mlog"
"github.com/mjl-/mox/moxio"
)
// Preview returns a preview for the message, taken from the first text/plain or
// text/html part that yields textual content. The preview holds at most 256
// characters (which can be more bytes). Callers may want to truncate and trim
// trailing whitespace before using the preview.
//
// Parts with content-disposition "attachment" and multipart/encrypted parts never
// contribute to the preview. For multipart/signed, only the first subpart is
// considered. A part with an empty media type and subtype is handled like
// text/plain.
//
// Problems with invalid messages are logged at debug level. An error is only
// returned for serious errors, like i/o errors.
func (p Part) Preview(log mlog.Log) (string, error) {
	// ../rfc/8970:190
	// Attachments are never used for the preview. A disposition that cannot be parsed
	// is logged and otherwise ignored.
	if disp, _, err := p.DispositionFilename(); err != nil {
		log.Debugx("parsing disposition/filename", err)
	} else if strings.EqualFold(disp, "attachment") {
		return "", nil
	}

	mediaType := p.MediaType + "/" + p.MediaSubType

	if mediaType == "MULTIPART/ENCRYPTED" {
		return "", nil
	}

	if mediaType == "TEXT/PLAIN" || mediaType == "/" {
		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
		preview, err := previewText(limited)
		if err != nil {
			return "", fmt.Errorf("making preview from text part: %v", err)
		}
		return preview, nil
	}

	if mediaType == "TEXT/HTML" {
		limited := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}

		// Convert the HTML to text first. Failure to do so is not an error for the caller.
		text, err := previewHTML(limited)
		if err != nil {
			log.Debugx("parsing html part for preview (ignored)", err)
			return "", nil
		}

		// Then make a preview of that text, as for a text/plain part.
		preview, err := previewText(strings.NewReader(text))
		if err != nil {
			return "", fmt.Errorf("making preview from text from html: %v", err)
		}
		return preview, nil
	}

	// Any other type: use the first subpart that results in a preview or an error.
	for i, sub := range p.Parts {
		if i >= 1 && mediaType == "MULTIPART/SIGNED" {
			break
		}
		preview, err := sub.Preview(log)
		if preview != "" || err != nil {
			return preview, err
		}
	}
	return "", nil
}
// previewText returns a line the client can display next to the subject line
// in a mailbox. It will replace quoted text, and any prefixing "On ... wrote:"
// line with "[...]" so only new and useful information will be displayed.
// Trailing signatures are not included.
//
// Lines are read from r, trimmed of surrounding whitespace, and empty lines are
// dropped. Each line kept in the result is terminated with a newline. The result
// is limited to 256 characters (not bytes). The returned error is the error of
// the line scanner reading from r, if any.
func previewText(r io.Reader) (string, error) {
	// We look quite a bit of lines ahead for trailing signatures with trailing empty lines.
	var lines []string

	// NOTE(review): a bufio.Scanner with default settings fails with bufio.ErrTooLong
	// on lines longer than 64 KiB, which is then returned as error from this function.
	// Confirm that is the desired behaviour for messages with very long lines.
	scanner := bufio.NewScanner(r)

	// ensureLines tops up the lookahead buffer to 10 lines, as far as input is available.
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped returns whether s is a line with which an author marks removed text.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted returns whether the line after index i, optionally with one empty
	// line in between, is quoted text or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is signature if we see a line with only and minimum 2 dashes, and
	// there are no more empty lines (other than trailing ones), and fewer than 5 lines
	// follow in the lookahead buffer.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped returns whether the result currently ends with a snip marker
	// line. Every line in result ends with a newline, so both markers must be matched
	// including that newline, otherwise an author-written "[…]" followed by quoted
	// text would result in a duplicate marker.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]

		// Quoted text is replaced by a single snip marker.
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}
		if line == "" {
			lines = lines[1:]
			continue
		}

		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]

		// We have gathered enough for a preview, it is truncated below.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	// Ranging over a string yields the byte offset of each character, so we cut at the
	// offset of the 257th character.
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}
// Elements of which the text, including the text of all nested elements, is left
// out when collecting text from HTML.
var ignoreAtoms = atomMap(
	atom.Dialog, atom.Head, atom.Map, atom.Math,
	atom.Script, atom.Style, atom.Svg, atom.Template,
)
// Elements treated as inline when collecting text from HTML: no newline is forced
// after such an element, unlike for all other (block) elements.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
var inlineAtoms = atomMap(
	// Inline text semantics.
	atom.A, atom.Abbr, atom.B, atom.Bdi, atom.Bdo, atom.Cite, atom.Code,
	atom.Data, atom.Dfn, atom.Em, atom.I, atom.Kbd, atom.Mark, atom.Q,
	atom.Rp, atom.Rt, atom.Ruby, atom.S, atom.Samp, atom.Small, atom.Span,
	atom.Strong, atom.Sub, atom.Sup, atom.Time, atom.U, atom.Var, atom.Wbr,

	// Edits.
	atom.Del, atom.Ins,

	// Table cells get special treatment: a space is inserted after them instead of a
	// newline.
	atom.Td, atom.Th,
)
// atomMap returns a set of the given atoms: each atom in l maps to true, so a
// lookup of an atom that is not in l yields false.
func atomMap(l ...atom.Atom) map[atom.Atom]bool {
	set := make(map[atom.Atom]bool, len(l))
	for i := range l {
		set[l[i]] = true
	}
	return set
}
// Patterns for cleaning up whitespace in text gathered from HTML.
var (
	// Runs of spaces and tabs, replaced with a single space.
	regexpSpace = regexp.MustCompile(`[ \t]+`)

	// Three or more consecutive newlines, replaced with a single newline.
	regexpNewline = regexp.MustCompile(`\n\n\n+`)

	// Two or more consecutive non-breaking spaces and/or zero-width characters. These
	// are removed: such combinations don't make sense and are generated.
	regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+")
)
// previewHTML parses r as HTML and returns the text found in it, intended to be
// turned into a preview by previewText.
//
// Text inside elements from ignoreAtoms is skipped. A newline is added after
// elements that are not in inlineAtoms, and a space after table cells. Each line
// of text inside blockquote elements is prefixed with "> ", once per nesting
// level, so it is recognized as quoted text.
//
// Walking the document stops as soon as at least 4k bytes of text have been
// gathered. An error is returned if the HTML cannot be parsed, or if an error
// node is encountered while walking the document.
func previewHTML(r io.Reader) (string, error) {
	// Stack/state, based on elements.
	var ignores []bool
	var inlines []bool

	var text string // Collecting text.
	var err error   // Set when walking DOM.
	var quoteLevel int

	// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
	// whether we are in an inline or block element, and building up the text. We stop
	// when we have enough data, returning false in that case.
	var walk func(n *html.Node) bool
	walk = func(n *html.Node) bool {
		switch n.Type {
		case html.ErrorNode:
			err = fmt.Errorf("unexpected error node")
			return false

		case html.ElementNode:
			ignores = append(ignores, ignoreAtoms[n.DataAtom])
			inline := inlineAtoms[n.DataAtom]
			inlines = append(inlines, inline)
			if n.DataAtom == atom.Blockquote {
				quoteLevel++
			}

			// Restore state and add separating whitespace when done with this element and
			// its children, also when stopping early.
			defer func() {
				if n.DataAtom == atom.Blockquote {
					quoteLevel--
				}
				if !inline && !strings.HasSuffix(text, "\n\n") {
					text += "\n"
				} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
					text += " "
				}

				ignores = ignores[:len(ignores)-1]
				inlines = inlines[:len(inlines)-1]
			}()

		case html.TextNode:
			if slices.Contains(ignores, true) {
				return true
			}

			// Collapse all kinds of weird whitespace-like characters into a space, except for
			// newline and ignoring carriage return. A text node can be large, so we use a
			// builder instead of repeated string concatenation, which would take time
			// quadratic in the size of the text.
			var clean strings.Builder
			clean.Grow(len(n.Data))
			for _, c := range n.Data {
				if c == '\r' {
					continue
				} else if c == '\t' {
					clean.WriteByte(' ')
				} else {
					clean.WriteRune(c)
				}
			}
			s := clean.String()
			s = regexpSpace.ReplaceAllString(s, " ")
			s = regexpNewline.ReplaceAllString(s, "\n")
			s = regexpZeroWidth.ReplaceAllString(s, "")

			inline := len(inlines) > 0 && inlines[len(inlines)-1]
			ts := strings.TrimSpace(s)
			if !inline && ts == "" {
				// Whitespace-only text directly in a block element is dropped.
				break
			}
			// NOTE(review): the trailing whitespace checks below are on s, the text of this
			// node, not on the text gathered so far. Confirm this is as intended.
			if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
				if quoteLevel > 0 {
					// Prefix each line of the text with quote markers.
					q := strings.Repeat("> ", quoteLevel)
					var sb strings.Builder
					for s != "" {
						o := strings.IndexByte(s, '\n')
						if o < 0 {
							o = len(s)
						} else {
							o++
						}
						sb.WriteString(q)
						sb.WriteString(s[:o])
						s = s[o:]
					}
					s = sb.String()
				}
				text += s
			}
			// We need to generate at most 256 characters of preview. The text we're gathering
			// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
			// 4k bytes is enough to read.
			if len(text) >= 4*1024 {
				return false
			}
		}
		// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode

		for cn := range n.ChildNodes() {
			// Propagate a stop from a child to our caller, so the entire walk ends instead of
			// continuing with the siblings of our ancestors.
			if !walk(cn) {
				return false
			}
		}

		return true
	}

	node, err := html.Parse(r)
	if err != nil {
		return "", fmt.Errorf("parsing html: %v", err)
	}

	// Build text.
	walk(node)

	text = strings.TrimSpace(text)
	text = regexpSpace.ReplaceAllString(text, " ")
	return text, err
}