package message

import (
	"bufio"
	"fmt"
	"io"
	"regexp"
	"slices"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"

	"github.com/mjl-/mox/mlog"
	"github.com/mjl-/mox/moxio"
)

// Preview returns a message preview, based on the first text/plain or text/html
// part of the message that has textual content. Preview returns at most 256
// characters (possibly more bytes). Callers may want to truncate and trim trailing
// whitespace before using the preview.
//
// Parts with a Content-Disposition of "attachment" and multipart/encrypted parts
// yield an empty preview. For multipart/signed, only the first sub part is
// considered. For other parts with sub parts, the first sub part that yields a
// non-empty preview (or an error) determines the result.
//
// Preview logs at debug level for invalid messages. An error is only returned for
// serious errors, like i/o errors.
func (p Part) Preview(log mlog.Log) (string, error) {
	// ../rfc/8970:190

	// Don't use if Content-Disposition attachment. A disposition that cannot be
	// parsed is only logged, the part is still considered for the preview.
	disp, _, err := p.DispositionFilename()
	if err != nil {
		log.Debugx("parsing disposition/filename", err)
	} else if strings.EqualFold(disp, "attachment") {
		return "", nil
	}

	mt := p.MediaType + "/" + p.MediaSubType
	switch mt {
	case "TEXT/PLAIN", "/":
		// "/" means both media type and subtype are empty; such a part is handled
		// like text/plain.
		r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 100 * 1024}
		s, err := previewText(r)
		if err != nil {
			return "", fmt.Errorf("making preview from text part: %w", err)
		}
		return s, nil

	case "TEXT/HTML":
		r := &moxio.LimitReader{R: p.ReaderUTF8OrBinary(), Limit: 1024 * 1024}

		// First turn the HTML into text. Failure to do so is not a serious error, we
		// just don't return a preview for this part.
		s, err := previewHTML(r)
		if err != nil {
			log.Debugx("parsing html part for preview (ignored)", err)
			return "", nil
		}

		// Turn text body into a preview text.
		s, err = previewText(strings.NewReader(s))
		if err != nil {
			return "", fmt.Errorf("making preview from text from html: %w", err)
		}
		return s, nil

	case "MULTIPART/ENCRYPTED":
		return "", nil
	}

	for i, sp := range p.Parts {
		// Only the first part of a multipart/signed is looked at.
		if mt == "MULTIPART/SIGNED" && i >= 1 {
			break
		}
		s, err := sp.Preview(log)
		if err != nil || s != "" {
			return s, err
		}
	}
	return "", nil
}

// previewText returns a line the client can display next to the subject line
// in a mailbox. It will replace quoted text, and any prefixing "On ... wrote:"
// line with "[...]" so only new and useful information will be displayed.
// Trailing signatures are not included.
//
// Lines are read from r with a bufio.Scanner, and reading stops once enough text
// has been gathered. The result holds at most 256 characters. The error returned
// is the scanner's error, if any.
func previewText(r io.Reader) (string, error) {
	// We look quite a bit of lines ahead for trailing signatures with trailing empty lines.
	var lines []string
	scanner := bufio.NewScanner(r)
	// ensureLines tops up the look-ahead window to 10 whitespace-trimmed lines, or
	// fewer if the input is exhausted.
	ensureLines := func() {
		for len(lines) < 10 && scanner.Scan() {
			lines = append(lines, strings.TrimSpace(scanner.Text()))
		}
	}
	ensureLines()

	// isSnipped returns whether s is a line with which an author indicates text was
	// left out.
	isSnipped := func(s string) bool {
		return s == "[...]" || s == "[…]" || s == "..."
	}

	// nextLineQuoted returns whether the line after lines[i], skipping at most one
	// empty line, is quoted text or a snip marker.
	nextLineQuoted := func(i int) bool {
		if i+1 < len(lines) && lines[i+1] == "" {
			i++
		}
		return i+1 < len(lines) && (strings.HasPrefix(lines[i+1], ">") || isSnipped(lines[i+1]))
	}

	// Remainder is signature if the current line consists of only dashes, at least
	// 2, and, ignoring trailing empty lines, fewer than 5 lines follow, none of them
	// empty.
	isSignature := func() bool {
		if len(lines) == 0 || !strings.HasPrefix(lines[0], "--") || strings.Trim(strings.TrimSpace(lines[0]), "-") != "" {
			return false
		}
		l := lines[1:]
		for len(l) > 0 && l[len(l)-1] == "" {
			l = l[:len(l)-1]
		}
		if len(l) >= 5 {
			return false
		}
		return !slices.Contains(l, "")
	}

	result := ""

	// resultSnipped returns whether the text gathered so far ends with a snip
	// marker line. Each line in result is terminated with a newline, so the newline
	// must be part of the suffixes we check for.
	resultSnipped := func() bool {
		return strings.HasSuffix(result, "[...]\n") || strings.HasSuffix(result, "[…]\n")
	}

	// Quick check for initial wrapped "On ... wrote:" line.
	if len(lines) > 3 && strings.HasPrefix(lines[0], "On ") && !strings.HasSuffix(lines[0], "wrote:") && strings.HasSuffix(lines[1], ":") && nextLineQuoted(1) {
		result = "[...]\n"
		lines = lines[3:]
		ensureLines()
	}

	for ; len(lines) > 0 && !isSignature(); ensureLines() {
		line := lines[0]

		// Quoted text is replaced by a single snip marker.
		if strings.HasPrefix(line, ">") {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Empty lines don't end up in the preview.
		if line == "" {
			lines = lines[1:]
			continue
		}

		// Check for a "On <date>, <person> wrote:", we require digits before a quoted
		// line, with an optional empty line in between. If we don't have any text yet, we
		// don't require the digits.
		if strings.HasSuffix(line, ":") && (strings.ContainsAny(line, "0123456789") || result == "") && nextLineQuoted(0) {
			if !resultSnipped() {
				result += "[...]\n"
			}
			lines = lines[1:]
			continue
		}

		// Skip possibly duplicate snipping by author.
		if !isSnipped(line) || !resultSnipped() {
			result += line + "\n"
		}
		lines = lines[1:]

		// We have about enough for a preview, no need to read more of the input.
		if len(result) > 250 {
			break
		}
	}

	// Limit number of characters (not bytes). ../rfc/8970:200
	// To 256 characters. ../rfc/8970:211
	// Ranging over a string yields the byte offset of each character, so o is the
	// offset where the 257th character starts.
	var o, n int
	for o = range result {
		n++
		if n > 256 {
			result = result[:o]
			break
		}
	}

	return result, scanner.Err()
}

// Any text inside these html elements (recursively) is ignored.
var ignoreAtoms = atomMap(
	atom.Dialog,
	atom.Head,
	atom.Map,
	atom.Math,
	atom.Script,
	atom.Style,
	atom.Svg,
	atom.Template,
)

// Inline elements don't force newlines at beginning & end of text in this element.
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element#inline_text_semantics
var inlineAtoms = atomMap(
	atom.A,
	atom.Abbr,
	atom.B,
	atom.Bdi,
	atom.Bdo,
	atom.Cite,
	atom.Code,
	atom.Data,
	atom.Dfn,
	atom.Em,
	atom.I,
	atom.Kbd,
	atom.Mark,
	atom.Q,
	atom.Rp,
	atom.Rt,
	atom.Ruby,
	atom.S,
	atom.Samp,
	atom.Small,
	atom.Span,
	atom.Strong,
	atom.Sub,
	atom.Sup,
	atom.Time,
	atom.U,
	atom.Var,
	atom.Wbr,

	atom.Del,
	atom.Ins,

	// We treat these specially, inserting a space after them instead of a newline.
	atom.Td,
	atom.Th,
)

// atomMap returns a set with the atoms in l, for quick membership checks.
func atomMap(l ...atom.Atom) map[atom.Atom]bool {
	m := map[atom.Atom]bool{}
	for _, a := range l {
		m[a] = true
	}
	return m
}

var regexpSpace = regexp.MustCompile(`[ \t]+`)                                                    // Replaced with single space.
var regexpNewline = regexp.MustCompile(`\n\n\n+`)                                                 // Replaced with single newline.
var regexpZeroWidth = regexp.MustCompile("[\u00a0\u200b\u200c\u200d][\u00a0\u200b\u200c\u200d]+") // Removed, combinations don't make sense, generated.

// previewHTML parses the HTML document read from r and returns its text, for
// further processing by previewText. Text inside elements in ignoreAtoms is
// skipped, text inside blockquote elements is prefixed with "> " for each level
// of nesting, and a newline is added after each element that is not in
// inlineAtoms. Gathering text stops after about 4k bytes.
//
// An error is returned if the HTML cannot be parsed or an error node is
// encountered.
func previewHTML(r io.Reader) (string, error) {
	// Stack/state, based on elements.
	var ignores []bool
	var inlines []bool

	var text string // Collecting text.
	var err error   // Set when walking DOM.
	var quoteLevel int

	// We'll walk the DOM nodes, keeping track of whether we are ignoring text, and
	// whether we are in an inline or block element, and building up the text. We stop
	// when we have enough data, returning false in that case.
	var walk func(n *html.Node) bool
	walk = func(n *html.Node) bool {
		switch n.Type {
		case html.ErrorNode:
			err = fmt.Errorf("unexpected error node")
			return false

		case html.ElementNode:
			ignores = append(ignores, ignoreAtoms[n.DataAtom])
			inline := inlineAtoms[n.DataAtom]
			inlines = append(inlines, inline)
			if n.DataAtom == atom.Blockquote {
				quoteLevel++
			}
			// Restore state when done with this element and its children, also when
			// stopping early, and terminate the text of the element.
			defer func() {
				if n.DataAtom == atom.Blockquote {
					quoteLevel--
				}
				if !inline && !strings.HasSuffix(text, "\n\n") {
					text += "\n"
				} else if (n.DataAtom == atom.Td || n.DataAtom == atom.Th) && !strings.HasSuffix(text, " ") {
					text += " "
				}

				ignores = ignores[:len(ignores)-1]
				inlines = inlines[:len(inlines)-1]
			}()

		case html.TextNode:
			// Text anywhere below an ignored element is skipped.
			if slices.Contains(ignores, true) {
				return true
			}

			// Normalize whitespace: carriage returns are dropped, tabs become spaces, runs
			// of spaces are collapsed into a single space, 3 or more newlines become a
			// single newline, and runs of non-breaking/zero-width characters are removed.
			var b strings.Builder
			b.Grow(len(n.Data))
			for _, c := range n.Data {
				switch c {
				case '\r':
					// Dropped.
				case '\t':
					b.WriteByte(' ')
				default:
					b.WriteRune(c)
				}
			}
			s := b.String()
			s = regexpSpace.ReplaceAllString(s, " ")
			s = regexpNewline.ReplaceAllString(s, "\n")
			s = regexpZeroWidth.ReplaceAllString(s, "")

			inline := len(inlines) > 0 && inlines[len(inlines)-1]
			ts := strings.TrimSpace(s)
			// Whitespace-only text directly in a block element is not content. The break
			// leaves the switch; a text node has no children to walk below.
			if !inline && ts == "" {
				break
			}
			// NOTE(review): for whitespace-only text this checks the suffix of s, the new
			// fragment, so such a fragment ending in a space or newline is dropped.
			// Possibly the accumulated text was meant to be checked instead — confirm.
			if ts != "" || !strings.HasSuffix(s, " ") && !strings.HasSuffix(s, "\n") {
				if quoteLevel > 0 {
					// Prefix each line of the fragment with a quote marker per level, so
					// previewText recognizes it as quoted text.
					q := strings.Repeat("> ", quoteLevel)
					var sb strings.Builder
					for s != "" {
						o := strings.IndexByte(s, '\n')
						if o < 0 {
							o = len(s)
						} else {
							o++
						}
						sb.WriteString(q)
						sb.WriteString(s[:o])
						s = s[o:]
					}
					s = sb.String()
				}
				text += s
			}
			// We need to generate at most 256 characters of preview. The text we're gathering
			// will be cleaned up, with quoting removed, so we'll end up with less. Hopefully,
			// 4k bytes is enough to read.
			if len(text) >= 4*1024 {
				return false
			}
		}
		// Ignored: DocumentNode, CommentNode, DoctypeNode, RawNode

		for cn := range n.ChildNodes() {
			// Propagate the request to stop all the way up, so ancestors don't continue
			// with their remaining children.
			if !walk(cn) {
				return false
			}
		}

		return true
	}

	node, err := html.Parse(r)
	if err != nil {
		return "", fmt.Errorf("parsing html: %w", err)
	}

	// Build text.
	walk(node)

	text = strings.TrimSpace(text)
	text = regexpSpace.ReplaceAllString(text, " ")
	return text, err
}