mirror of
https://github.com/mjl-/mox.git
synced 2025-07-12 17:44:35 +03:00
improve training of junk filter
before, we used heuristics to decide when to train/untrain a message as junk or nonjunk: the message had to be seen, be in certain mailboxes. then if a message was marked as junk, it was junk. and otherwise it was nonjunk. this wasn't good enough: you may want to keep some messages around as neither junk nor nonjunk. and that wasn't possible. ideally, we would just look at the imap $Junk and $NotJunk flags. the problem is that mail clients don't set these flags, or don't make it easy. thunderbird can set the flags based on its own bayesian filter. it has a shortcut for marking Junk and moving it to the junk folder (good), but the counterpart of notjunk only marks a message as notjunk without showing in the UI that it was marked as notjunk. there is also no "move and mark as notjunk" mechanism. e.g. "archive" does not mark a message as notjunk. ios mail and mutt don't appear to have any way to see or change the $Junk and $NotJunk flags. what email clients do have is the ability to move messages to other mailboxes/folders. so mox now has a mechanism that allows you to configure mailboxes that automatically set $Junk or $NotJunk (or clear both) when a message is moved/copied/delivered to that folder. e.g. a mailbox called junk or spam or rejects marks its messages as junk. inbox, postmaster, dmarc, tlsrpt, neutral* mark their messages as neither junk nor notjunk. other folders mark their messages as notjunk. e.g. list/*, archive. this functionality is optional, but enabled with the quickstart and for new accounts. also, mox now keeps track of the previous training of a message and will only untrain/train if needed. before, there have probably been duplicate or missing (un)trainings. this also includes a new subcommand "retrain" to recreate the junkfilter for an account. you should run it after updating to this version. and you should probably also modify your account config to include the AutomaticJunkFlags.
This commit is contained in:
@ -276,8 +276,9 @@ type Message struct {
|
||||
|
||||
MessageHash []byte // Hash of message. For rejects delivery, so optional like MessageID.
|
||||
Flags
|
||||
Size int64
|
||||
MsgPrefix []byte // Typically holds received headers and/or header separator.
|
||||
Size int64
|
||||
TrainedJunk *bool // If nil, no training done yet. Otherwise, true is trained as junk, false trained as nonjunk.
|
||||
MsgPrefix []byte // Typically holds received headers and/or header separator.
|
||||
|
||||
// ParsedBuf message structure. Currently saved as JSON of message.Part because bstore
|
||||
// cannot yet store recursive types. Created when first needed, and saved in the
|
||||
@ -299,6 +300,48 @@ func (m Message) LoadPart(r io.ReaderAt) (message.Part, error) {
|
||||
return p, nil
|
||||
}
|
||||
|
||||
// NeedsTraining returns whether message needs a training update, based on
|
||||
// TrainedJunk (current training status) and new Junk/Notjunk flags.
|
||||
func (m Message) NeedsTraining() bool {
|
||||
untrain := m.TrainedJunk != nil
|
||||
untrainJunk := untrain && *m.TrainedJunk
|
||||
train := m.Junk || m.Notjunk && !(m.Junk && m.Notjunk)
|
||||
trainJunk := m.Junk
|
||||
return untrain != train || untrain && train && untrainJunk != trainJunk
|
||||
}
|
||||
|
||||
// JunkFlagsForMailbox sets Junk and Notjunk flags based on mailbox name if configured. Often
|
||||
// used when delivering/moving/copying messages to a mailbox. Mail clients are not
|
||||
// very helpful with setting junk/notjunk flags. But clients can move/copy messages
|
||||
// to other mailboxes. So we set flags when clients move a message.
|
||||
func (m *Message) JunkFlagsForMailbox(mailbox string, conf config.Account) {
|
||||
if !conf.AutomaticJunkFlags.Enabled {
|
||||
return
|
||||
}
|
||||
|
||||
lmailbox := strings.ToLower(mailbox)
|
||||
|
||||
if conf.JunkMailbox != nil && conf.JunkMailbox.MatchString(lmailbox) {
|
||||
m.Junk = true
|
||||
m.Notjunk = false
|
||||
} else if conf.NeutralMailbox != nil && conf.NeutralMailbox.MatchString(lmailbox) {
|
||||
m.Junk = false
|
||||
m.Notjunk = false
|
||||
} else if conf.NotJunkMailbox != nil && conf.NotJunkMailbox.MatchString(lmailbox) {
|
||||
m.Junk = false
|
||||
m.Notjunk = true
|
||||
} else if conf.JunkMailbox == nil && conf.NeutralMailbox != nil && conf.NotJunkMailbox != nil {
|
||||
m.Junk = true
|
||||
m.Notjunk = false
|
||||
} else if conf.JunkMailbox != nil && conf.NeutralMailbox == nil && conf.NotJunkMailbox != nil {
|
||||
m.Junk = false
|
||||
m.Notjunk = false
|
||||
} else if conf.JunkMailbox != nil && conf.NeutralMailbox != nil && conf.NotJunkMailbox == nil {
|
||||
m.Junk = false
|
||||
m.Notjunk = true
|
||||
}
|
||||
}
|
||||
|
||||
// Recipient represents the recipient of a message. It is tracked to allow
|
||||
// first-time incoming replies from users this account has sent messages to. On
|
||||
// IMAP append to Sent, the message is parsed and recipients are inserted as
|
||||
@ -528,14 +571,10 @@ func (a *Account) WithRLock(fn func()) {
|
||||
// If sync is true, the message file and its directory are synced. Should be true
|
||||
// for regular mail delivery, but can be false when importing many messages.
|
||||
//
|
||||
// if train is true, the junkfilter (if configured) is trained with the message.
|
||||
// Should be used for regular mail delivery, but can be false when importing many
|
||||
// messages.
|
||||
//
|
||||
// Must be called with account rlock or wlock.
|
||||
//
|
||||
// Caller must broadcast new message.
|
||||
func (a *Account) DeliverX(log *mlog.Log, tx *bstore.Tx, m *Message, msgFile *os.File, consumeFile, isSent, sync, train bool) {
|
||||
func (a *Account) DeliverX(log *mlog.Log, tx *bstore.Tx, m *Message, msgFile *os.File, consumeFile, isSent, sync bool) {
|
||||
mb := Mailbox{ID: m.MailboxID}
|
||||
err := tx.Get(&mb)
|
||||
xcheckf(err, "get mailbox")
|
||||
@ -544,6 +583,9 @@ func (a *Account) DeliverX(log *mlog.Log, tx *bstore.Tx, m *Message, msgFile *os
|
||||
err = tx.Update(&mb)
|
||||
xcheckf(err, "updating mailbox nextuid")
|
||||
|
||||
conf, _ := a.Conf()
|
||||
m.JunkFlagsForMailbox(mb.Name, conf)
|
||||
|
||||
var part *message.Part
|
||||
if m.ParsedBuf == nil {
|
||||
mr := FileMsgReader(m.MsgPrefix, msgFile) // We don't close, it would close the msgFile.
|
||||
@ -629,13 +671,10 @@ func (a *Account) DeliverX(log *mlog.Log, tx *bstore.Tx, m *Message, msgFile *os
|
||||
xcheckf(err, "sync directory")
|
||||
}
|
||||
|
||||
if train {
|
||||
conf, _ := a.Conf()
|
||||
if mb.Name != conf.RejectsMailbox {
|
||||
err := a.Train(log, []Message{*m})
|
||||
xcheckf(err, "train junkfilter with new message")
|
||||
}
|
||||
}
|
||||
l := []Message{*m}
|
||||
err = a.RetrainMessages(log, tx, l, false)
|
||||
xcheckf(err, "training junkfilter")
|
||||
*m = l[0]
|
||||
}
|
||||
|
||||
// write contents of r to new file dst, for delivering a message.
|
||||
@ -969,7 +1008,7 @@ func (a *Account) DeliverMailbox(log *mlog.Log, mailbox string, m *Message, msgF
|
||||
m.MailboxOrigID = mb.ID
|
||||
changes = append(changes, chl...)
|
||||
|
||||
a.DeliverX(log, tx, m, msgFile, consumeFile, mb.Sent, true, true)
|
||||
a.DeliverX(log, tx, m, msgFile, consumeFile, mb.Sent, true)
|
||||
return nil
|
||||
})
|
||||
// todo: if rename succeeded but transaction failed, we should remove the file.
|
||||
@ -988,7 +1027,7 @@ func (a *Account) DeliverMailbox(log *mlog.Log, mailbox string, m *Message, msgF
|
||||
//
|
||||
// Caller must hold account wlock.
|
||||
// Changes are broadcasted.
|
||||
func (a *Account) TidyRejectsMailbox(rejectsMailbox string) (hasSpace bool, rerr error) {
|
||||
func (a *Account) TidyRejectsMailbox(log *mlog.Log, rejectsMailbox string) (hasSpace bool, rerr error) {
|
||||
var changes []Change
|
||||
|
||||
err := extransact(a.DB, true, func(tx *bstore.Tx) error {
|
||||
@ -1007,7 +1046,7 @@ func (a *Account) TidyRejectsMailbox(rejectsMailbox string) (hasSpace bool, rerr
|
||||
remove, err := qdel.List()
|
||||
xcheckf(err, "listing old messages")
|
||||
|
||||
changes = a.xremoveMessages(tx, mb, remove)
|
||||
changes = a.xremoveMessages(log, tx, mb, remove)
|
||||
|
||||
// We allow up to n messages.
|
||||
qcount := bstore.QueryTx[Message](tx)
|
||||
@ -1027,7 +1066,7 @@ func (a *Account) TidyRejectsMailbox(rejectsMailbox string) (hasSpace bool, rerr
|
||||
return hasSpace, err
|
||||
}
|
||||
|
||||
func (a *Account) xremoveMessages(tx *bstore.Tx, mb *Mailbox, l []Message) []Change {
|
||||
func (a *Account) xremoveMessages(log *mlog.Log, tx *bstore.Tx, mb *Mailbox, l []Message) []Change {
|
||||
if len(l) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -1048,8 +1087,18 @@ func (a *Account) xremoveMessages(tx *bstore.Tx, mb *Mailbox, l []Message) []Cha
|
||||
// Actually remove the messages.
|
||||
qdm := bstore.QueryTx[Message](tx)
|
||||
qdm.FilterIDs(ids)
|
||||
var deleted []Message
|
||||
qdm.Gather(&deleted)
|
||||
_, err = qdm.Delete()
|
||||
xcheckf(err, "deleting from message recipient")
|
||||
xcheckf(err, "deleting from messages")
|
||||
|
||||
// Mark as neutral and train so junk filter gets untrained with these (junk) messages.
|
||||
for i := range deleted {
|
||||
deleted[i].Junk = false
|
||||
deleted[i].Notjunk = false
|
||||
}
|
||||
err = a.RetrainMessages(log, tx, deleted, true)
|
||||
xcheckf(err, "training deleted messages")
|
||||
|
||||
changes := make([]Change, len(l))
|
||||
for i, m := range l {
|
||||
@ -1077,7 +1126,7 @@ func (a *Account) RejectsRemove(log *mlog.Log, rejectsMailbox, messageID string)
|
||||
remove, err := q.List()
|
||||
xcheckf(err, "listing messages to remove")
|
||||
|
||||
changes = a.xremoveMessages(tx, mb, remove)
|
||||
changes = a.xremoveMessages(log, tx, mb, remove)
|
||||
|
||||
return err
|
||||
})
|
||||
|
@ -72,13 +72,13 @@ func TestMailbox(t *testing.T) {
|
||||
tcheck(t, err, "sent mailbox")
|
||||
msent.MailboxID = mbsent.ID
|
||||
msent.MailboxOrigID = mbsent.ID
|
||||
acc.DeliverX(xlog, tx, &msent, msgFile, false, true, true, true)
|
||||
acc.DeliverX(xlog, tx, &msent, msgFile, false, true, true)
|
||||
|
||||
err = tx.Insert(&mbrejects)
|
||||
tcheck(t, err, "insert rejects mailbox")
|
||||
mreject.MailboxID = mbrejects.ID
|
||||
mreject.MailboxOrigID = mbrejects.ID
|
||||
acc.DeliverX(xlog, tx, &mreject, msgFile, false, false, true, true)
|
||||
acc.DeliverX(xlog, tx, &mreject, msgFile, false, false, true)
|
||||
|
||||
return nil
|
||||
})
|
||||
@ -86,25 +86,34 @@ func TestMailbox(t *testing.T) {
|
||||
|
||||
err = acc.Deliver(xlog, conf.Destinations["mjl"], &mconsumed, msgFile, true)
|
||||
tcheck(t, err, "deliver with consume")
|
||||
|
||||
err = acc.DB.Write(func(tx *bstore.Tx) error {
|
||||
m.Junk = true
|
||||
l := []Message{m}
|
||||
err = acc.RetrainMessages(log, tx, l, false)
|
||||
tcheck(t, err, "train as junk")
|
||||
m = l[0]
|
||||
return nil
|
||||
})
|
||||
tcheck(t, err, "train messages")
|
||||
})
|
||||
|
||||
m.Junk = true
|
||||
err = acc.Train(log, []Message{m})
|
||||
tcheck(t, err, "train as junk")
|
||||
|
||||
flags := m.Flags
|
||||
|
||||
m.Seen = true
|
||||
m.Junk = false
|
||||
m.Notjunk = true
|
||||
jf, _, err := acc.OpenJunkFilter(log)
|
||||
tcheck(t, err, "open junk filter")
|
||||
err = acc.Retrain(log, jf, flags, m)
|
||||
tcheck(t, err, "retrain as non-junk")
|
||||
err = acc.DB.Write(func(tx *bstore.Tx) error {
|
||||
return acc.RetrainMessage(log, tx, jf, &m, false)
|
||||
})
|
||||
tcheck(t, err, "retraining as non-junk")
|
||||
err = jf.Close()
|
||||
tcheck(t, err, "close junk filter")
|
||||
|
||||
err = acc.Untrain(log, []Message{m})
|
||||
tcheck(t, err, "untrain non-junk")
|
||||
m.Notjunk = false
|
||||
err = acc.DB.Write(func(tx *bstore.Tx) error {
|
||||
return acc.RetrainMessages(log, tx, []Message{m}, false)
|
||||
})
|
||||
tcheck(t, err, "untraining non-junk")
|
||||
|
||||
err = acc.SetPassword("testtest")
|
||||
tcheck(t, err, "set password")
|
||||
@ -171,7 +180,7 @@ func TestMailbox(t *testing.T) {
|
||||
tcheck(t, err, "write tx")
|
||||
|
||||
// todo: check that messages are removed and changes sent.
|
||||
hasSpace, err := acc.TidyRejectsMailbox("Rejects")
|
||||
hasSpace, err := acc.TidyRejectsMailbox(log, "Rejects")
|
||||
tcheck(t, err, "tidy rejects mailbox")
|
||||
if !hasSpace {
|
||||
t.Fatalf("no space for more rejects")
|
||||
|
@ -5,6 +5,8 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/mjl-/bstore"
|
||||
|
||||
"github.com/mjl-/mox/config"
|
||||
"github.com/mjl-/mox/junk"
|
||||
"github.com/mjl-/mox/mlog"
|
||||
@ -40,28 +42,20 @@ func (a *Account) OpenJunkFilter(log *mlog.Log) (*junk.Filter, *config.JunkFilte
|
||||
return f, jf, err
|
||||
}
|
||||
|
||||
// Train new messages, if relevant given their flags.
|
||||
func (a *Account) Train(log *mlog.Log, msgs []Message) error {
|
||||
return a.xtrain(log, msgs, false, true)
|
||||
}
|
||||
|
||||
// Untrain removed messages, if relevant given their flags.
|
||||
func (a *Account) Untrain(log *mlog.Log, msgs []Message) error {
|
||||
return a.xtrain(log, msgs, true, false)
|
||||
}
|
||||
|
||||
// train or untrain messages, if relevant given their flags.
|
||||
func (a *Account) xtrain(log *mlog.Log, msgs []Message, untrain, train bool) (rerr error) {
|
||||
// RetrainMessages (un)trains messages, if relevant given their flags. Updates
|
||||
// m.TrainedJunk after retraining.
|
||||
func (a *Account) RetrainMessages(log *mlog.Log, tx *bstore.Tx, msgs []Message, absentOK bool) (rerr error) {
|
||||
if len(msgs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var jf *junk.Filter
|
||||
|
||||
for _, m := range msgs {
|
||||
if !m.Seen && !m.Junk {
|
||||
for i := range msgs {
|
||||
if !msgs[i].NeedsTraining() {
|
||||
continue
|
||||
}
|
||||
|
||||
// Lazy open the junk filter.
|
||||
if jf == nil {
|
||||
var err error
|
||||
@ -79,33 +73,28 @@ func (a *Account) xtrain(log *mlog.Log, msgs []Message, untrain, train bool) (re
|
||||
}
|
||||
}()
|
||||
}
|
||||
ham := !m.Junk
|
||||
err := xtrainMessage(log, a, jf, m, untrain, ham, train, ham)
|
||||
if err != nil {
|
||||
if err := a.RetrainMessage(log, tx, jf, &msgs[i], absentOK); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Retrain message, if relevant given old flags and the new flags in m.
|
||||
func (a *Account) Retrain(log *mlog.Log, jf *junk.Filter, old Flags, m Message) error {
|
||||
untrain := old.Seen || old.Junk
|
||||
train := m.Seen || m.Junk
|
||||
untrainHam := !old.Junk
|
||||
trainHam := !m.Junk
|
||||
// RetrainMessage untrains and/or trains a message, if relevant given m.TrainedJunk
|
||||
// and m.Junk/m.Notjunk. Updates m.TrainedJunk after retraining.
|
||||
func (a *Account) RetrainMessage(log *mlog.Log, tx *bstore.Tx, jf *junk.Filter, m *Message, absentOK bool) error {
|
||||
untrain := m.TrainedJunk != nil
|
||||
untrainJunk := untrain && *m.TrainedJunk
|
||||
train := m.Junk || m.Notjunk && !(m.Junk && m.Notjunk)
|
||||
trainJunk := m.Junk
|
||||
|
||||
if !untrain && !train || (untrain && train && trainHam == untrainHam) {
|
||||
if !untrain && !train || (untrain && train && untrainJunk == trainJunk) {
|
||||
return nil
|
||||
}
|
||||
|
||||
return xtrainMessage(log, a, jf, m, untrain, untrainHam, train, trainHam)
|
||||
}
|
||||
log.Info("updating junk filter", mlog.Field("untrain", untrain), mlog.Field("untrainJunk", untrainJunk), mlog.Field("train", train), mlog.Field("trainJunk", trainJunk))
|
||||
|
||||
func xtrainMessage(log *mlog.Log, a *Account, jf *junk.Filter, m Message, untrain, untrainHam, train, trainHam bool) error {
|
||||
log.Info("updating junk filter", mlog.Field("untrain", untrain), mlog.Field("untrainHam", untrainHam), mlog.Field("train", train), mlog.Field("trainHam", trainHam))
|
||||
|
||||
mr := a.MessageReader(m)
|
||||
mr := a.MessageReader(*m)
|
||||
defer mr.Close()
|
||||
|
||||
p, err := m.LoadPart(mr)
|
||||
@ -121,16 +110,46 @@ func xtrainMessage(log *mlog.Log, a *Account, jf *junk.Filter, m Message, untrai
|
||||
}
|
||||
|
||||
if untrain {
|
||||
err := jf.Untrain(untrainHam, words)
|
||||
err := jf.Untrain(!untrainJunk, words)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.TrainedJunk = nil
|
||||
}
|
||||
if train {
|
||||
err := jf.Train(trainHam, words)
|
||||
err := jf.Train(!trainJunk, words)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
m.TrainedJunk = &trainJunk
|
||||
}
|
||||
if err := tx.Update(m); err != nil && (!absentOK || err != bstore.ErrAbsent) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TrainMessage trains the junk filter based on the current m.Junk/m.Notjunk flags,
|
||||
// disregarding m.TrainedJunk and not updating that field.
|
||||
func (a *Account) TrainMessage(log *mlog.Log, jf *junk.Filter, m Message) (bool, error) {
|
||||
if !m.Junk && !m.Notjunk || (m.Junk && m.Notjunk) {
|
||||
return false, nil
|
||||
}
|
||||
|
||||
mr := a.MessageReader(m)
|
||||
defer mr.Close()
|
||||
|
||||
p, err := m.LoadPart(mr)
|
||||
if err != nil {
|
||||
log.Errorx("loading part for message", err)
|
||||
return false, nil
|
||||
}
|
||||
|
||||
words, err := jf.ParseMessage(p)
|
||||
if err != nil {
|
||||
log.Errorx("parsing message for updating junk filter", err, mlog.Field("parse", ""))
|
||||
return false, nil
|
||||
}
|
||||
|
||||
return true, jf.Train(m.Notjunk, words)
|
||||
}
|
||||
|
Reference in New Issue
Block a user