mox!

2025-07-12 17:04:39 +03:00 · 2023-01-30 14:27:06 +01:00
commit cb229cb6cf
1256 changed files with 491723 additions and 0 deletions
--- a/junk.go
+++ b/junk.go
@ -0,0 +1,440 @@
+package main
+
+/*
+note: these testdata paths are not in the repo, you should gather some of your
+own ham/spam emails.
+
+./mox junk train testdata/train/ham testdata/train/spam
+./mox junk train -sent-dir testdata/sent testdata/train/ham testdata/train/spam
+./mox junk check 'testdata/check/ham/mail1'
+./mox junk test testdata/check/ham testdata/check/spam
+./mox junk analyze testdata/train/ham testdata/train/spam
+./mox junk analyze -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
+./mox junk play -top-words 10 -train-ratio 0.5 -spam-threshold 0.85 -max-power 0.01 -sent-dir testdata/sent testdata/train/ham testdata/train/spam
+*/
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	mathrand "math/rand"
+	"os"
+	"runtime"
+	"runtime/pprof"
+	"sort"
+	"time"
+
+	"github.com/mjl-/mox/junk"
+	"github.com/mjl-/mox/message"
+	"github.com/mjl-/mox/mlog"
+	"github.com/mjl-/mox/mox-"
+)
+
+type junkArgs struct {
+	params                        junk.Params
+	cpuprofile, memprofile        string
+	spamThreshold                 float64
+	trainRatio                    float64
+	seed                          bool
+	sentDir                       string
+	databasePath, bloomfilterPath string
+	debug                         bool
+}
+
+func (a junkArgs) Memprofile() {
+	if a.memprofile == "" {
+		return
+	}
+
+	f, err := os.Create(a.memprofile)
+	xcheckf(err, "creating memory profile")
+	defer f.Close()
+	runtime.GC() // get up-to-date statistics
+	err = pprof.WriteHeapProfile(f)
+	xcheckf(err, "writing memory profile")
+}
+
+func (a junkArgs) Profile() func() {
+	if a.cpuprofile == "" {
+		return func() {
+			a.Memprofile()
+		}
+	}
+
+	f, err := os.Create(a.cpuprofile)
+	xcheckf(err, "creating CPU profile")
+	err = pprof.StartCPUProfile(f)
+	xcheckf(err, "start CPU profile")
+	return func() {
+		pprof.StopCPUProfile()
+		f.Close()
+		a.Memprofile()
+	}
+}
+
+func (a junkArgs) SetLogLevel() {
+	mox.Conf.Log[""] = mlog.LevelInfo
+	if a.debug {
+		mox.Conf.Log[""] = mlog.LevelDebug
+	}
+	mlog.SetConfig(mox.Conf.Log)
+}
+
+func junkFlags(fs *flag.FlagSet) (a junkArgs) {
+	fs.BoolVar(&a.params.Onegrams, "one-grams", false, "use 1-grams, i.e. single words, for scoring")
+	fs.BoolVar(&a.params.Twograms, "two-grams", true, "use 2-grams, i.e. word pairs, for scoring")
+	fs.BoolVar(&a.params.Threegrams, "three-grams", false, "use 3-grams, i.e. word triplets, for scoring")
+	fs.Float64Var(&a.params.MaxPower, "max-power", 0.05, "maximum word power, e.g. min 0.05/max 0.95")
+	fs.Float64Var(&a.params.IgnoreWords, "ignore-words", 0.1, "ignore words with ham/spaminess within this distance from 0.5")
+	fs.IntVar(&a.params.TopWords, "top-words", 10, "number of top spam and number of top ham words from email to use")
+	fs.IntVar(&a.params.RareWords, "rare-words", 1, "words are rare if encountered this number during training, and skipped for scoring")
+	fs.BoolVar(&a.debug, "debug", false, "print debug logging when calculating spam probability")
+
+	fs.Float64Var(&a.spamThreshold, "spam-threshold", 0.95, "probability where message is seen as spam")
+	fs.Float64Var(&a.trainRatio, "train-ratio", 0.5, "part of data to use for training versus analyzing (for analyze only)")
+	fs.StringVar(&a.sentDir, "sent-dir", "", "directory with sent mails, for training")
+	fs.BoolVar(&a.seed, "seed", false, "seed prng before analysis")
+	fs.StringVar(&a.databasePath, "dbpath", "filter.db", "database file for ham/spam words")
+	fs.StringVar(&a.bloomfilterPath, "bloompath", "filter.bloom", "bloom filter for ignoring unique strings")
+
+	fs.StringVar(&a.cpuprofile, "cpuprof", "", "store cpu profile to file")
+	fs.StringVar(&a.memprofile, "memprof", "", "store mem profile to file")
+	return
+}
+
+func listDir(dir string) (l []string) {
+	files, err := os.ReadDir(dir)
+	xcheckf(err, "listing directory %q", dir)
+	for _, f := range files {
+		l = append(l, f.Name())
+	}
+	return l
+}
+
+func must(f *junk.Filter, err error) *junk.Filter {
+	xcheckf(err, "filter")
+	return f
+}
+
+func cmdJunkTrain(c *cmd) {
+	c.unlisted = true
+	c.params = "hamdir spamdir"
+	c.help = "Train a junk filter with messages from hamdir and spamdir."
+	a := junkFlags(c.flag)
+	args := c.Parse()
+	if len(args) != 2 {
+		c.Usage()
+	}
+	defer a.Profile()()
+	a.SetLogLevel()
+
+	f := must(junk.NewFilter(mlog.New("junktrain"), a.params, a.databasePath, a.bloomfilterPath))
+	defer f.Close()
+
+	hamFiles := listDir(args[0])
+	spamFiles := listDir(args[1])
+	var sentFiles []string
+	if a.sentDir != "" {
+		sentFiles = listDir(a.sentDir)
+	}
+
+	err := f.TrainDirs(args[0], a.sentDir, args[1], hamFiles, sentFiles, spamFiles)
+	xcheckf(err, "train")
+}
+
+func cmdJunkCheck(c *cmd) {
+	c.unlisted = true
+	c.params = "mailfile"
+	c.help = "Check an email message against a junk filter, printing the probability of spam on a scale from 0 to 1."
+	a := junkFlags(c.flag)
+	args := c.Parse()
+	if len(args) != 1 {
+		c.Usage()
+	}
+	defer a.Profile()()
+	a.SetLogLevel()
+
+	f := must(junk.OpenFilter(mlog.New("junkcheck"), a.params, a.databasePath, a.bloomfilterPath, false))
+	defer f.Close()
+
+	prob, _, _, _, err := f.ClassifyMessagePath(args[0])
+	xcheckf(err, "testing mail")
+
+	fmt.Printf("%.6f\n", prob)
+}
+
+func cmdJunkTest(c *cmd) {
+	c.unlisted = true
+	c.params = "hamdir spamdir"
+	c.help = "Check a directory with hams and one with spams against the junk filter, and report the success ratio."
+	a := junkFlags(c.flag)
+	args := c.Parse()
+	if len(args) != 2 {
+		c.Usage()
+	}
+	defer a.Profile()()
+	a.SetLogLevel()
+
+	f := must(junk.OpenFilter(mlog.New("junktest"), a.params, a.databasePath, a.bloomfilterPath, false))
+	defer f.Close()
+
+	testDir := func(dir string, ham bool) (int, int) {
+		ok, bad := 0, 0
+		files, err := os.ReadDir(dir)
+		xcheckf(err, "readdir %q", dir)
+		for _, fi := range files {
+			path := dir + "/" + fi.Name()
+			prob, _, _, _, err := f.ClassifyMessagePath(path)
+			if err != nil {
+				log.Printf("classify message %q: %s", path, err)
+				continue
+			}
+			if ham && prob < a.spamThreshold || !ham && prob > a.spamThreshold {
+				ok++
+			} else {
+				bad++
+			}
+			if ham && prob > a.spamThreshold {
+				fmt.Printf("ham %q: %.4f\n", path, prob)
+			}
+			if !ham && prob < a.spamThreshold {
+				fmt.Printf("spam %q: %.4f\n", path, prob)
+			}
+		}
+		return ok, bad
+	}
+
+	nhamok, nhambad := testDir(args[0], true)
+	nspamok, nspambad := testDir(args[1], false)
+	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
+	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
+	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
+	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
+	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
+}
+
+func cmdJunkAnalyze(c *cmd) {
+	c.unlisted = true
+	c.params = "hamdir spamdir"
+	c.help = `Analyze a directory with ham messages and one with spam messages.
+
+A part of the messages is used for training, and remaining for testing. The
+messages are shuffled, with optional random seed.`
+	a := junkFlags(c.flag)
+	args := c.Parse()
+	if len(args) != 2 {
+		c.Usage()
+	}
+	defer a.Profile()()
+	a.SetLogLevel()
+
+	f := must(junk.NewFilter(mlog.New("junkanalyze"), a.params, a.databasePath, a.bloomfilterPath))
+	defer f.Close()
+
+	hamDir := args[0]
+	spamDir := args[1]
+	hamFiles := listDir(hamDir)
+	spamFiles := listDir(spamDir)
+
+	var rand *mathrand.Rand
+	if a.seed {
+		rand = mathrand.New(mathrand.NewSource(time.Now().UnixMilli()))
+	} else {
+		rand = mathrand.New(mathrand.NewSource(0))
+	}
+
+	shuffle := func(l []string) {
+		count := len(l)
+		for i := range l {
+			n := rand.Intn(count)
+			l[i], l[n] = l[n], l[i]
+		}
+	}
+
+	shuffle(hamFiles)
+	shuffle(spamFiles)
+
+	ntrainham := int(a.trainRatio * float64(len(hamFiles)))
+	ntrainspam := int(a.trainRatio * float64(len(spamFiles)))
+
+	trainHam := hamFiles[:ntrainham]
+	trainSpam := spamFiles[:ntrainspam]
+	testHam := hamFiles[ntrainham:]
+	testSpam := spamFiles[ntrainspam:]
+
+	var trainSent []string
+	if a.sentDir != "" {
+		trainSent = listDir(a.sentDir)
+	}
+
+	err := f.TrainDirs(hamDir, a.sentDir, spamDir, trainHam, trainSent, trainSpam)
+	xcheckf(err, "train")
+
+	testDir := func(dir string, files []string, ham bool) (ok, bad, malformed int) {
+		for _, name := range files {
+			path := dir + "/" + name
+			prob, _, _, _, err := f.ClassifyMessagePath(path)
+			if err != nil {
+				// log.Infof("%s: %s", path, err)
+				malformed++
+				continue
+			}
+			if ham && prob < a.spamThreshold || !ham && prob > a.spamThreshold {
+				ok++
+			} else {
+				bad++
+			}
+			if ham && prob > a.spamThreshold {
+				fmt.Printf("ham %q: %.4f\n", path, prob)
+			}
+			if !ham && prob < a.spamThreshold {
+				fmt.Printf("spam %q: %.4f\n", path, prob)
+			}
+		}
+		return
+	}
+
+	nhamok, nhambad, nmalformedham := testDir(args[0], testHam, true)
+	nspamok, nspambad, nmalformedspam := testDir(args[1], testSpam, false)
+	fmt.Printf("training done, nham %d, nsent %d, nspam %d\n", ntrainham, len(trainSent), ntrainspam)
+	fmt.Printf("total ham, ok %d, bad %d, malformed %d\n", nhamok, nhambad, nmalformedham)
+	fmt.Printf("total spam, ok %d, bad %d, malformed %d\n", nspamok, nspambad, nmalformedspam)
+	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
+	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
+	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
+}
+
+func cmdJunkPlay(c *cmd) {
+	c.unlisted = true
+	c.params = "hamdir spamdir"
+	c.help = "Play messages from ham and spam directory according to their time of arrival and report on junk filter performance."
+	a := junkFlags(c.flag)
+	args := c.Parse()
+	if len(args) != 2 {
+		c.Usage()
+	}
+	defer a.Profile()()
+	a.SetLogLevel()
+
+	f := must(junk.NewFilter(mlog.New("junkplay"), a.params, a.databasePath, a.bloomfilterPath))
+	defer f.Close()
+
+	// We'll go through all emails to find their dates.
+	type msg struct {
+		dir, filename string
+		ham, sent     bool
+		t             time.Time
+	}
+	var msgs []msg
+
+	var nbad, nnodate, nham, nspam, nsent int
+
+	scanDir := func(dir string, ham, sent bool) {
+		for _, name := range listDir(dir) {
+			path := dir + "/" + name
+			mf, err := os.Open(path)
+			xcheckf(err, "open %q", path)
+			fi, err := mf.Stat()
+			xcheckf(err, "stat %q", path)
+			p, err := message.EnsurePart(mf, fi.Size())
+			if err != nil {
+				nbad++
+				mf.Close()
+				continue
+			}
+			if p.Envelope.Date.IsZero() {
+				nnodate++
+				mf.Close()
+				continue
+			}
+			mf.Close()
+			msgs = append(msgs, msg{dir, name, ham, sent, p.Envelope.Date})
+			if sent {
+				nsent++
+			} else if ham {
+				nham++
+			} else {
+				nspam++
+			}
+		}
+	}
+
+	hamDir := args[0]
+	spamDir := args[1]
+	scanDir(hamDir, true, false)
+	scanDir(spamDir, false, false)
+	if a.sentDir != "" {
+		scanDir(a.sentDir, true, true)
+	}
+
+	// Sort the messages, earliest first.
+	sort.Slice(msgs, func(i, j int) bool {
+		return msgs[i].t.Before(msgs[j].t)
+	})
+
+	// Play all messages as if they are coming in. We predict their spaminess, check if
+	// we are right. And we train the system with the result.
+	var nhamok, nhambad, nspamok, nspambad int
+
+	play := func(msg msg) {
+		var words map[string]struct{}
+		path := msg.dir + "/" + msg.filename
+		if !msg.sent {
+			var prob float64
+			var err error
+			prob, words, _, _, err = f.ClassifyMessagePath(path)
+			if err != nil {
+				nbad++
+				return
+			}
+			if msg.ham {
+				if prob < a.spamThreshold {
+					nhamok++
+				} else {
+					nhambad++
+				}
+			} else {
+				if prob > a.spamThreshold {
+					nspamok++
+				} else {
+					nspambad++
+				}
+			}
+		} else {
+			mf, err := os.Open(path)
+			xcheckf(err, "open %q", path)
+			defer mf.Close()
+			fi, err := mf.Stat()
+			xcheckf(err, "stat %q", path)
+			p, err := message.EnsurePart(mf, fi.Size())
+			if err != nil {
+				log.Printf("bad sent message %q: %s", path, err)
+				return
+			}
+
+			words, err = f.ParseMessage(p)
+			if err != nil {
+				log.Printf("bad sent message %q: %s", path, err)
+				return
+			}
+		}
+
+		if err := f.Train(msg.ham, words); err != nil {
+			log.Printf("train: %s", err)
+		}
+	}
+
+	for _, m := range msgs {
+		play(m)
+	}
+
+	err := f.Save()
+	xcheckf(err, "saving filter")
+
+	fmt.Printf("completed, nham %d, nsent %d, nspam %d, nbad %d, nwithoutdate %d\n", nham, nsent, nspam, nbad, nnodate)
+	fmt.Printf("total ham, ok %d, bad %d\n", nhamok, nhambad)
+	fmt.Printf("total spam, ok %d, bad %d\n", nspamok, nspambad)
+	fmt.Printf("specifity (true negatives, hams identified): %.6f\n", float64(nhamok)/(float64(nhamok+nhambad)))
+	fmt.Printf("sensitivity (true positives, spams identified): %.6f\n", float64(nspamok)/(float64(nspamok+nspambad)))
+	fmt.Printf("accuracy: %.6f\n", float64(nhamok+nspamok)/float64(nhamok+nhambad+nspamok+nspambad))
+}