From 75a1691c1ba6aeccec73302cca36a50f786afa2e Mon Sep 17 00:00:00 2001 From: M66B Date: Mon, 28 Dec 2020 12:03:34 +0100 Subject: [PATCH] Text classification experiment --- ATTRIBUTION.md | 1 + app/build.gradle | 4 +++ app/src/main/assets/ATTRIBUTION.md | 1 + .../java/eu/faircode/email/WorkerFts.java | 32 ++++++++++++++++++- 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/ATTRIBUTION.md b/ATTRIBUTION.md index f60a3a7c0f..4d23e236c5 100644 --- a/ATTRIBUTION.md +++ b/ATTRIBUTION.md @@ -29,3 +29,4 @@ FairEmail uses: * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE) * [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE). * [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing). +* [Java Naive Bayes Classifier](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier). Copyright (c) 2012-2017 Philipp Nolte. [MIT License](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier#the-mit-license-mit). diff --git a/app/build.gradle b/app/build.gradle index 1b6fa02d9d..c9251f33fd 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -278,6 +278,7 @@ dependencies { def appauth_version = "0.7.1" def jcharset_version = "2.1" def apache_poi = "3.17" + def bayes_version = "1.0.7" // https://developer.android.com/jetpack/androidx/releases/ @@ -442,4 +443,7 @@ dependencies { // https://poi.apache.org/components/hmef/index.html // https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad implementation "org.apache.poi:poi-scratchpad:$apache_poi" + + // https://github.com/ptnplanet/Java-Naive-Bayes-Classifier + implementation "com.github.ptnplanet:Java-Naive-Bayes-Classifier:$bayes_version" } diff --git a/app/src/main/assets/ATTRIBUTION.md b/app/src/main/assets/ATTRIBUTION.md index f60a3a7c0f..4d23e236c5 100644 --- a/app/src/main/assets/ATTRIBUTION.md +++ b/app/src/main/assets/ATTRIBUTION.md @@ -29,3 +29,4 @@ FairEmail uses: * [Over-Scroll Support For Android's RecyclerView, ListView, GridView, ScrollView ...](https://github.com/EverythingMe/overscroll-decor). Copyright (c) 2015, DoAT Media Ltd. [BSD-2-Clause License](https://github.com/EverythingMe/overscroll-decor/blob/master/LICENSE) * [Compact Encoding Detection](https://github.com/google/compact_enc_det). Copyright 2016 Google Inc. [Apache License 2.0](https://github.com/google/compact_enc_det/blob/master/LICENSE). * [POI-HMEF](https://poi.apache.org/components/hmef/index.html). Copyright © 2001-2020 The Apache Software Foundation. [Apache Software License v2](https://poi.apache.org/devel/guidelines.html#The+Licensing). +* [Java Naive Bayes Classifier](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier). Copyright (c) 2012-2017 Philipp Nolte. [MIT License](https://github.com/ptnplanet/Java-Naive-Bayes-Classifier#the-mit-license-mit). diff --git a/app/src/main/java/eu/faircode/email/WorkerFts.java b/app/src/main/java/eu/faircode/email/WorkerFts.java index e4f8081490..d09efdfc14 100644 --- a/app/src/main/java/eu/faircode/email/WorkerFts.java +++ b/app/src/main/java/eu/faircode/email/WorkerFts.java @@ -22,6 +22,7 @@ package eu.faircode.email; import android.content.Context; import android.content.SharedPreferences; import android.database.Cursor; +import android.text.TextUtils; import androidx.annotation.NonNull; import androidx.preference.PreferenceManager; @@ -34,17 +35,22 @@ import androidx.work.WorkerParameters; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; import java.util.concurrent.TimeUnit; +import de.daslaboratorium.machinelearning.classifier.Classification; +import de.daslaboratorium.machinelearning.classifier.bayes.BayesClassifier; import io.requery.android.database.sqlite.SQLiteDatabase; import static android.os.Process.THREAD_PRIORITY_BACKGROUND; public class WorkerFts extends Worker { - private static final int INDEX_DELAY = 30; // seconds + private static final int INDEX_DELAY = BuildConfig.DEBUG ? 3 : 30; // seconds private static final int INDEX_BATCH_SIZE = 100; + private static BayesClassifier classifier = new BayesClassifier<>(); + public WorkerFts(@NonNull Context context, @NonNull WorkerParameters workerParams) { super(context, workerParams); Log.i("Instance " + getName()); @@ -78,6 +84,30 @@ public class WorkerFts extends Worker { File file = message.getFile(getApplicationContext()); String text = HtmlHelper.getFullText(file); + if (BuildConfig.DEBUG) { + EntityFolder folder = db.folder().getFolder(message.folder); + if (folder != null) { + // \\P{L}+ + List features = new ArrayList<>(); + for (String word : text.trim().toLowerCase().split("\\W+")) { + if (word.matches(".*\\d.*")) + continue; + if (word.endsWith(".")) + word = word.substring(0, word.length() - 1); + features.add(word); + } + + Collection> classifications = classifier.classifyDetailed(features); + for (Classification classification : classifications) + Log.i("MMM folder=" + folder.name + + " classified=" + classification.getCategory() + + " probability=" + classification.getProbability() + + " features=" + TextUtils.join(", ", features.subList(0, Math.min(features.size(), 20)))); + + classifier.learn(EntityFolder.JUNK.equals(folder.type) ? "spam" : "ham", features); + } + } + try { sdb.beginTransaction(); FtsDbHelper.insert(sdb, message, text);