mirror of https://github.com/M66B/FairEmail.git
Added experimental message classifier
This commit is contained in:
parent
14ce0a02aa
commit
b063fb6503
|
@ -0,0 +1,214 @@
|
|||
package eu.faircode.email;
|
||||
|
||||
/*
|
||||
This file is part of FairEmail.
|
||||
|
||||
FairEmail is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
FairEmail is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with FairEmail. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Copyright 2018-2021 by Marcel Bokhorst (M66B)
|
||||
*/
|
||||
|
||||
import android.content.Context;
import android.text.TextUtils;

import org.jetbrains.annotations.NotNull;

import java.io.File;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

|
||||
|
||||
public class MessageClassifier {
|
||||
private static Map<String, Integer> classMessages = new HashMap<>();
|
||||
private static Map<String, Map<String, Integer>> wordClassFrequency = new HashMap<>();
|
||||
|
||||
private static final double COMMON_WORD_FACTOR = 0.75;
|
||||
private static final double CHANCE_THRESHOLD = 2.0;
|
||||
|
||||
static String classify(EntityMessage message, boolean added, Context context) {
|
||||
DB db = DB.getInstance(context);
|
||||
|
||||
if (!message.content)
|
||||
throw new IllegalArgumentException("Message without content");
|
||||
|
||||
EntityFolder folder = db.folder().getFolder(message.folder);
|
||||
if (folder == null)
|
||||
return null;
|
||||
|
||||
EntityAccount account = db.account().getAccount(folder.account);
|
||||
if (account == null)
|
||||
return null;
|
||||
|
||||
if (!EntityFolder.INBOX.equals(folder.type) &&
|
||||
!EntityFolder.JUNK.equals(folder.type) &&
|
||||
!EntityFolder.USER.equals(folder.type) &&
|
||||
!(EntityFolder.ARCHIVE.equals(folder.type) && !account.isGmail()))
|
||||
return null;
|
||||
|
||||
File file = message.getFile(context);
|
||||
String text;
|
||||
try {
|
||||
text = HtmlHelper.getFullText(file);
|
||||
} catch (IOException ex) {
|
||||
Log.w(ex);
|
||||
text = null;
|
||||
}
|
||||
|
||||
if (TextUtils.isEmpty(text))
|
||||
return null;
|
||||
|
||||
String classified = classify(folder.name, text, added);
|
||||
|
||||
Integer m = classMessages.get(folder.name);
|
||||
if (added) {
|
||||
m = (m == null ? 1 : m + 1);
|
||||
classMessages.put(folder.name, m);
|
||||
} else {
|
||||
if (m != null)
|
||||
classMessages.put(folder.name, m - 1);
|
||||
}
|
||||
|
||||
return classified;
|
||||
}
|
||||
|
||||
static String classify(String classify, String text, boolean added) {
|
||||
int maxFrequency = 0;
|
||||
int maxMatchedWords = 0;
|
||||
List<String> words = new ArrayList<>();
|
||||
Map<String, Stat> classStats = new HashMap<>();
|
||||
|
||||
BreakIterator boundary = BreakIterator.getWordInstance(); // TODO ICU
|
||||
boundary.setText(text);
|
||||
int start = boundary.first();
|
||||
for (int end = boundary.next(); end != BreakIterator.DONE; end = boundary.next()) {
|
||||
String word = text.substring(start, end).toLowerCase();
|
||||
if (word.length() > 1 &&
|
||||
!words.contains(word) &&
|
||||
!word.matches(".*\\d.*")) {
|
||||
words.add(word);
|
||||
|
||||
Map<String, Integer> classFrequency = wordClassFrequency.get(word);
|
||||
if (!added) {
|
||||
Integer c = (classFrequency == null ? null : classFrequency.get(classify));
|
||||
if (c != null)
|
||||
classFrequency.put(classify, c - 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (classFrequency == null) {
|
||||
classFrequency = new HashMap<>();
|
||||
wordClassFrequency.put(word, classFrequency);
|
||||
}
|
||||
|
||||
// Filter classes of common occurring words
|
||||
List<String> applyClasses = new ArrayList<>(classFrequency.keySet());
|
||||
for (String class1 : classFrequency.keySet())
|
||||
for (String class2 : classFrequency.keySet())
|
||||
if (!class1.equals(class2)) {
|
||||
double percentage1 = (double) classFrequency.get(class1) / classMessages.get(class1);
|
||||
double percentage2 = (double) classFrequency.get(class2) / classMessages.get(class2);
|
||||
double factor = percentage1 / percentage2;
|
||||
if (factor > 1)
|
||||
factor = 1 / factor;
|
||||
if (factor > COMMON_WORD_FACTOR) {
|
||||
Log.i("Classifier skip class=" + class1 + " word=" + word);
|
||||
applyClasses.remove(class1);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (String clazz : applyClasses) {
|
||||
int frequency = classFrequency.get(clazz);
|
||||
if (frequency > maxFrequency)
|
||||
maxFrequency = frequency;
|
||||
|
||||
Stat stat = classStats.get(clazz);
|
||||
if (stat == null) {
|
||||
stat = new Stat();
|
||||
classStats.put(clazz, stat);
|
||||
}
|
||||
|
||||
stat.matchedWords++;
|
||||
stat.totalFrequency += frequency;
|
||||
|
||||
if (stat.matchedWords > maxMatchedWords)
|
||||
maxMatchedWords = stat.matchedWords;
|
||||
}
|
||||
|
||||
Integer c = classFrequency.get(classify);
|
||||
c = (c == null ? 1 : c + 1);
|
||||
classFrequency.put(classify, c);
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
|
||||
if (!added)
|
||||
return null;
|
||||
|
||||
List<Chance> chances = new ArrayList<>();
|
||||
for (String clazz : classStats.keySet()) {
|
||||
Stat stat = classStats.get(clazz);
|
||||
double chance = ((double) stat.totalFrequency / maxFrequency / maxMatchedWords);
|
||||
Chance c = new Chance(clazz, chance);
|
||||
Log.i("Classifier " + c +
|
||||
" frequency=" + stat.totalFrequency + "/" + maxFrequency +
|
||||
" matched=" + stat.matchedWords + "/" + maxMatchedWords);
|
||||
chances.add(c);
|
||||
}
|
||||
|
||||
if (chances.size() <= 1)
|
||||
return null;
|
||||
|
||||
Collections.sort(chances, new Comparator<Chance>() {
|
||||
@Override
|
||||
public int compare(Chance c1, Chance c2) {
|
||||
return -c1.chance.compareTo(c2.chance);
|
||||
}
|
||||
});
|
||||
|
||||
String classification = null;
|
||||
if (chances.get(0).chance / chances.get(1).chance >= CHANCE_THRESHOLD)
|
||||
classification = chances.get(0).clazz;
|
||||
|
||||
Log.i("Classifier classify=" + classify + " classified=" + classification);
|
||||
|
||||
return classification;
|
||||
}
|
||||
|
||||
private static class Stat {
|
||||
int matchedWords = 0;
|
||||
int totalFrequency = 0;
|
||||
}
|
||||
|
||||
private static class Chance {
|
||||
String clazz;
|
||||
Double chance;
|
||||
|
||||
Chance(String clazz, Double chance) {
|
||||
this.clazz = clazz;
|
||||
this.chance = chance;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
public String toString() {
|
||||
return clazz + "=" + chance;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -42,7 +42,7 @@ import io.requery.android.database.sqlite.SQLiteDatabase;
|
|||
import static android.os.Process.THREAD_PRIORITY_BACKGROUND;
|
||||
|
||||
public class WorkerFts extends Worker {
|
||||
private static final int INDEX_DELAY = 30; // seconds
|
||||
private static final int INDEX_DELAY = BuildConfig.DEBUG ? 3 : 30; // seconds
|
||||
private static final int INDEX_BATCH_SIZE = 100;
|
||||
|
||||
public WorkerFts(@NonNull Context context, @NonNull WorkerParameters workerParams) {
|
||||
|
@ -78,6 +78,9 @@ public class WorkerFts extends Worker {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (BuildConfig.DEBUG)
|
||||
MessageClassifier.classify(message, true, context);
|
||||
|
||||
File file = message.getFile(context);
|
||||
String text = HtmlHelper.getFullText(file);
|
||||
if (TextUtils.isEmpty(text)) {
|
||||
|
|
Loading…
Reference in New Issue