FairEmail/app/src/main/java/eu/faircode/email/MessageClassifier.java

333 lines
12 KiB
Java
Raw Normal View History

2021-01-02 13:33:53 +00:00
package eu.faircode.email;
/*
This file is part of FairEmail.
FairEmail is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
FairEmail is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with FairEmail. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018-2021 by Marcel Bokhorst (M66B)
*/
import android.content.Context;
2021-01-02 14:57:24 +00:00
import android.content.SharedPreferences;
2021-01-02 13:33:53 +00:00
import android.text.TextUtils;
2021-01-02 14:57:24 +00:00
import androidx.preference.PreferenceManager;
2021-01-02 13:33:53 +00:00
import org.jetbrains.annotations.NotNull;
2021-01-02 19:42:35 +00:00
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
2021-01-02 13:33:53 +00:00
import java.io.File;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class MessageClassifier {
2021-01-02 19:42:35 +00:00
private static boolean loaded = false;
2021-01-02 20:39:09 +00:00
private static Map<Long, Map<String, Integer>> classMessages = new HashMap<>();
private static Map<Long, Map<String, Map<String, Integer>>> wordClassFrequency = new HashMap<>();
2021-01-02 13:33:53 +00:00
private static final double COMMON_WORD_FACTOR = 0.75;
private static final double CHANCE_THRESHOLD = 2.0;
2021-01-02 21:38:37 +00:00
static void classify(EntityMessage message, boolean added, Context context) {
2021-01-02 19:42:35 +00:00
if (!isEnabled(context))
2021-01-02 21:38:37 +00:00
return;
2021-01-02 13:33:53 +00:00
2021-01-02 19:42:35 +00:00
try {
load(context);
} catch (Throwable ex) {
Log.e(ex);
}
2021-01-02 14:57:24 +00:00
DB db = DB.getInstance(context);
2021-01-02 13:33:53 +00:00
EntityFolder folder = db.folder().getFolder(message.folder);
if (folder == null)
2021-01-02 21:38:37 +00:00
return;
2021-01-02 13:33:53 +00:00
EntityAccount account = db.account().getAccount(folder.account);
if (account == null)
2021-01-02 21:38:37 +00:00
return;
2021-01-02 13:33:53 +00:00
if (!EntityFolder.INBOX.equals(folder.type) &&
!EntityFolder.JUNK.equals(folder.type) &&
!EntityFolder.USER.equals(folder.type) &&
!(EntityFolder.ARCHIVE.equals(folder.type) && !account.isGmail()))
2021-01-02 21:38:37 +00:00
return;
2021-01-02 13:33:53 +00:00
File file = message.getFile(context);
2021-01-02 19:50:43 +00:00
if (!file.exists())
2021-01-02 21:38:37 +00:00
return;
2021-01-02 19:50:43 +00:00
2021-01-02 13:33:53 +00:00
String text;
try {
text = HtmlHelper.getFullText(file);
} catch (IOException ex) {
Log.w(ex);
text = null;
}
if (TextUtils.isEmpty(text))
2021-01-02 21:38:37 +00:00
return;
2021-01-02 13:33:53 +00:00
2021-01-02 20:39:09 +00:00
if (!classMessages.containsKey(account.id))
classMessages.put(account.id, new HashMap<>());
if (!wordClassFrequency.containsKey(account.id))
wordClassFrequency.put(account.id, new HashMap<>());
2021-01-02 13:33:53 +00:00
2021-01-02 20:39:09 +00:00
String classified = classify(account.id, folder.name, text, added);
Integer m = classMessages.get(account.id).get(folder.name);
2021-01-02 13:33:53 +00:00
if (added) {
m = (m == null ? 1 : m + 1);
2021-01-02 20:39:09 +00:00
classMessages.get(account.id).put(folder.name, m);
2021-01-02 13:33:53 +00:00
} else {
2021-01-02 14:57:24 +00:00
if (m != null && m > 0)
2021-01-02 20:39:09 +00:00
classMessages.get(account.id).put(folder.name, m - 1);
2021-01-02 13:33:53 +00:00
}
2021-01-02 20:39:09 +00:00
Log.i("Classifier classify=" + folder.name + " messages=" + classMessages.get(account.id).get(folder.name));
2021-01-02 13:33:53 +00:00
2021-01-02 21:38:37 +00:00
if (classified != null) {
EntityFolder f = db.folder().getFolderByName(account.id, classified);
if (f != null && f.auto_classify && !f.id.equals(folder.id))
EntityOperation.queue(context, message, EntityOperation.MOVE, f.id);
}
2021-01-02 13:33:53 +00:00
}
2021-01-02 20:39:09 +00:00
private static String classify(long account, String classify, String text, boolean added) {
2021-01-02 13:33:53 +00:00
int maxFrequency = 0;
int maxMatchedWords = 0;
List<String> words = new ArrayList<>();
Map<String, Stat> classStats = new HashMap<>();
BreakIterator boundary = BreakIterator.getWordInstance(); // TODO ICU
boundary.setText(text);
int start = boundary.first();
for (int end = boundary.next(); end != BreakIterator.DONE; end = boundary.next()) {
String word = text.substring(start, end).toLowerCase();
if (word.length() > 1 &&
!words.contains(word) &&
!word.matches(".*\\d.*")) {
words.add(word);
2021-01-02 20:39:09 +00:00
Map<String, Integer> classFrequency = wordClassFrequency.get(account).get(word);
2021-01-02 17:18:27 +00:00
if (added) {
if (classFrequency == null) {
classFrequency = new HashMap<>();
2021-01-02 20:39:09 +00:00
wordClassFrequency.get(account).put(word, classFrequency);
2021-01-02 17:18:27 +00:00
}
2021-01-02 13:33:53 +00:00
2021-01-02 17:18:27 +00:00
// Filter classes of common occurring words
List<String> applyClasses = new ArrayList<>(classFrequency.keySet());
for (String class1 : classFrequency.keySet())
for (String class2 : classFrequency.keySet())
if (!class1.equals(class2)) {
2021-01-02 20:39:09 +00:00
double percentage1 = (double) classFrequency.get(class1) / classMessages.get(account).get(class1);
double percentage2 = (double) classFrequency.get(class2) / classMessages.get(account).get(class2);
2021-01-02 17:18:27 +00:00
double factor = percentage1 / percentage2;
if (factor > 1)
factor = 1 / factor;
if (factor > COMMON_WORD_FACTOR) {
Log.i("Classifier skip class=" + class1 + " word=" + word);
applyClasses.remove(class1);
break;
}
2021-01-02 13:33:53 +00:00
}
2021-01-02 17:18:27 +00:00
for (String clazz : applyClasses) {
int frequency = classFrequency.get(clazz);
if (frequency > maxFrequency)
maxFrequency = frequency;
Stat stat = classStats.get(clazz);
if (stat == null) {
stat = new Stat();
classStats.put(clazz, stat);
2021-01-02 13:33:53 +00:00
}
2021-01-02 17:18:27 +00:00
stat.matchedWords++;
stat.totalFrequency += frequency;
2021-01-02 13:33:53 +00:00
2021-01-02 17:18:27 +00:00
if (stat.matchedWords > maxMatchedWords)
maxMatchedWords = stat.matchedWords;
2021-01-02 13:33:53 +00:00
}
2021-01-02 17:18:27 +00:00
Integer c = classFrequency.get(classify);
c = (c == null ? 1 : c + 1);
classFrequency.put(classify, c);
} else {
Integer c = (classFrequency == null ? null : classFrequency.get(classify));
if (c != null)
if (c > 0)
classFrequency.put(classify, c - 1);
else
classFrequency.remove(classify);
2021-01-02 13:33:53 +00:00
}
}
start = end;
}
if (!added)
return null;
List<Chance> chances = new ArrayList<>();
for (String clazz : classStats.keySet()) {
Stat stat = classStats.get(clazz);
double chance = ((double) stat.totalFrequency / maxFrequency / maxMatchedWords);
Chance c = new Chance(clazz, chance);
Log.i("Classifier " + c +
" frequency=" + stat.totalFrequency + "/" + maxFrequency +
" matched=" + stat.matchedWords + "/" + maxMatchedWords);
chances.add(c);
}
if (chances.size() <= 1)
return null;
Collections.sort(chances, new Comparator<Chance>() {
@Override
public int compare(Chance c1, Chance c2) {
return -c1.chance.compareTo(c2.chance);
}
});
String classification = null;
if (chances.get(0).chance / chances.get(1).chance >= CHANCE_THRESHOLD)
classification = chances.get(0).clazz;
Log.i("Classifier classify=" + classify + " classified=" + classification);
return classification;
}
2021-01-02 19:42:35 +00:00
static synchronized void save(Context context) throws JSONException, IOException {
if (!isEnabled(context))
return;
JSONArray jmessages = new JSONArray();
2021-01-02 20:39:09 +00:00
for (Long account : classMessages.keySet())
for (String clazz : classMessages.get(account).keySet()) {
JSONObject jmessage = new JSONObject();
jmessage.put("account", account);
jmessage.put("class", clazz);
jmessage.put("count", classMessages.get(account).get(clazz));
jmessages.put(jmessage);
}
2021-01-02 19:42:35 +00:00
JSONArray jwords = new JSONArray();
2021-01-02 20:39:09 +00:00
for (Long account : classMessages.keySet())
for (String word : wordClassFrequency.get(account).keySet())
for (String clazz : wordClassFrequency.get(account).get(word).keySet()) {
JSONObject jword = new JSONObject();
jword.put("account", account);
jword.put("word", word);
jword.put("class", clazz);
jword.put("frequency", wordClassFrequency.get(account).get(word).get(clazz));
jwords.put(jword);
}
2021-01-02 19:42:35 +00:00
JSONObject jroot = new JSONObject();
jroot.put("messages", jmessages);
jroot.put("words", jwords);
File file = getFile(context);
Helper.writeText(file, jroot.toString(2));
2021-01-02 20:39:09 +00:00
Log.i("Classifier saved");
2021-01-02 19:42:35 +00:00
}
private static synchronized void load(Context context) throws IOException, JSONException {
if (loaded)
return;
if (!isEnabled(context))
return;
classMessages.clear();
wordClassFrequency.clear();
File file = getFile(context);
if (file.exists()) {
String json = Helper.readText(file);
JSONObject jroot = new JSONObject(json);
JSONArray jmessages = jroot.getJSONArray("messages");
for (int m = 0; m < jmessages.length(); m++) {
JSONObject jmessage = (JSONObject) jmessages.get(m);
2021-01-02 20:39:09 +00:00
long account = jmessage.getLong("account");
if (!classMessages.containsKey(account))
classMessages.put(account, new HashMap<>());
classMessages.get(account).put(jmessage.getString("class"), jmessage.getInt("count"));
2021-01-02 19:42:35 +00:00
}
JSONArray jwords = jroot.getJSONArray("words");
for (int w = 0; w < jwords.length(); w++) {
JSONObject jword = (JSONObject) jwords.get(w);
2021-01-02 20:39:09 +00:00
long account = jword.getLong("account");
if (!wordClassFrequency.containsKey("account"))
wordClassFrequency.put(account, new HashMap<>());
2021-01-02 19:42:35 +00:00
String word = jword.getString("word");
2021-01-02 20:39:09 +00:00
Map<String, Integer> classFrequency = wordClassFrequency.get(account).get(word);
2021-01-02 19:42:35 +00:00
if (classFrequency == null) {
classFrequency = new HashMap<>();
2021-01-02 20:39:09 +00:00
wordClassFrequency.get(account).put(word, classFrequency);
2021-01-02 19:42:35 +00:00
}
classFrequency.put(jword.getString("class"), jword.getInt("frequency"));
}
}
loaded = true;
2021-01-02 20:39:09 +00:00
Log.i("Classifier loaded");
2021-01-02 19:42:35 +00:00
}
2021-01-02 21:38:37 +00:00
static boolean isEnabled(Context context) {
2021-01-02 19:42:35 +00:00
SharedPreferences prefs = PreferenceManager.getDefaultSharedPreferences(context);
return prefs.getBoolean("classify", BuildConfig.DEBUG);
}
/**
 * Returns the file in which the classifier statistics are stored.
 *
 * @param context context providing the files directory
 * @return "classifier.json" in the files directory of the context
 */
private static File getFile(Context context) {
    File dir = context.getFilesDir();
    return new File(dir, "classifier.json");
}
2021-01-02 13:33:53 +00:00
/**
 * Accumulated statistics of one class for the text being classified.
 */
private static class Stat {
    // Number of words of the text that were counted for the class
    int matchedWords;
    // Sum of the frequencies of those words in the class
    int totalFrequency;
}
/**
 * A candidate class together with its computed chance.
 */
private static class Chance {
    final String clazz;
    final Double chance;

    Chance(String clazz, Double chance) {
        this.clazz = clazz;
        this.chance = chance;
    }

    /**
     * @return the class name and the chance, separated by an equals sign
     */
    @NotNull
    @Override
    public String toString() {
        return new StringBuilder()
                .append(clazz)
                .append("=")
                .append(chance)
                .toString();
    }
}
}