2021-01-02 13:33:53 +00:00
|
|
|
package eu.faircode.email;
|
|
|
|
|
|
|
|
/*
|
|
|
|
This file is part of FairEmail.
|
|
|
|
|
|
|
|
FairEmail is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
FairEmail is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with FairEmail. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
2022-01-01 08:46:36 +00:00
|
|
|
Copyright 2018-2022 by Marcel Bokhorst (M66B)
|
2021-01-02 13:33:53 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
import android.content.Context;
|
2021-01-02 14:57:24 +00:00
|
|
|
import android.content.SharedPreferences;
|
2021-01-06 20:43:26 +00:00
|
|
|
import android.os.Build;
|
2021-01-04 14:28:45 +00:00
|
|
|
import android.text.TextUtils;
|
2021-02-10 10:59:52 +00:00
|
|
|
import android.util.JsonReader;
|
|
|
|
import android.util.JsonWriter;
|
2021-05-08 09:23:22 +00:00
|
|
|
import android.util.MalformedJsonException;
|
2021-01-02 13:33:53 +00:00
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
import androidx.annotation.NonNull;
|
2021-01-02 14:57:24 +00:00
|
|
|
import androidx.preference.PreferenceManager;
|
|
|
|
|
2021-02-10 11:46:29 +00:00
|
|
|
import java.io.BufferedReader;
|
|
|
|
import java.io.BufferedWriter;
|
2021-01-02 13:33:53 +00:00
|
|
|
import java.io.File;
|
2021-02-10 10:59:52 +00:00
|
|
|
import java.io.FileReader;
|
|
|
|
import java.io.FileWriter;
|
2021-01-02 13:33:53 +00:00
|
|
|
import java.io.IOException;
|
|
|
|
import java.util.ArrayList;
|
2021-01-04 14:28:45 +00:00
|
|
|
import java.util.Arrays;
|
2021-01-02 13:33:53 +00:00
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.Comparator;
|
2021-01-03 07:41:29 +00:00
|
|
|
import java.util.Date;
|
2021-01-02 13:33:53 +00:00
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
|
2021-01-04 14:28:45 +00:00
|
|
|
import javax.mail.Address;
|
|
|
|
import javax.mail.internet.InternetAddress;
|
|
|
|
|
2021-01-02 13:33:53 +00:00
|
|
|
public class MessageClassifier {
|
2021-01-02 19:42:35 +00:00
|
|
|
// Set to true by _load() once the persisted classifier data has been read.
private static boolean loaded = false;
|
2021-01-02 21:52:41 +00:00
|
|
|
// True when the in-memory data differs from the file on disk; set by classify(), cleanup() and clear(), reset by save() and _load().
private static boolean dirty = false;
|
2021-01-10 16:02:27 +00:00
|
|
|
// account id -> message ids (msgid) of messages that were auto-moved by the classifier.
private static final Map<Long, List<String>> accountMsgIds = new HashMap<>();
|
2021-01-07 18:20:17 +00:00
|
|
|
// account id -> class (folder name) -> number of messages learned for that class.
private static final Map<Long, Map<String, Integer>> classMessages = new HashMap<>();
|
2021-01-06 20:35:11 +00:00
|
|
|
// account id -> word -> class (folder name) -> frequency statistics of that word in that class.
private static final Map<Long, Map<String, Map<String, Frequency>>> wordClassFrequency = new HashMap<>();
|
2021-01-02 13:33:53 +00:00
|
|
|
|
2021-01-11 13:32:16 +00:00
|
|
|
// Once this many entries were collected for a message, word processing of the current text stops.
private static final int MAX_WORDS = 1000;
|
|
|
|
|
2021-06-26 14:14:23 +00:00
|
|
|
static synchronized void classify(EntityMessage message, EntityFolder folder, EntityFolder target, Context context) {
|
2021-01-02 19:42:35 +00:00
|
|
|
try {
|
2021-01-03 11:23:51 +00:00
|
|
|
if (!isEnabled(context))
|
|
|
|
return;
|
2021-01-03 07:27:27 +00:00
|
|
|
|
2021-01-13 09:22:52 +00:00
|
|
|
if (!folder.auto_classify_source)
|
2021-01-03 11:23:51 +00:00
|
|
|
return;
|
2021-01-02 19:50:43 +00:00
|
|
|
|
2021-02-01 14:59:08 +00:00
|
|
|
if (target != null && !target.auto_classify_source)
|
|
|
|
return;
|
|
|
|
|
2021-01-06 07:31:34 +00:00
|
|
|
long start = new Date().getTime();
|
|
|
|
|
2021-01-05 18:51:22 +00:00
|
|
|
// Build text to classify
|
2021-01-10 13:31:29 +00:00
|
|
|
List<String> texts = getTexts(message, context);
|
|
|
|
if (texts.size() == 0)
|
2021-01-03 11:23:51 +00:00
|
|
|
return;
|
2021-01-02 13:33:53 +00:00
|
|
|
|
2021-01-05 18:51:22 +00:00
|
|
|
// Load data if needed
|
2021-01-03 11:23:51 +00:00
|
|
|
load(context);
|
2021-01-02 21:52:41 +00:00
|
|
|
|
2021-01-10 16:02:27 +00:00
|
|
|
// Initialize account if needed
|
|
|
|
if (!accountMsgIds.containsKey(folder.account))
|
|
|
|
accountMsgIds.put(folder.account, new ArrayList<>());
|
|
|
|
if (!classMessages.containsKey(folder.account))
|
|
|
|
classMessages.put(folder.account, new HashMap<>());
|
|
|
|
if (!wordClassFrequency.containsKey(folder.account))
|
|
|
|
wordClassFrequency.put(folder.account, new HashMap<>());
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
// Classify texts
|
2021-08-16 11:07:41 +00:00
|
|
|
String classified = classify(message, folder.name, texts, target == null, context);
|
2021-01-03 11:23:51 +00:00
|
|
|
|
2021-01-06 07:31:34 +00:00
|
|
|
long elapsed = new Date().getTime() - start;
|
2021-08-16 11:07:41 +00:00
|
|
|
EntityLog.log(context, EntityLog.Type.Classification, message,
|
2021-08-16 08:15:26 +00:00
|
|
|
"Classifier" +
|
|
|
|
" folder=" + folder.name +
|
|
|
|
" message=" + message.id +
|
|
|
|
"@" + new Date(message.received) +
|
|
|
|
":" + message.subject +
|
|
|
|
" class=" + classified +
|
|
|
|
" re=" + message.auto_classified +
|
|
|
|
" elapsed=" + elapsed);
|
2021-01-03 11:23:51 +00:00
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
// Auto classify message
|
2021-01-05 18:51:22 +00:00
|
|
|
if (classified != null &&
|
|
|
|
!classified.equals(folder.name) &&
|
2021-01-10 16:02:27 +00:00
|
|
|
!TextUtils.isEmpty(message.msgid) &&
|
2021-07-29 06:55:34 +00:00
|
|
|
!message.hasKeyword(MessageHelper.FLAG_CLASSIFIED) &&
|
2021-07-29 07:11:51 +00:00
|
|
|
!message.hasKeyword(MessageHelper.FLAG_FILTERED) &&
|
2021-01-10 16:02:27 +00:00
|
|
|
!accountMsgIds.get(folder.account).contains(message.msgid) &&
|
2021-01-05 18:51:22 +00:00
|
|
|
!EntityFolder.JUNK.equals(folder.type)) {
|
2021-01-10 17:43:23 +00:00
|
|
|
boolean pro = ActivityBilling.isPro(context);
|
|
|
|
|
2021-01-05 18:51:22 +00:00
|
|
|
DB db = DB.getInstance(context);
|
2021-01-04 11:21:51 +00:00
|
|
|
try {
|
|
|
|
db.beginTransaction();
|
|
|
|
|
2021-01-05 18:51:22 +00:00
|
|
|
EntityFolder dest = db.folder().getFolderByName(folder.account, classified);
|
2021-01-13 09:22:52 +00:00
|
|
|
if (dest != null && dest.auto_classify_target &&
|
2021-01-10 17:43:23 +00:00
|
|
|
(pro || EntityFolder.JUNK.equals(dest.type))) {
|
2021-07-29 06:55:34 +00:00
|
|
|
EntityOperation.queue(context, message, EntityOperation.KEYWORD, MessageHelper.FLAG_CLASSIFIED, true);
|
2021-01-05 18:51:22 +00:00
|
|
|
EntityOperation.queue(context, message, EntityOperation.MOVE, dest.id, false, true);
|
2021-01-04 08:27:27 +00:00
|
|
|
message.ui_hide = true;
|
|
|
|
}
|
2021-01-04 11:21:51 +00:00
|
|
|
|
|
|
|
db.setTransactionSuccessful();
|
|
|
|
} finally {
|
|
|
|
db.endTransaction();
|
|
|
|
}
|
2021-01-10 17:47:29 +00:00
|
|
|
|
|
|
|
if (message.ui_hide)
|
|
|
|
accountMsgIds.get(folder.account).add(message.msgid);
|
2021-01-05 18:51:22 +00:00
|
|
|
}
|
2021-01-10 16:02:27 +00:00
|
|
|
|
|
|
|
dirty = true;
|
2021-01-03 11:23:51 +00:00
|
|
|
} catch (Throwable ex) {
|
|
|
|
Log.e(ex);
|
2021-01-02 21:38:37 +00:00
|
|
|
}
|
2021-01-02 13:33:53 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
@NonNull
|
|
|
|
private static List<String> getTexts(@NonNull EntityMessage message, @NonNull Context context) throws IOException {
|
|
|
|
List<String> texts = new ArrayList<>();
|
|
|
|
|
|
|
|
File file = message.getFile(context);
|
|
|
|
if (!file.exists())
|
|
|
|
return texts;
|
|
|
|
|
|
|
|
List<Address> addresses = new ArrayList<>();
|
|
|
|
if (message.from != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.from));
|
|
|
|
if (message.to != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.to));
|
|
|
|
if (message.cc != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.cc));
|
|
|
|
if (message.bcc != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.bcc));
|
|
|
|
if (message.reply != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.reply));
|
2022-01-30 17:06:49 +00:00
|
|
|
if (message.return_path != null)
|
|
|
|
addresses.addAll(Arrays.asList(message.return_path));
|
2021-01-10 13:31:29 +00:00
|
|
|
|
|
|
|
for (Address address : addresses) {
|
|
|
|
String email = ((InternetAddress) address).getAddress();
|
|
|
|
String name = ((InternetAddress) address).getPersonal();
|
|
|
|
if (!TextUtils.isEmpty(email))
|
|
|
|
texts.add(email);
|
|
|
|
if (!TextUtils.isEmpty(name))
|
|
|
|
texts.add(name);
|
2021-01-07 09:58:56 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
if (message.subject != null)
|
|
|
|
texts.add(message.subject);
|
|
|
|
|
|
|
|
String text = HtmlHelper.getFullText(file);
|
|
|
|
texts.add(text);
|
|
|
|
|
|
|
|
return texts;
|
|
|
|
}
|
|
|
|
|
2021-08-16 11:07:41 +00:00
|
|
|
private static String classify(EntityMessage message, @NonNull String currentClass, @NonNull List<String> texts, boolean added, @NonNull Context context) {
|
2021-01-06 20:35:11 +00:00
|
|
|
State state = new State();
|
2021-01-07 14:20:39 +00:00
|
|
|
|
2021-08-12 17:06:15 +00:00
|
|
|
// Check classes
|
2021-09-16 18:52:30 +00:00
|
|
|
DB db = DB.getInstance(context);
|
2021-08-16 11:07:41 +00:00
|
|
|
for (String clazz : new ArrayList<>(classMessages.get(message.account).keySet())) {
|
|
|
|
EntityFolder folder = db.folder().getFolderByName(message.account, clazz);
|
2021-08-12 17:06:15 +00:00
|
|
|
if (folder == null) {
|
2021-08-16 11:07:41 +00:00
|
|
|
EntityLog.log(context, EntityLog.Type.Classification, message,
|
2021-09-16 18:52:30 +00:00
|
|
|
"Classifier deleting folder class=" + message.account + ":" + clazz);
|
|
|
|
classMessages.get(message.account).remove(clazz);
|
|
|
|
for (String word : wordClassFrequency.get(message.account).keySet())
|
|
|
|
wordClassFrequency.get(message.account).get(word).remove(clazz);
|
2021-08-12 17:06:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
Log.i("Classifier texts=" + texts.size());
|
|
|
|
for (String text : texts) {
|
|
|
|
// First word
|
2021-08-16 11:07:41 +00:00
|
|
|
processWord(message.account, added, null, state);
|
2021-01-10 13:31:29 +00:00
|
|
|
|
|
|
|
// Process words
|
|
|
|
if (Build.VERSION.SDK_INT < Build.VERSION_CODES.N) {
|
|
|
|
java.text.BreakIterator boundary = java.text.BreakIterator.getWordInstance();
|
|
|
|
boundary.setText(text);
|
|
|
|
int start = boundary.first();
|
|
|
|
for (int end = boundary.next(); end != java.text.BreakIterator.DONE; end = boundary.next()) {
|
|
|
|
String word = text.substring(start, end);
|
2021-08-16 11:07:41 +00:00
|
|
|
processWord(message.account, added, word, state);
|
2021-01-11 13:32:16 +00:00
|
|
|
if (state.words.size() >= MAX_WORDS)
|
|
|
|
break;
|
2021-01-10 13:31:29 +00:00
|
|
|
start = end;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// The ICU break iterator works better for Chinese texts
|
|
|
|
android.icu.text.BreakIterator boundary = android.icu.text.BreakIterator.getWordInstance();
|
|
|
|
boundary.setText(text);
|
|
|
|
int start = boundary.first();
|
|
|
|
for (int end = boundary.next(); end != android.icu.text.BreakIterator.DONE; end = boundary.next()) {
|
|
|
|
String word = text.substring(start, end);
|
2021-08-16 11:07:41 +00:00
|
|
|
processWord(message.account, added, word, state);
|
2021-01-11 13:32:16 +00:00
|
|
|
if (state.words.size() >= MAX_WORDS)
|
|
|
|
break;
|
2021-01-10 13:31:29 +00:00
|
|
|
start = end;
|
|
|
|
}
|
2021-01-02 13:33:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
// final word
|
2021-08-16 11:07:41 +00:00
|
|
|
processWord(message.account, added, null, state);
|
2021-01-10 13:31:29 +00:00
|
|
|
|
|
|
|
int maxMessages = 0;
|
2021-08-16 11:07:41 +00:00
|
|
|
for (String clazz : classMessages.get(message.account).keySet()) {
|
|
|
|
int count = classMessages.get(message.account).get(clazz);
|
2021-01-10 13:31:29 +00:00
|
|
|
if (count > maxMessages)
|
|
|
|
maxMessages = count;
|
|
|
|
}
|
|
|
|
|
2021-08-16 11:07:41 +00:00
|
|
|
updateFrequencies(message.account, currentClass, added, state);
|
2021-01-10 14:26:51 +00:00
|
|
|
|
2021-01-07 09:06:33 +00:00
|
|
|
if (maxMessages == 0) {
|
2021-08-16 11:07:41 +00:00
|
|
|
Log.i("Classifier no messages account=" + message.account);
|
2021-01-07 14:20:39 +00:00
|
|
|
return null;
|
2021-01-07 09:06:33 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 14:26:51 +00:00
|
|
|
if (!added)
|
|
|
|
return null;
|
|
|
|
|
2021-01-07 14:20:39 +00:00
|
|
|
// Calculate chance per class
|
2021-01-10 13:31:29 +00:00
|
|
|
int words = state.words.size() - texts.size() - 1;
|
2021-01-07 09:06:33 +00:00
|
|
|
List<Chance> chances = new ArrayList<>();
|
|
|
|
for (String clazz : state.classStats.keySet()) {
|
2021-01-06 20:35:11 +00:00
|
|
|
Stat stat = state.classStats.get(clazz);
|
2021-01-07 09:58:56 +00:00
|
|
|
|
2021-01-08 17:30:00 +00:00
|
|
|
double chance = stat.totalFrequency / maxMessages / words;
|
2021-01-02 13:33:53 +00:00
|
|
|
Chance c = new Chance(clazz, chance);
|
2021-01-07 08:35:58 +00:00
|
|
|
chances.add(c);
|
2021-08-16 11:07:41 +00:00
|
|
|
EntityLog.log(context, EntityLog.Type.Classification, message,
|
2021-08-16 08:15:26 +00:00
|
|
|
"Classifier " + c +
|
|
|
|
" frequency=" + (Math.round(stat.totalFrequency * 100.0) / 100.0) + "/" + maxMessages + " msgs" +
|
|
|
|
" matched=" + stat.matchedWords + "/" + words + " words" +
|
|
|
|
" text=" + TextUtils.join(", ", stat.words));
|
2021-01-02 13:33:53 +00:00
|
|
|
}
|
|
|
|
|
2021-01-04 14:28:45 +00:00
|
|
|
if (BuildConfig.DEBUG)
|
2021-01-10 13:31:29 +00:00
|
|
|
Log.i("Classifier words=" + state.words.size() + " " + TextUtils.join(", ", state.words));
|
2021-01-04 14:28:45 +00:00
|
|
|
|
2021-01-05 11:55:12 +00:00
|
|
|
if (chances.size() <= 1)
|
2021-01-02 13:33:53 +00:00
|
|
|
return null;
|
|
|
|
|
2021-01-07 14:20:39 +00:00
|
|
|
// Sort classes by chance
|
2021-01-02 13:33:53 +00:00
|
|
|
Collections.sort(chances, new Comparator<Chance>() {
|
|
|
|
@Override
|
|
|
|
public int compare(Chance c1, Chance c2) {
|
|
|
|
return -c1.chance.compareTo(c2.chance);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
2021-01-08 13:21:51 +00:00
|
|
|
SharedPreferences prefs = PreferenceManager.getDefaultSharedPreferences(context);
|
2021-01-14 07:06:09 +00:00
|
|
|
double class_min_chance = prefs.getInt("class_min_probability", 15) / 100.0;
|
2021-01-08 13:21:51 +00:00
|
|
|
double class_min_difference = prefs.getInt("class_min_difference", 50) / 100.0;
|
|
|
|
|
2021-01-07 14:20:39 +00:00
|
|
|
// Select best class
|
2021-01-02 13:33:53 +00:00
|
|
|
String classification = null;
|
2021-01-08 13:21:51 +00:00
|
|
|
double c0 = chances.get(0).chance;
|
|
|
|
double c1 = chances.get(1).chance;
|
2021-01-08 13:32:14 +00:00
|
|
|
double threshold = c0 * (1.0 - class_min_difference);
|
|
|
|
if (c0 > class_min_chance && c1 < threshold)
|
2021-01-02 13:33:53 +00:00
|
|
|
classification = chances.get(0).clazz;
|
|
|
|
|
2021-01-08 13:21:51 +00:00
|
|
|
Log.i("Classifier current=" + currentClass +
|
2021-01-08 13:32:14 +00:00
|
|
|
" c0=" + Math.round(c0 * 100 * 100) / 100.0 + ">" + Math.round(class_min_chance * 100) + "%" +
|
|
|
|
" c1=" + Math.round(c1 * 100 * 100) / 100.0 + "<" + Math.round(threshold * 100 * 100) / 100.0 + "%" +
|
|
|
|
" (" + Math.round(class_min_difference * 100) + "%)" +
|
2021-01-08 13:21:51 +00:00
|
|
|
" classified=" + classification);
|
2021-01-02 13:33:53 +00:00
|
|
|
|
|
|
|
return classification;
|
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
private static void processWord(long account, boolean added, String word, State state) {
|
2021-01-07 13:17:38 +00:00
|
|
|
if (word != null) {
|
|
|
|
word = word.trim().toLowerCase();
|
2021-01-10 13:31:29 +00:00
|
|
|
if (word.length() < 2 || word.matches(".*\\d.*"))
|
2021-01-07 13:17:38 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
if (word != null ||
|
|
|
|
state.words.size() == 0 ||
|
|
|
|
state.words.get(state.words.size() - 1) != null)
|
|
|
|
state.words.add(word);
|
|
|
|
|
|
|
|
if (!added)
|
|
|
|
return;
|
2021-01-07 13:17:38 +00:00
|
|
|
|
2021-01-06 20:35:11 +00:00
|
|
|
if (state.words.size() < 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
String before = state.words.get(state.words.size() - 3);
|
|
|
|
String current = state.words.get(state.words.size() - 2);
|
|
|
|
String after = state.words.get(state.words.size() - 1);
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
if (current == null)
|
|
|
|
return;
|
|
|
|
|
2021-01-06 20:35:11 +00:00
|
|
|
Map<String, Frequency> classFrequency = wordClassFrequency.get(account).get(current);
|
2021-01-10 13:31:29 +00:00
|
|
|
if (classFrequency == null)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (String clazz : classFrequency.keySet()) {
|
|
|
|
Frequency frequency = classFrequency.get(clazz);
|
|
|
|
if (frequency.count <= 0)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
Stat stat = state.classStats.get(clazz);
|
|
|
|
if (stat == null) {
|
|
|
|
stat = new Stat();
|
|
|
|
state.classStats.put(clazz, stat);
|
2021-01-06 20:35:11 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
int c = (frequency.count - frequency.duplicates);
|
|
|
|
Integer b = (before == null ? null : frequency.before.get(before));
|
|
|
|
Integer a = (after == null ? null : frequency.after.get(after));
|
|
|
|
double f = (c +
|
|
|
|
(b == null ? 2 * c : 2.0 * b / frequency.count * c) +
|
|
|
|
(a == null ? 2 * c : 2.0 * a / frequency.count * c)) / 5.0;
|
|
|
|
//Log.i("Classifier " +
|
|
|
|
// before + "/" + b + "/" + frequency.before.get(before) + " " +
|
|
|
|
// after + "/" + a + "/" + frequency.after.get(after) + " " +
|
|
|
|
// current + "/" + c + "=" + frequency.count + "-" + frequency.duplicates +
|
|
|
|
// " f=" + f);
|
|
|
|
|
|
|
|
stat.totalFrequency += f;
|
|
|
|
stat.matchedWords++;
|
|
|
|
|
2021-01-10 18:24:33 +00:00
|
|
|
if (BuildConfig.DEBUG && false)
|
2021-01-10 13:31:29 +00:00
|
|
|
stat.words.add(current + "=" + f);
|
|
|
|
}
|
|
|
|
}
|
2021-01-06 20:35:11 +00:00
|
|
|
|
2021-01-10 14:26:51 +00:00
|
|
|
private static void updateFrequencies(long account, @NonNull String currentClass, boolean added, @NonNull State state) {
|
|
|
|
Integer m = classMessages.get(account).get(currentClass);
|
|
|
|
m = (m == null ? 0 : m) + (added ? 1 : -1);
|
|
|
|
if (m <= 0)
|
|
|
|
classMessages.get(account).remove(currentClass);
|
|
|
|
else
|
|
|
|
classMessages.get(account).put(currentClass, m);
|
|
|
|
Log.i("Classifier " + currentClass + "=" + m + " msgs");
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
for (int i = 1; i < state.words.size() - 1; i++) {
|
|
|
|
String before = state.words.get(i - 1);
|
|
|
|
String current = state.words.get(i);
|
|
|
|
String after = state.words.get(i + 1);
|
2021-01-06 20:35:11 +00:00
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
if (current == null)
|
|
|
|
continue;
|
2021-01-06 20:35:11 +00:00
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
Map<String, Frequency> classFrequency = wordClassFrequency.get(account).get(current);
|
|
|
|
if (added) {
|
|
|
|
if (classFrequency == null) {
|
|
|
|
classFrequency = new HashMap<>();
|
|
|
|
wordClassFrequency.get(account).put(current, classFrequency);
|
2021-01-07 08:18:29 +00:00
|
|
|
}
|
2021-01-10 14:26:51 +00:00
|
|
|
Frequency c = classFrequency.get(currentClass);
|
2021-01-10 13:31:29 +00:00
|
|
|
if (c == null) {
|
|
|
|
c = new Frequency();
|
2021-01-10 14:26:51 +00:00
|
|
|
classFrequency.put(currentClass, c);
|
2021-01-10 13:31:29 +00:00
|
|
|
}
|
|
|
|
c.add(before, after, 1, state.words.indexOf(current) < i);
|
|
|
|
} else {
|
2021-01-10 14:26:51 +00:00
|
|
|
Frequency c = (classFrequency == null ? null : classFrequency.get(currentClass));
|
2021-01-10 13:31:29 +00:00
|
|
|
if (c != null)
|
|
|
|
c.add(before, after, -1, state.words.indexOf(current) < i);
|
2021-01-06 20:35:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-10 10:59:52 +00:00
|
|
|
static synchronized void save(@NonNull Context context) throws IOException {
|
2021-01-02 21:52:41 +00:00
|
|
|
if (!dirty)
|
2021-01-02 21:39:30 +00:00
|
|
|
return;
|
2021-01-02 19:42:35 +00:00
|
|
|
|
2021-02-10 10:59:52 +00:00
|
|
|
long start = new Date().getTime();
|
|
|
|
|
2021-01-02 19:42:35 +00:00
|
|
|
File file = getFile(context);
|
2021-05-08 09:23:22 +00:00
|
|
|
if (file.exists())
|
|
|
|
try {
|
|
|
|
File backup = getBackupFile(context);
|
|
|
|
Log.i("Classifier backup " + backup);
|
|
|
|
backup.delete();
|
|
|
|
file.renameTo(backup);
|
|
|
|
} catch (Throwable ex) {
|
|
|
|
Log.w(ex);
|
|
|
|
}
|
|
|
|
|
|
|
|
Log.i("Classifier save " + file);
|
2021-02-10 11:46:29 +00:00
|
|
|
try (JsonWriter writer = new JsonWriter(new BufferedWriter(new FileWriter(file)))) {
|
2021-02-10 10:59:52 +00:00
|
|
|
writer.beginObject();
|
|
|
|
|
|
|
|
writer.name("version").value(2);
|
|
|
|
|
|
|
|
writer.name("messages");
|
|
|
|
writer.beginArray();
|
|
|
|
for (Long account : classMessages.keySet())
|
|
|
|
for (String clazz : classMessages.get(account).keySet()) {
|
|
|
|
writer.beginObject();
|
|
|
|
writer.name("account").value(account);
|
|
|
|
writer.name("class").value(clazz);
|
|
|
|
writer.name("count").value(classMessages.get(account).get(clazz));
|
|
|
|
writer.endObject();
|
|
|
|
}
|
|
|
|
writer.endArray();
|
|
|
|
|
|
|
|
writer.name("words");
|
|
|
|
writer.beginArray();
|
|
|
|
for (Long account : wordClassFrequency.keySet())
|
|
|
|
for (String word : wordClassFrequency.get(account).keySet()) {
|
|
|
|
Map<String, Frequency> classFrequency = wordClassFrequency.get(account).get(word);
|
|
|
|
for (String clazz : classFrequency.keySet()) {
|
|
|
|
Frequency f = classFrequency.get(clazz);
|
|
|
|
writer.beginObject();
|
|
|
|
|
|
|
|
writer.name("account").value(account);
|
|
|
|
writer.name("word").value(word);
|
|
|
|
writer.name("class").value(clazz);
|
|
|
|
writer.name("count").value(f.count);
|
|
|
|
writer.name("dup").value(f.duplicates);
|
|
|
|
|
|
|
|
writer.name("before");
|
|
|
|
writer.beginObject();
|
|
|
|
for (String key : f.before.keySet())
|
|
|
|
writer.name(key).value(f.before.get(key));
|
|
|
|
writer.endObject();
|
|
|
|
|
|
|
|
writer.name("after");
|
|
|
|
writer.beginObject();
|
|
|
|
for (String key : f.after.keySet())
|
|
|
|
writer.name(key).value(f.after.get(key));
|
|
|
|
writer.endObject();
|
|
|
|
|
|
|
|
writer.endObject();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
writer.endArray();
|
|
|
|
|
|
|
|
writer.name("classified");
|
|
|
|
writer.beginArray();
|
|
|
|
for (Long account : accountMsgIds.keySet()) {
|
|
|
|
writer.beginObject();
|
|
|
|
writer.name("account").value(account);
|
|
|
|
writer.name("messages");
|
|
|
|
writer.beginArray();
|
|
|
|
for (String msgid : accountMsgIds.get(account))
|
|
|
|
writer.value(msgid);
|
|
|
|
writer.endArray();
|
|
|
|
writer.endObject();
|
|
|
|
}
|
|
|
|
writer.endArray();
|
|
|
|
|
|
|
|
writer.endObject();
|
|
|
|
}
|
2021-01-02 19:42:35 +00:00
|
|
|
|
2021-01-03 09:06:08 +00:00
|
|
|
dirty = false;
|
2021-02-10 10:59:52 +00:00
|
|
|
|
|
|
|
long elapsed = new Date().getTime() - start;
|
|
|
|
Log.i("Classifier data saved elapsed=" + elapsed);
|
2021-01-02 19:42:35 +00:00
|
|
|
}
|
|
|
|
|
2021-02-10 10:59:52 +00:00
|
|
|
private static synchronized void load(@NonNull Context context) {
|
2021-01-05 12:42:43 +00:00
|
|
|
if (loaded || dirty)
|
2021-01-02 19:42:35 +00:00
|
|
|
return;
|
|
|
|
|
2021-02-10 10:59:52 +00:00
|
|
|
clear(context);
|
2021-05-08 09:23:22 +00:00
|
|
|
File file = getFile(context);
|
|
|
|
try {
|
|
|
|
_load(file);
|
|
|
|
} catch (MalformedJsonException ex) {
|
|
|
|
Log.w(ex);
|
|
|
|
clear(context);
|
|
|
|
File backup = getBackupFile(context);
|
|
|
|
if (backup.exists())
|
|
|
|
try {
|
|
|
|
_load(backup);
|
|
|
|
} catch (Throwable ex1) {
|
|
|
|
Log.e(ex1);
|
|
|
|
backup.delete();
|
|
|
|
clear(context);
|
|
|
|
}
|
|
|
|
} catch (Throwable ex) {
|
|
|
|
Log.e(ex);
|
|
|
|
file.delete();
|
|
|
|
clear(context);
|
|
|
|
}
|
|
|
|
}
|
2021-01-02 19:42:35 +00:00
|
|
|
|
2021-05-08 09:23:22 +00:00
|
|
|
private static synchronized void _load(File file) throws IOException {
|
|
|
|
Log.i("Classifier read " + file);
|
2021-02-10 10:59:52 +00:00
|
|
|
long start = new Date().getTime();
|
|
|
|
if (file.exists())
|
2021-02-10 11:46:29 +00:00
|
|
|
try (JsonReader reader = new JsonReader(new BufferedReader(new FileReader(file)))) {
|
2021-02-10 10:59:52 +00:00
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
switch (reader.nextName()) {
|
|
|
|
case "version":
|
|
|
|
reader.nextInt();
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "messages":
|
|
|
|
reader.beginArray();
|
|
|
|
while (reader.hasNext()) {
|
|
|
|
Long account = null;
|
|
|
|
String clazz = null;
|
|
|
|
Integer count = null;
|
|
|
|
|
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
switch (reader.nextName()) {
|
|
|
|
case "account":
|
|
|
|
account = reader.nextLong();
|
|
|
|
break;
|
|
|
|
case "class":
|
|
|
|
clazz = reader.nextString();
|
|
|
|
break;
|
|
|
|
case "count":
|
|
|
|
count = reader.nextInt();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
reader.endObject();
|
|
|
|
|
|
|
|
if (account == null || clazz == null || count == null)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!classMessages.containsKey(account))
|
|
|
|
classMessages.put(account, new HashMap<>());
|
|
|
|
classMessages.get(account).put(clazz, count);
|
|
|
|
}
|
|
|
|
reader.endArray();
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "words":
|
|
|
|
reader.beginArray();
|
|
|
|
while (reader.hasNext()) {
|
|
|
|
Long account = null;
|
|
|
|
String word = null;
|
|
|
|
String clazz = null;
|
|
|
|
Frequency f = new Frequency();
|
|
|
|
|
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
switch (reader.nextName()) {
|
|
|
|
case "account":
|
|
|
|
account = reader.nextLong();
|
|
|
|
break;
|
|
|
|
case "word":
|
|
|
|
word = reader.nextString();
|
|
|
|
break;
|
|
|
|
case "class":
|
|
|
|
clazz = reader.nextString();
|
|
|
|
break;
|
|
|
|
case "count":
|
|
|
|
f.count = reader.nextInt();
|
|
|
|
break;
|
|
|
|
case "dup":
|
|
|
|
f.duplicates = reader.nextInt();
|
|
|
|
break;
|
|
|
|
case "before":
|
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
f.before.put(reader.nextName(), reader.nextInt());
|
|
|
|
reader.endObject();
|
|
|
|
break;
|
|
|
|
case "after":
|
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
f.after.put(reader.nextName(), reader.nextInt());
|
|
|
|
reader.endObject();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
reader.endObject();
|
|
|
|
|
|
|
|
if (account == null || word == null || clazz == null)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!wordClassFrequency.containsKey(account))
|
|
|
|
wordClassFrequency.put(account, new HashMap<>());
|
|
|
|
|
|
|
|
Map<String, Frequency> classFrequency = wordClassFrequency.get(account).get(word);
|
|
|
|
if (classFrequency == null) {
|
|
|
|
classFrequency = new HashMap<>();
|
|
|
|
wordClassFrequency.get(account).put(word, classFrequency);
|
|
|
|
}
|
|
|
|
|
|
|
|
classFrequency.put(clazz, f);
|
|
|
|
}
|
|
|
|
reader.endArray();
|
|
|
|
break;
|
|
|
|
|
|
|
|
case "classified":
|
|
|
|
reader.beginArray();
|
|
|
|
while (reader.hasNext()) {
|
|
|
|
Long account = null;
|
|
|
|
List<String> msgids = new ArrayList<>();
|
|
|
|
|
|
|
|
reader.beginObject();
|
|
|
|
while (reader.hasNext())
|
|
|
|
switch (reader.nextName()) {
|
|
|
|
case "account":
|
|
|
|
account = reader.nextLong();
|
|
|
|
break;
|
|
|
|
case "messages":
|
|
|
|
reader.beginArray();
|
|
|
|
while (reader.hasNext())
|
|
|
|
msgids.add(reader.nextString());
|
|
|
|
reader.endArray();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
reader.endObject();
|
|
|
|
|
|
|
|
if (account == null)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
accountMsgIds.put(account, msgids);
|
|
|
|
}
|
|
|
|
reader.endArray();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
reader.endObject();
|
2021-01-31 19:20:07 +00:00
|
|
|
}
|
2021-01-02 19:42:35 +00:00
|
|
|
|
|
|
|
loaded = true;
|
2021-02-10 10:59:52 +00:00
|
|
|
dirty = false;
|
|
|
|
|
|
|
|
long elapsed = new Date().getTime() - start;
|
|
|
|
Log.i("Classifier data loaded elapsed=" + elapsed);
|
2021-01-02 19:42:35 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 16:32:14 +00:00
|
|
|
static synchronized void cleanup(@NonNull Context context) {
|
|
|
|
try {
|
|
|
|
load(context);
|
|
|
|
|
|
|
|
DB db = DB.getInstance(context);
|
|
|
|
for (Long account : accountMsgIds.keySet()) {
|
|
|
|
List<String> msgids = accountMsgIds.get(account);
|
|
|
|
Log.i("Classifier cleanup account=" + account + " count=" + msgids.size());
|
|
|
|
for (String msgid : new ArrayList<>(msgids)) {
|
|
|
|
List<EntityMessage> messages = db.message().getMessagesByMsgId(account, msgid);
|
|
|
|
if (messages != null && messages.size() == 0) {
|
|
|
|
Log.i("Classifier removing msgid=" + msgid);
|
|
|
|
msgids.remove(msgid);
|
|
|
|
dirty = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (dirty)
|
|
|
|
save(context);
|
|
|
|
} catch (Throwable ex) {
|
|
|
|
Log.e(ex);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
static synchronized void clear(@NonNull Context context) {
|
2021-01-10 17:18:14 +00:00
|
|
|
accountMsgIds.clear();
|
2021-02-10 10:59:52 +00:00
|
|
|
classMessages.clear();
|
2021-01-03 19:21:35 +00:00
|
|
|
wordClassFrequency.clear();
|
|
|
|
dirty = true;
|
2021-01-05 12:42:43 +00:00
|
|
|
Log.i("Classifier data cleared");
|
2021-01-03 19:21:35 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
static boolean isEnabled(@NonNull Context context) {
|
2021-01-02 19:42:35 +00:00
|
|
|
SharedPreferences prefs = PreferenceManager.getDefaultSharedPreferences(context);
|
2021-01-03 19:21:35 +00:00
|
|
|
return prefs.getBoolean("classification", false);
|
2021-01-02 19:42:35 +00:00
|
|
|
}
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
static File getFile(@NonNull Context context) {
|
2021-01-02 19:42:35 +00:00
|
|
|
return new File(context.getFilesDir(), "classifier.json");
|
|
|
|
}
|
|
|
|
|
2021-05-08 09:23:22 +00:00
|
|
|
static File getBackupFile(@NonNull Context context) {
|
|
|
|
return new File(context.getFilesDir(), "classifier.backup");
|
|
|
|
}
|
|
|
|
|
2021-01-06 20:35:11 +00:00
|
|
|
private static class State {
|
2021-01-10 13:31:29 +00:00
|
|
|
private final List<String> words = new ArrayList<>();
|
|
|
|
private final Map<String, Stat> classStats = new HashMap<>();
|
2021-01-06 20:35:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private static class Frequency {
|
2021-01-07 08:18:29 +00:00
|
|
|
private int count = 0;
|
2021-01-10 13:31:29 +00:00
|
|
|
private int duplicates = 0;
|
2021-01-07 08:18:29 +00:00
|
|
|
private Map<String, Integer> before = new HashMap<>();
|
|
|
|
private Map<String, Integer> after = new HashMap<>();
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
private void add(String b, String a, int c, boolean duplicate) {
|
2021-01-07 08:18:29 +00:00
|
|
|
if (count + c < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
count += c;
|
|
|
|
|
2021-01-10 13:31:29 +00:00
|
|
|
if (duplicate)
|
|
|
|
duplicates += c;
|
|
|
|
|
2021-01-07 08:18:29 +00:00
|
|
|
if (b != null) {
|
|
|
|
Integer x = before.get(b);
|
|
|
|
before.put(b, (x == null ? 0 : x) + c);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (a != null) {
|
|
|
|
Integer x = after.get(a);
|
|
|
|
after.put(a, (x == null ? 0 : x) + c);
|
|
|
|
}
|
|
|
|
}
|
2021-01-06 20:35:11 +00:00
|
|
|
}
|
|
|
|
|
2021-01-02 13:33:53 +00:00
|
|
|
private static class Stat {
|
2021-01-07 19:11:05 +00:00
|
|
|
private int matchedWords = 0;
|
|
|
|
private double totalFrequency = 0;
|
2021-01-10 13:31:29 +00:00
|
|
|
private final List<String> words = new ArrayList<>();
|
2021-01-02 13:33:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private static class Chance {
|
2021-01-07 08:18:29 +00:00
|
|
|
private String clazz;
|
|
|
|
private Double chance;
|
2021-01-02 13:33:53 +00:00
|
|
|
|
2021-01-07 08:18:29 +00:00
|
|
|
private Chance(String clazz, Double chance) {
|
2021-01-02 13:33:53 +00:00
|
|
|
this.clazz = clazz;
|
|
|
|
this.chance = chance;
|
|
|
|
}
|
|
|
|
|
2021-05-15 19:24:10 +00:00
|
|
|
@NonNull
|
2021-01-02 13:33:53 +00:00
|
|
|
@Override
|
|
|
|
public String toString() {
|
2021-01-07 13:55:14 +00:00
|
|
|
return clazz + "=" + Math.round(chance * 100.0 * 100.0) / 100.0 + "%";
|
2021-01-02 13:33:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|