From b3dac145db2de8d13e5966f44df1edfeaeae3956 Mon Sep 17 00:00:00 2001
From: M66B
Date: Wed, 5 Jan 2022 11:57:11 +0100
Subject: [PATCH] Correct UTF-8 encoded Windows-1252 text

---
 .../java/eu/faircode/email/ActivityEML.java   |  3 +
 .../java/eu/faircode/email/CharsetHelper.java | 66 +++++++++++++++++++
 .../java/eu/faircode/email/MessageHelper.java |  7 ++-
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/app/src/main/java/eu/faircode/email/ActivityEML.java b/app/src/main/java/eu/faircode/email/ActivityEML.java
index ce6b2cba65..9623737345 100644
--- a/app/src/main/java/eu/faircode/email/ActivityEML.java
+++ b/app/src/main/java/eu/faircode/email/ActivityEML.java
@@ -67,6 +67,7 @@ import java.text.DateFormat;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.Objects;
 import java.util.Properties;
 
 import javax.mail.Flags;
@@ -438,6 +439,7 @@ public class ActivityEML extends ActivityBase {
 
                         Charset cs = Charset.forName(charset);
                         boolean isUtf8 = CharsetHelper.isUTF8(text.getBytes(cs));
+                        boolean isW1252 = !Objects.equals(text, CharsetHelper.utf8toW1252(text));
 
                         for (int i = 0; i < level; i++)
                             ssb.append(" ");
@@ -445,6 +447,7 @@ public class ActivityEML extends ActivityBase {
                         ssb.append("Detected: ")
                                 .append(detected == null ? "?" : detected.toString())
                                 .append(" isUTF8=").append(Boolean.toString(isUtf8))
+                                .append(" isW1252=").append(Boolean.toString(isW1252))
                                 .append('\n');
                     }
                 }
diff --git a/app/src/main/java/eu/faircode/email/CharsetHelper.java b/app/src/main/java/eu/faircode/email/CharsetHelper.java
index af3caee8d6..153dc3f25f 100644
--- a/app/src/main/java/eu/faircode/email/CharsetHelper.java
+++ b/app/src/main/java/eu/faircode/email/CharsetHelper.java
@@ -20,8 +20,10 @@ package eu.faircode.email;
  */
 
 import android.text.TextUtils;
+import android.util.Pair;
 
 import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
@@ -38,9 +40,19 @@ public class CharsetHelper {
     private static final List COMMON = Collections.unmodifiableList(Arrays.asList(
             "US-ASCII", "ISO-8859-1", "ISO-8859-2", "windows-1250", "windows-1252", "windows-1257", "UTF-8"
     ));
+    private static final int MIN_W1252 = 10;
+    private static final Pair<byte[], byte[]>[] sUtf8W1252 = new Pair[128];
 
     static {
         System.loadLibrary("fairemail");
+
+        // Double encoded bytes (first) -> correct UTF-8 bytes (second), see https://www.i18nqa.com/debug/utf8-debug.html
+        Charset w1252 = Charset.forName("windows-1252");
+        for (int c = 128; c < 256; c++) {
+            String y = new String(new byte[]{(byte) c}, w1252);
+            String x = new String(y.getBytes(StandardCharsets.UTF_8), w1252);
+            sUtf8W1252[c - 128] = new Pair<>(x.getBytes(StandardCharsets.UTF_8), y.getBytes(StandardCharsets.UTF_8));
+        }
     }
 
     private static native DetectResult jni_detect_charset(byte[] octets);
@@ -64,6 +76,60 @@ public class CharsetHelper {
         }
     }
 
+    /**
+     * Repairs text whose Windows-1252 characters were UTF-8 encoded twice.
+     *
+     * Scanning stops after MAX_SAMPLE_SIZE bytes unless at least MIN_W1252
+     * double encoded sequences were found by then.
+     *
+     * @param text the decoded text to repair
+     * @return the repaired text, or the unchanged text when fewer than MIN_W1252
+     * sequences were replaced or when an error occurred
+     */
+    static String utf8toW1252(String text) {
+        try {
+            Charset w1252 = Charset.forName("windows-1252");
+
+            // Explicit UTF-8 instead of relying on the platform default charset
+            byte[] t = new String(text.getBytes(StandardCharsets.ISO_8859_1), w1252)
+                    .getBytes(StandardCharsets.UTF_8);
+            byte[] result = new byte[t.length];
+
+            int i = 0;
+            int len = 0;
+            int count = 0;
+            while (i < t.length && (i < MAX_SAMPLE_SIZE || count >= MIN_W1252)) {
+                boolean found = false;
+                for (int c = 0; c < 128; c++) {
+                    int sl = sUtf8W1252[c].first.length;
+                    // <= so that a sequence ending on the last byte of the text is matched too
+                    if (i + sl <= t.length) {
+                        found = true;
+                        for (int a = 0; a < sl; a++)
+                            if (t[i + a] != sUtf8W1252[c].first[a]) {
+                                found = false;
+                                break;
+                            }
+                        if (found) {
+                            count++;
+                            int tl = sUtf8W1252[c].second.length;
+                            System.arraycopy(sUtf8W1252[c].second, 0, result, len, tl);
+                            len += tl;
+                            i += sl;
+                            break;
+                        }
+                    }
+                }
+                if (!found)
+                    result[len++] = t[i++];
+            }
+            return (count < MIN_W1252 ? text : new String(result, 0, len, StandardCharsets.UTF_8));
+        } catch (Throwable ex) {
+            Log.w(ex);
+            return text;
+        }
+    }
+
     public static Charset detect(String text) {
         try {
             byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
diff --git a/app/src/main/java/eu/faircode/email/MessageHelper.java b/app/src/main/java/eu/faircode/email/MessageHelper.java
index cdb30afb8b..7e6e65db0e 100644
--- a/app/src/main/java/eu/faircode/email/MessageHelper.java
+++ b/app/src/main/java/eu/faircode/email/MessageHelper.java
@@ -2461,7 +2461,7 @@ public class MessageHelper {
                 }
 
                 if (h.isPlainText()) {
-                    if (charset == null || StandardCharsets.ISO_8859_1.equals(cs))
+                    if (charset == null || StandardCharsets.ISO_8859_1.equals(cs)) {
                         if (StandardCharsets.ISO_8859_1.equals(cs) && CharsetHelper.isUTF8(result)) {
                             Log.i("Charset upgrade=UTF8");
                             result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
@@ -2477,6 +2477,8 @@ public class MessageHelper {
                                 result = new String(result.getBytes(StandardCharsets.ISO_8859_1), detected);
                             }
                         }
+                    } else if (StandardCharsets.UTF_8.equals(cs))
+                        result = CharsetHelper.utf8toW1252(result);
 
                     if ("flowed".equalsIgnoreCase(h.contentType.getParameter("format")))
                         result = HtmlHelper.flow(result);
@@ -2512,6 +2514,9 @@ public class MessageHelper {
                             CharsetHelper.isUTF8(result))
                         result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8);
 
+                    //if (StandardCharsets.UTF_8.equals(cs))
+                    //    result = CharsetHelper.utf8toW1252(result);
+
                     // Fix incorrect UTF16
                     try {
                         if (CHARSET16.contains(cs)) {