From 14edad1e2eaf0225e7968a7900da89d9be2959fd Mon Sep 17 00:00:00 2001 From: M66B Date: Sat, 11 Jun 2022 07:53:14 +0200 Subject: [PATCH] Charset: refactoring --- .../java/eu/faircode/email/MessageHelper.java | 71 +++++++++---------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/app/src/main/java/eu/faircode/email/MessageHelper.java b/app/src/main/java/eu/faircode/email/MessageHelper.java index 2ae57ffee6..fba43d424b 100644 --- a/app/src/main/java/eu/faircode/email/MessageHelper.java +++ b/app/src/main/java/eu/faircode/email/MessageHelper.java @@ -3109,21 +3109,35 @@ public class MessageHelper { return null; } - String result; + // Check character set + String charset = h.contentType.getParameter("charset"); + if (UnknownCharsetProvider.charsetForMime(charset) == null) + warnings.add(context.getString(R.string.title_no_charset, charset)); + if (TextUtils.isEmpty(charset) || + charset.equalsIgnoreCase(StandardCharsets.US_ASCII.name())) + charset = null; + + Charset cs = null; + if (charset != null) + try { + cs = Charset.forName(charset); + } catch (UnsupportedCharsetException ignored) { + cs = null; + } + + String result; try { Object content; // Check for UTF-16 LE without BOM - String pcharset = h.contentType.getParameter("charset"); - if ("utf-16".equalsIgnoreCase(pcharset) && override == null) { - String charset = pcharset; + if (StandardCharsets.UTF_16.equals(cs) && override == null) { BufferedInputStream bis = new BufferedInputStream(h.part.getDataHandler().getInputStream()); if (Boolean.TRUE.equals(CharsetHelper.isUTF16LE(bis))) { - charset = StandardCharsets.UTF_16LE.name(); - Log.e("Charset " + pcharset + " -> " + charset); + Log.e("Charset " + cs + " -> UTF16LE"); + cs = StandardCharsets.UTF_16LE; } - content = Helper.readStream(bis, Charset.forName(charset)); + content = Helper.readStream(bis, cs); } else content = h.part.getContent(); @@ -3141,15 +3155,8 @@ public class MessageHelper { // Typically com.sun.mail.util.QPDecoderStream if (BuildConfig.DEBUG && false) warnings.add(content.getClass().getName()); - Charset charset; - try { - String cs = h.contentType.getParameter("charset"); - charset = (cs == null ? StandardCharsets.ISO_8859_1 : Charset.forName(cs)); - } catch (Throwable ex) { - Log.w(ex); - charset = StandardCharsets.ISO_8859_1; - } - result = Helper.readStream((InputStream) content, charset); + result = Helper.readStream((InputStream) content, + cs == null ? StandardCharsets.ISO_8859_1 : cs); } else { Log.e(content.getClass().getName()); result = content.toString(); @@ -3169,24 +3176,9 @@ public class MessageHelper { return null; } - // Check character set - String charset = h.contentType.getParameter("charset"); - if (UnknownCharsetProvider.charsetForMime(charset) == null) - warnings.add(context.getString(R.string.title_no_charset, charset)); - - if ((TextUtils.isEmpty(charset) || charset.equalsIgnoreCase(StandardCharsets.US_ASCII.name()))) - charset = null; - - Charset cs = null; - try { - if (charset != null) - cs = Charset.forName(charset); - } catch (UnsupportedCharsetException ignored) { - } - if (h.isPlainText()) { if (override == null) { - if (charset == null || StandardCharsets.ISO_8859_1.equals(cs)) { + if (cs == null || StandardCharsets.ISO_8859_1.equals(cs)) { if (StandardCharsets.ISO_8859_1.equals(cs) && CharsetHelper.isUTF8(result)) { Log.i("Charset upgrade=UTF8"); result = new String(result.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); @@ -3254,11 +3246,11 @@ public class MessageHelper { try { if (CHARSET16.contains(cs)) { Charset detected = CharsetHelper.detect(result, cs); - // UTF-16 can be detected as US-ASCII if (!CHARSET16.contains(detected)) Log.w(new Throwable("Charset=" + cs + " detected=" + detected)); + // UTF-16 can be detected as US-ASCII if (StandardCharsets.UTF_8.equals(detected)) { - charset = null; + cs = null; result = new String(result.getBytes(StandardCharsets.ISO_8859_1), detected); } } @@ -3266,26 +3258,27 @@ public class MessageHelper { Log.w(ex); } - if (charset == null) { + if (cs == null) { // // String excerpt = result.substring(0, Math.min(MAX_META_EXCERPT, result.length())); Document d = JsoupEx.parse(excerpt); for (Element meta : d.select("meta")) { + String mcharset = null; if ("Content-Type".equalsIgnoreCase(meta.attr("http-equiv"))) { try { ContentType ct = new ContentType(meta.attr("content")); - charset = ct.getParameter("charset"); + mcharset = ct.getParameter("charset"); } catch (ParseException ex) { Log.w(ex); } } else - charset = meta.attr("charset"); + mcharset = meta.attr("charset"); - if (!TextUtils.isEmpty(charset)) + if (!TextUtils.isEmpty(mcharset)) try { Log.i("Charset meta=" + meta); - Charset c = Charset.forName(charset); + Charset c = Charset.forName(mcharset); // US-ASCII is a subset of ISO8859-1 if (StandardCharsets.US_ASCII.equals(c))