Charset improvements

This commit is contained in:
M66B 2020-10-27 18:01:39 +01:00
parent 962e637aee
commit 4faf5ad9bc
2 changed files with 25 additions and 10 deletions

View File

@@ -80,14 +80,14 @@ class CharsetHelper {
if (TextUtils.isEmpty(detected.charset)) { if (TextUtils.isEmpty(detected.charset)) {
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);
return null; return null;
} else if (!BuildConfig.PLAY_STORE_RELEASE && } else if (COMMON.contains(detected.charset))
COMMON.contains(detected.charset))
Log.w("compact_enc_det result=" + detected); Log.w("compact_enc_det result=" + detected);
else if ("GB18030".equals(detected.charset) && else if ("GB18030".equals(detected.charset)) {
!Locale.getDefault().getLanguage().equals(CHINESE)) { boolean chinese = Locale.getDefault().getLanguage().equals(CHINESE);
// https://github.com/google/compact_enc_det/issues/8 // https://github.com/google/compact_enc_det/issues/8
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected + " chinese=" + chinese);
return null; if (!chinese)
return null;
} else } else
Log.e("compact_enc_det result=" + detected); Log.e("compact_enc_det result=" + detected);

View File

@@ -1801,18 +1801,33 @@ public class MessageHelper {
try { try {
Log.i("Charset meta=" + meta); Log.i("Charset meta=" + meta);
Charset c = Charset.forName(charset); Charset c = Charset.forName(charset);
if (c.equals(StandardCharsets.UTF_8) && !CharsetHelper.isUTF8(result))
// US-ASCII is a subset of ISO8859-1
if (StandardCharsets.US_ASCII.equals(c))
break; break;
if (CHARSET16.contains(c))
break; // Can't convert 16 bits charset to 8 bits // Check if really UTF-8
if (StandardCharsets.UTF_8.equals(c) && !CharsetHelper.isUTF8(result)) {
Log.e("Charset meta=" + meta + " !isUTF8");
break;
}
// 16 bits charsets cannot be converted to 8 bits
if (CHARSET16.contains(c)) {
Log.e("Charset meta=" + meta);
break;
}
Charset detected = CharsetHelper.detect(result); Charset detected = CharsetHelper.detect(result);
if (!(StandardCharsets.US_ASCII.equals(detected) && if (!(StandardCharsets.US_ASCII.equals(detected) &&
StandardCharsets.UTF_8.equals(c))) StandardCharsets.UTF_8.equals(c)))
Log.e("Converting detected=" + detected + " meta=" + c); Log.e("Converting detected=" + detected + " meta=" + c);
// Convert
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), c); result = new String(result.getBytes(StandardCharsets.ISO_8859_1), c);
break; break;
} catch (Throwable ex) { } catch (Throwable ex) {
Log.w(ex); Log.e(ex);
} }
} }
} }