Charset improvements

This commit is contained in:
M66B 2020-10-27 18:01:39 +01:00
parent 962e637aee
commit 4faf5ad9bc
2 changed files with 25 additions and 10 deletions

View File

@ -80,13 +80,13 @@ class CharsetHelper {
if (TextUtils.isEmpty(detected.charset)) {
Log.e("compact_enc_det result=" + detected);
return null;
} else if (!BuildConfig.PLAY_STORE_RELEASE &&
COMMON.contains(detected.charset))
} else if (COMMON.contains(detected.charset))
Log.w("compact_enc_det result=" + detected);
else if ("GB18030".equals(detected.charset) &&
!Locale.getDefault().getLanguage().equals(CHINESE)) {
else if ("GB18030".equals(detected.charset)) {
boolean chinese = Locale.getDefault().getLanguage().equals(CHINESE);
// https://github.com/google/compact_enc_det/issues/8
Log.e("compact_enc_det result=" + detected);
Log.e("compact_enc_det result=" + detected + " chinese=" + chinese);
if (!chinese)
return null;
} else
Log.e("compact_enc_det result=" + detected);

View File

@ -1801,18 +1801,33 @@ public class MessageHelper {
try {
Log.i("Charset meta=" + meta);
Charset c = Charset.forName(charset);
if (c.equals(StandardCharsets.UTF_8) && !CharsetHelper.isUTF8(result))
// US-ASCII is a subset of ISO8859-1
if (StandardCharsets.US_ASCII.equals(c))
break;
if (CHARSET16.contains(c))
break; // Can't convert 16 bits charset to 8 bits
// Check if really UTF-8
if (StandardCharsets.UTF_8.equals(c) && !CharsetHelper.isUTF8(result)) {
Log.e("Charset meta=" + meta + " !isUTF8");
break;
}
// 16 bits charsets cannot be converted to 8 bits
if (CHARSET16.contains(c)) {
Log.e("Charset meta=" + meta);
break;
}
Charset detected = CharsetHelper.detect(result);
if (!(StandardCharsets.US_ASCII.equals(detected) &&
StandardCharsets.UTF_8.equals(c)))
Log.e("Converting detected=" + detected + " meta=" + c);
// Convert
result = new String(result.getBytes(StandardCharsets.ISO_8859_1), c);
break;
} catch (Throwable ex) {
Log.w(ex);
Log.e(ex);
}
}
}