FairEmail/app/src/main/java/eu/faircode/email/CharsetHelper.java

129 lines
4.2 KiB
Java
Raw Normal View History

2020-10-09 15:43:02 +00:00
package eu.faircode.email;
/*
This file is part of FairEmail.
FairEmail is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
FairEmail is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with FairEmail. If not, see <http://www.gnu.org/licenses/>.
Copyright 2018-2020 by Marcel Bokhorst (M66B)
*/
2020-10-22 06:39:33 +00:00
import android.text.TextUtils;
2020-10-09 15:43:02 +00:00
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
2020-10-25 09:42:58 +00:00
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
2020-10-25 08:33:02 +00:00
import java.util.Locale;
2020-10-09 15:43:02 +00:00
class CharsetHelper {
2020-10-15 06:40:20 +00:00
// Upper bound on the number of bytes of a text that detect() hands to the native detector
private static final int MAX_SAMPLE_SIZE = 8192;

// Language code that Locale.getLanguage() reports for Chinese; compared against the default locale in detect()
private static final String CHINESE = new Locale("zh").getLanguage();

// Charset names for which detect() logs the result as a warning (in non Play Store builds) instead of an error
private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList(
        "US-ASCII", "ISO-8859-1", "ISO-8859-2", "windows-1250", "windows-1252", "windows-1257", "UTF-8"
));

static {
    // Native library; presumably provides the implementation of jni_detect
    System.loadLibrary("compact_enc_det");
}

// Native charset detection for a byte sample; the result carries the detected charset name
private static native DetectResult jni_detect(byte[] octets);
2020-10-14 18:54:28 +00:00
2020-10-09 15:43:02 +00:00
/**
 * Checks whether the bytes behind a string form well-formed UTF-8.
 * <p>
 * The string is encoded as ISO-8859-1 first, so each character up to U+00FF
 * yields the byte with the same value; characters that ISO-8859-1 cannot
 * represent are substituted by the encoder's replacement byte. The resulting
 * bytes are validated against the UTF-8 byte sequence grammar of RFC 3629:
 * sequences of one to four bytes only, no overlong encodings, no encoded
 * surrogates (U+D800..U+DFFF) and nothing above U+10FFFF.
 *
 * @param text the text to inspect; must not be null
 * @return true if every byte belongs to a well-formed UTF-8 sequence
 * (an empty or pure ASCII text is considered valid), false otherwise
 */
static boolean isUTF8(String text) {
    // Get extended ASCII characters
    byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);

    int i = 0;
    while (i < octets.length) {
        int lead = octets[i] & 0xFF;

        // Single byte (ASCII)
        if (lead < 0x80) {
            i++;
            continue;
        }

        int trailing; // number of continuation bytes that must follow the lead byte
        int low = 0x80; // allowed range of the first continuation byte
        int high = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF) {
            // 0xC0 and 0xC1 could only start overlong encodings of ASCII
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            trailing = 2;
            if (lead == 0xE0)
                low = 0xA0; // below this the encoding would be overlong
            else if (lead == 0xED)
                high = 0x9F; // above this the sequence would encode a surrogate
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            trailing = 3;
            if (lead == 0xF0)
                low = 0x90; // below this the encoding would be overlong
            else if (lead == 0xF4)
                high = 0x8F; // above this the code point would exceed U+10FFFF
        } else {
            // Stray continuation byte, or a lead byte that is not valid in UTF-8
            // (0xC0, 0xC1, 0xF5..0xFF, including the obsolete 5 and 6 byte forms)
            return false;
        }

        // Truncated sequence
        if (i + trailing >= octets.length)
            return false;

        int second = octets[i + 1] & 0xFF;
        if (second < low || second > high)
            return false;

        for (int k = 2; k <= trailing; k++)
            if ((octets[i + k] & 0xC0) != 0x80)
                return false;

        i += trailing + 1;
    }

    return true;
}
2020-10-10 06:57:03 +00:00
static Charset detect(String text) {
try {
2020-10-10 11:20:24 +00:00
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
2020-10-14 20:36:27 +00:00
byte[] sample;
2020-10-15 06:40:20 +00:00
if (octets.length < MAX_SAMPLE_SIZE)
2020-10-14 20:36:27 +00:00
sample = octets;
else {
2020-10-15 06:40:20 +00:00
sample = new byte[MAX_SAMPLE_SIZE];
System.arraycopy(octets, 0, sample, 0, MAX_SAMPLE_SIZE);
2020-10-14 20:36:27 +00:00
}
Log.i("compact_enc_det sample=" + sample.length);
2020-10-25 09:42:58 +00:00
DetectResult detected = jni_detect(sample);
if (TextUtils.isEmpty(detected.charset)) {
Log.e("compact_enc_det result=" + detected);
return null;
} else if (!BuildConfig.PLAY_STORE_RELEASE &&
COMMON.contains(detected.charset))
2020-10-15 14:34:16 +00:00
Log.w("compact_enc_det result=" + detected);
2020-10-25 09:42:58 +00:00
else if ("GB18030".equals(detected.charset) &&
2020-10-25 08:33:02 +00:00
!Locale.getDefault().getLanguage().equals(CHINESE)) {
// https://github.com/google/compact_enc_det/issues/8
2020-10-15 14:34:16 +00:00
Log.e("compact_enc_det result=" + detected);
2020-10-22 06:39:33 +00:00
return null;
2020-10-25 09:42:58 +00:00
} else
Log.e("compact_enc_det result=" + detected);
2020-10-22 06:39:33 +00:00
2020-10-25 09:42:58 +00:00
return Charset.forName(detected.charset);
2020-10-10 06:57:03 +00:00
} catch (Throwable ex) {
Log.w(ex);
return null;
}
}
2020-10-25 09:42:58 +00:00
/**
 * Value holder for the outcome of jni_detect.
 * <p>
 * NOTE(review): no Java code in this class calls the constructor, so instances
 * presumably come from the native side; keep the constructor signature and
 * the field names as they are unless the native code is changed as well.
 */
private static class DetectResult {
    // Name of the detected charset; detect() treats a null or empty name as "nothing detected"
    String charset;
    // Presumably the number of sample bytes the detector looked at; only used for logging here
    int bytes_consumed;
    // Presumably the detector's own reliability verdict; only used for logging here
    boolean is_reliable;

    DetectResult(String name, int consumed, boolean reliable) {
        charset = name;
        bytes_consumed = consumed;
        is_reliable = reliable;
    }

    /**
     * @return the charset name followed by " c=" with the consumed byte count
     * and " r=" with the reliability flag
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(charset);
        sb.append(" c=").append(bytes_consumed);
        sb.append(" r=").append(is_reliable);
        return sb.toString();
    }
}
2020-10-09 15:43:02 +00:00
}