2020-10-09 15:43:02 +00:00
|
|
|
package eu.faircode.email;
|
|
|
|
|
|
|
|
/*
    This file is part of FairEmail.

    FairEmail is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    FairEmail is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FairEmail. If not, see <http://www.gnu.org/licenses/>.

    Copyright 2018-2020 by Marcel Bokhorst (M66B)
*/
|
|
|
|
|
2020-10-22 06:39:33 +00:00
|
|
|
import android.text.TextUtils;
|
|
|
|
|
2020-10-25 11:38:41 +00:00
|
|
|
import java.nio.ByteBuffer;
|
|
|
|
import java.nio.charset.CharacterCodingException;
|
2020-10-09 15:43:02 +00:00
|
|
|
import java.nio.charset.Charset;
|
2020-10-25 11:38:41 +00:00
|
|
|
import java.nio.charset.CharsetDecoder;
|
|
|
|
import java.nio.charset.CodingErrorAction;
|
2020-10-09 15:43:02 +00:00
|
|
|
import java.nio.charset.StandardCharsets;
|
2020-10-25 09:42:58 +00:00
|
|
|
import java.util.Arrays;
|
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.List;
|
2020-10-25 08:33:02 +00:00
|
|
|
import java.util.Locale;
|
2020-10-09 15:43:02 +00:00
|
|
|
|
|
|
|
class CharsetHelper {
|
2020-10-15 06:40:20 +00:00
|
|
|
private static final int MAX_SAMPLE_SIZE = 8192;
|
2020-10-25 09:42:58 +00:00
|
|
|
private static String CHINESE = new Locale("zh").getLanguage();
|
|
|
|
private static final List<String> COMMON = Collections.unmodifiableList(Arrays.asList(
|
|
|
|
"US-ASCII", "ISO-8859-1", "ISO-8859-2", "windows-1250", "windows-1252", "windows-1257", "UTF-8"
|
|
|
|
));
|
2020-10-14 20:36:27 +00:00
|
|
|
|
2020-10-14 18:54:28 +00:00
|
|
|
static {
|
2020-11-08 14:26:18 +00:00
|
|
|
System.loadLibrary("fairemail");
|
2020-10-14 18:54:28 +00:00
|
|
|
}
|
|
|
|
|
2020-10-25 09:42:58 +00:00
|
|
|
private static native DetectResult jni_detect(byte[] octets);
|
2020-10-14 18:54:28 +00:00
|
|
|
|
2020-10-09 15:43:02 +00:00
|
|
|
static boolean isUTF8(String text) {
|
|
|
|
// Get extended ASCII characters
|
|
|
|
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
|
|
|
|
|
2020-10-25 11:38:41 +00:00
|
|
|
CharsetDecoder utf8Decoder = StandardCharsets.UTF_8.newDecoder()
|
|
|
|
.onMalformedInput(CodingErrorAction.REPORT)
|
|
|
|
.onUnmappableCharacter(CodingErrorAction.REPORT);
|
2020-10-09 15:43:02 +00:00
|
|
|
|
2020-10-25 11:38:41 +00:00
|
|
|
try {
|
|
|
|
utf8Decoder.decode(ByteBuffer.wrap(octets));
|
|
|
|
return true;
|
|
|
|
} catch (CharacterCodingException ex) {
|
|
|
|
Log.w(ex);
|
|
|
|
return false;
|
|
|
|
}
|
2020-10-09 15:43:02 +00:00
|
|
|
}
|
|
|
|
|
2020-10-10 06:57:03 +00:00
|
|
|
static Charset detect(String text) {
|
|
|
|
try {
|
2020-10-10 11:20:24 +00:00
|
|
|
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
|
2020-10-14 20:36:27 +00:00
|
|
|
|
|
|
|
byte[] sample;
|
2020-10-15 06:40:20 +00:00
|
|
|
if (octets.length < MAX_SAMPLE_SIZE)
|
2020-10-14 20:36:27 +00:00
|
|
|
sample = octets;
|
|
|
|
else {
|
2020-10-15 06:40:20 +00:00
|
|
|
sample = new byte[MAX_SAMPLE_SIZE];
|
|
|
|
System.arraycopy(octets, 0, sample, 0, MAX_SAMPLE_SIZE);
|
2020-10-14 20:36:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Log.i("compact_enc_det sample=" + sample.length);
|
2020-10-25 09:42:58 +00:00
|
|
|
DetectResult detected = jni_detect(sample);
|
|
|
|
|
|
|
|
if (TextUtils.isEmpty(detected.charset)) {
|
|
|
|
Log.e("compact_enc_det result=" + detected);
|
|
|
|
return null;
|
2020-10-27 17:01:39 +00:00
|
|
|
} else if (COMMON.contains(detected.charset))
|
2020-10-15 14:34:16 +00:00
|
|
|
Log.w("compact_enc_det result=" + detected);
|
2020-10-27 17:01:39 +00:00
|
|
|
else if ("GB18030".equals(detected.charset)) {
|
|
|
|
boolean chinese = Locale.getDefault().getLanguage().equals(CHINESE);
|
2020-10-24 19:23:03 +00:00
|
|
|
// https://github.com/google/compact_enc_det/issues/8
|
2020-10-27 17:01:39 +00:00
|
|
|
Log.e("compact_enc_det result=" + detected + " chinese=" + chinese);
|
|
|
|
if (!chinese)
|
|
|
|
return null;
|
2020-11-15 15:39:11 +00:00
|
|
|
} else // GBK, Big5, ISO-2022-JP
|
2020-10-25 09:42:58 +00:00
|
|
|
Log.e("compact_enc_det result=" + detected);
|
2020-10-22 06:39:33 +00:00
|
|
|
|
2020-10-25 09:42:58 +00:00
|
|
|
return Charset.forName(detected.charset);
|
2020-10-10 06:57:03 +00:00
|
|
|
} catch (Throwable ex) {
|
|
|
|
Log.w(ex);
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
2020-10-25 09:42:58 +00:00
|
|
|
|
|
|
|
private static class DetectResult {
|
|
|
|
String charset;
|
2020-10-25 10:27:35 +00:00
|
|
|
int sample_size;
|
2020-10-25 09:42:58 +00:00
|
|
|
int bytes_consumed;
|
|
|
|
boolean is_reliable;
|
|
|
|
|
2020-10-25 10:27:35 +00:00
|
|
|
DetectResult(String charset, int sample_size, int bytes_consumed, boolean is_reliable) {
|
2020-10-25 09:42:58 +00:00
|
|
|
this.charset = charset;
|
2020-10-25 10:27:35 +00:00
|
|
|
this.sample_size = sample_size;
|
2020-10-25 09:42:58 +00:00
|
|
|
this.bytes_consumed = bytes_consumed;
|
|
|
|
this.is_reliable = is_reliable;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public String toString() {
|
2020-10-25 10:27:35 +00:00
|
|
|
return charset + " s=" + bytes_consumed + "/" + sample_size + " r=" + is_reliable;
|
2020-10-25 09:42:58 +00:00
|
|
|
}
|
|
|
|
}
|
2020-10-09 15:43:02 +00:00
|
|
|
}
|