Improved charset detection

This commit is contained in:
M66B 2020-10-10 13:20:24 +02:00
parent 29460107da
commit 7c30c8e7d8
1 changed file with 8 additions and 7 deletions

View File

@@ -26,9 +26,7 @@ import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
class CharsetHelper {
private static UniversalDetector detector = new UniversalDetector();
private static final int SAMPLE_SIZE = 2 * 1024;
private static final int SAMPLE_SIZE = 1024;
static boolean isUTF8(String text) {
// Get extended ASCII characters
@@ -124,9 +122,14 @@ class CharsetHelper {
static Charset detect(String text) {
try {
byte[] sample = text.getBytes(StandardCharsets.ISO_8859_1);
byte[] octets = text.getBytes(StandardCharsets.ISO_8859_1);
detector.handleData(sample, 0, Math.min(SAMPLE_SIZE, sample.length));
int offset = 0;
UniversalDetector detector = new UniversalDetector();
while (offset < octets.length && !detector.isDone()) {
detector.handleData(octets, offset, Math.min(SAMPLE_SIZE, octets.length - offset));
offset += SAMPLE_SIZE;
}
detector.dataEnd();
String detected = detector.getDetectedCharset();
@@ -137,8 +140,6 @@ class CharsetHelper {
} catch (Throwable ex) {
Log.w(ex);
return null;
} finally {
detector.reset();
}
}
}