Add support for guessing UTF-16 encoding based on BOM

This commit is contained in:
Sean Owen 2021-02-20 07:29:32 -06:00
parent ef498941bf
commit 547e58a286
2 changed files with 31 additions and 2 deletions

View file

@ -68,13 +68,22 @@ public final class StringUtils {
* @param hints decode hints if applicable
* @return Charset of guessed encoding; at the moment will only guess one of:
* {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8},
* {@link StandardCharsets#ISO_8859_1}, or the platform default encoding if
* {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16},
* or the platform default encoding if
* none of these can possibly be correct
*/
public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) {
if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) {
return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString());
}
// First try UTF-16, assuming anything with its BOM is UTF-16
if (bytes.length > 2 &&
((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) ||
(bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) {
return StandardCharsets.UTF_16;
}
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
// which should be by far the most common encodings.
int length = bytes.length;

View file

@ -64,6 +64,22 @@ public final class StringUtilsTestCase extends Assert {
StringUtils.SHIFT_JIS_CHARSET, "SJIS");
}
/**
 * Feeds big-endian UTF-16 bytes, prefixed with the FE FF byte order mark,
 * through {@code doTest} together with {@link StandardCharsets#UTF_16} and
 * that charset's name.
 */
@Test
public void testUTF16BE() {
  // Code units U+8C03 U+538B U+67DC (the Chinese text "调压柜"), high byte first.
  byte[] bomPrefixedBytes = {
      (byte) 0xFE, (byte) 0xFF, // byte order mark, big-endian
      (byte) 0x8c, (byte) 0x03,
      (byte) 0x53, (byte) 0x8b,
      (byte) 0x67, (byte) 0xdc,
  };
  Charset utf16 = StandardCharsets.UTF_16;
  doTest(bomPrefixedBytes, utf16, utf16.name());
}
/**
 * Feeds little-endian UTF-16 bytes, prefixed with the FF FE byte order mark,
 * through {@code doTest} together with {@link StandardCharsets#UTF_16} and
 * that charset's name.
 */
@Test
public void testUTF16LE() {
  // Code units U+8C03 U+538B U+67DC (the Chinese text "调压柜"), low byte first.
  byte[] bomPrefixedBytes = {
      (byte) 0xFF, (byte) 0xFE, // byte order mark, little-endian
      (byte) 0x03, (byte) 0x8c,
      (byte) 0x8b, (byte) 0x53,
      (byte) 0xdc, (byte) 0x67,
  };
  Charset utf16 = StandardCharsets.UTF_16;
  doTest(bomPrefixedBytes, utf16, utf16.name());
}
private static void doTest(byte[] bytes, Charset charset, String encoding) {
Charset guessedCharset = StringUtils.guessCharset(bytes, null);
String guessedEncoding = StringUtils.guessEncoding(bytes, null);
@ -85,7 +101,11 @@ public final class StringUtilsTestCase extends Assert {
declaration.append("new byte[] { ");
for (byte b : text.getBytes(charset)) {
declaration.append("(byte) 0x");
declaration.append(Integer.toHexString(b & 0xFF));
int value = b & 0xFF;
if (value < 0x10) {
declaration.append('0');
}
declaration.append(Integer.toHexString(value));
declaration.append(", ");
}
declaration.append('}');