mirror of
https://github.com/zxing/zxing.git
synced 2025-02-02 05:41:08 -08:00
Add support for guessing UTF-16 encoding based on BOM
This commit is contained in:
parent
ef498941bf
commit
547e58a286
|
@ -68,13 +68,22 @@ public final class StringUtils {
|
|||
* @param hints decode hints if applicable
|
||||
* @return Charset of guessed encoding; at the moment will only guess one of:
|
||||
* {@link #SHIFT_JIS_CHARSET}, {@link StandardCharsets#UTF_8},
|
||||
* {@link StandardCharsets#ISO_8859_1}, or the platform default encoding if
|
||||
* {@link StandardCharsets#ISO_8859_1}, {@link StandardCharsets#UTF_16},
|
||||
* or the platform default encoding if
|
||||
* none of these can possibly be correct
|
||||
*/
|
||||
public static Charset guessCharset(byte[] bytes, Map<DecodeHintType,?> hints) {
|
||||
if (hints != null && hints.containsKey(DecodeHintType.CHARACTER_SET)) {
|
||||
return Charset.forName(hints.get(DecodeHintType.CHARACTER_SET).toString());
|
||||
}
|
||||
|
||||
// First try UTF-16, assuming anything with its BOM is UTF-16
|
||||
if (bytes.length > 2 &&
|
||||
((bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) ||
|
||||
(bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE))) {
|
||||
return StandardCharsets.UTF_16;
|
||||
}
|
||||
|
||||
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
|
||||
// which should be by far the most common encodings.
|
||||
int length = bytes.length;
|
||||
|
|
|
@ -64,6 +64,22 @@ public final class StringUtilsTestCase extends Assert {
|
|||
StringUtils.SHIFT_JIS_CHARSET, "SJIS");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUTF16BE() {
|
||||
// 调压柜
|
||||
doTest(new byte[] { (byte) 0xFE, (byte) 0xFF, (byte) 0x8c, (byte) 0x03, (byte) 0x53, (byte) 0x8b,
|
||||
(byte) 0x67, (byte) 0xdc, },
|
||||
StandardCharsets.UTF_16, StandardCharsets.UTF_16.name());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testUTF16LE() {
|
||||
// 调压柜
|
||||
doTest(new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x03, (byte) 0x8c, (byte) 0x8b, (byte) 0x53,
|
||||
(byte) 0xdc, (byte) 0x67, },
|
||||
StandardCharsets.UTF_16, StandardCharsets.UTF_16.name());
|
||||
}
|
||||
|
||||
private static void doTest(byte[] bytes, Charset charset, String encoding) {
|
||||
Charset guessedCharset = StringUtils.guessCharset(bytes, null);
|
||||
String guessedEncoding = StringUtils.guessEncoding(bytes, null);
|
||||
|
@ -85,7 +101,11 @@ public final class StringUtilsTestCase extends Assert {
|
|||
declaration.append("new byte[] { ");
|
||||
for (byte b : text.getBytes(charset)) {
|
||||
declaration.append("(byte) 0x");
|
||||
declaration.append(Integer.toHexString(b & 0xFF));
|
||||
int value = b & 0xFF;
|
||||
if (value < 0x10) {
|
||||
declaration.append('0');
|
||||
}
|
||||
declaration.append(Integer.toHexString(value));
|
||||
declaration.append(", ");
|
||||
}
|
||||
declaration.append('}');
|
||||
|
|
Loading…
Reference in a new issue