Better encoding guessing from Alex

git-svn-id: https://zxing.googlecode.com/svn/trunk@2157 59b500cc-1b3d-0410-9834-0bbf25fbcc57
This commit is contained in:
srowen 2012-02-04 12:24:26 +00:00
parent d265ccff56
commit 5d89cc30fc
3 changed files with 189 additions and 94 deletions

View file

@ -4,6 +4,7 @@ in alphabetical order.
Agustín Delgado (Servinform S.A.) Agustín Delgado (Servinform S.A.)
Aitor Almeida (University of Deusto) Aitor Almeida (University of Deusto)
Alasdair Mackintosh (Google) Alasdair Mackintosh (Google)
Alex Dupre
Alexander Martin (Haase & Martin GmbH) Alexander Martin (Haase & Martin GmbH)
Andreas Pillath Andreas Pillath
Andrew Walbran (Google) Andrew Walbran (Google)
@ -20,6 +21,7 @@ Dave MacLachlan (Google)
David Phillip Oster (Google) David Phillip Oster (Google)
David Albert (Bug Labs) David Albert (Bug Labs)
David Olivier David Olivier
dawalker (Google)
Diego Pierotto Diego Pierotto
drejc83 drejc83
Eduardo Castillejo (University of Deusto) Eduardo Castillejo (University of Deusto)

View file

@ -24,6 +24,7 @@ import com.google.zxing.DecodeHintType;
* Common string-related functions. * Common string-related functions.
* *
* @author Sean Owen * @author Sean Owen
* @author Alex Dupre
*/ */
public final class StringUtils { public final class StringUtils {
@ -54,30 +55,33 @@ public final class StringUtils {
return characterSet; return characterSet;
} }
} }
// Does it start with the UTF-8 byte order mark? then guess it's UTF-8
if (bytes.length > 3 &&
bytes[0] == (byte) 0xEF &&
bytes[1] == (byte) 0xBB &&
bytes[2] == (byte) 0xBF) {
return UTF8;
}
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
// which should be by far the most common encodings. ISO-8859-1 // which should be by far the most common encodings.
// should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
// uses this as a first byte of a two-byte character. If we see this
// followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
// If we see something else in that second byte, we'll make the risky guess
// that it's UTF-8.
int length = bytes.length; int length = bytes.length;
boolean canBeISO88591 = true; boolean canBeISO88591 = true;
boolean canBeShiftJIS = true; boolean canBeShiftJIS = true;
boolean canBeUTF8 = true; boolean canBeUTF8 = true;
int utf8BytesLeft = 0; int utf8BytesLeft = 0;
int maybeDoubleByteCount = 0; //int utf8LowChars = 0;
int maybeSingleByteKatakanaCount = 0; int utf2BytesChars = 0;
boolean sawLatin1Supplement = false; int utf3BytesChars = 0;
boolean sawUTF8Start = false; int utf4BytesChars = 0;
boolean lastWasPossibleDoubleByteStart = false; int sjisBytesLeft = 0;
//int sjisLowChars = 0;
int sjisKatakanaChars = 0;
//int sjisDoubleBytesChars = 0;
int sjisCurKatakanaWordLength = 0;
int sjisCurDoubleBytesWordLength = 0;
int sjisMaxKatakanaWordLength = 0;
int sjisMaxDoubleBytesWordLength = 0;
//int isoLowChars = 0;
//int isoHighChars = 0;
int isoHighOther = 0;
boolean utf8bom = bytes.length > 3 &&
bytes[0] == (byte) 0xEF &&
bytes[1] == (byte) 0xBB &&
bytes[2] == (byte) 0xBF;
for (int i = 0; for (int i = 0;
i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
@ -86,105 +90,122 @@ public final class StringUtils {
int value = bytes[i] & 0xFF; int value = bytes[i] & 0xFF;
// UTF-8 stuff // UTF-8 stuff
if (value >= 0x80 && value <= 0xBF) { if (canBeUTF8) {
if (utf8BytesLeft > 0) { if (utf8BytesLeft > 0) {
if ((value & 0x80) == 0) {
canBeUTF8 = false;
} else {
utf8BytesLeft--; utf8BytesLeft--;
} }
} else if ((value & 0x80) != 0) {
if ((value & 0x40) == 0) {
canBeUTF8 = false;
} else {
utf8BytesLeft++;
if ((value & 0x20) == 0) {
utf2BytesChars++;
} else {
utf8BytesLeft++;
if ((value & 0x10) == 0) {
utf3BytesChars++;
} else {
utf8BytesLeft++;
if ((value & 0x08) == 0) {
utf4BytesChars++;
} else { } else {
if (utf8BytesLeft > 0) {
canBeUTF8 = false; canBeUTF8 = false;
} }
if (value >= 0xC0 && value <= 0xFD) {
sawUTF8Start = true;
int valueCopy = value;
while ((valueCopy & 0x40) != 0) {
utf8BytesLeft++;
valueCopy <<= 1;
} }
} }
} }
} //else {
//utf8LowChars++;
//}
}
// ISO-8859-1 stuff // ISO-8859-1 stuff
if (canBeISO88591) {
if ((value == 0xC2 || value == 0xC3) && i < length - 1) { if (value > 0x7F && value < 0xA0) {
// This is really a poor hack. The slightly more exotic characters people might want to put in
// a QR Code, by which I mean the Latin-1 supplement characters (e.g. u-umlaut) have encodings
// that start with 0xC2 followed by [0xA0,0xBF], or start with 0xC3 followed by [0x80,0xBF].
int nextValue = bytes[i + 1] & 0xFF;
if (nextValue <= 0xBF &&
((value == 0xC2 && nextValue >= 0xA0) || (value == 0xC3 && nextValue >= 0x80))) {
sawLatin1Supplement = true;
}
}
if (value >= 0x7F && value <= 0x9F) {
canBeISO88591 = false; canBeISO88591 = false;
} else if (value > 0x9F) {
if (value < 0xC0 || value == 0xD7 || value == 0xF7) {
isoHighOther++;
} //else {
//isoHighChars++;
//}
} //else {
//isoLowChars++;
//}
} }
// Shift_JIS stuff // Shift_JIS stuff
if (canBeShiftJIS) {
if (sjisBytesLeft > 0) {
if (value < 0x40 || value == 0x7F || value > 0xFC) {
canBeShiftJIS = false;
} else {
sjisBytesLeft--;
}
} else if (value == 0x80 || value == 0xA0 || value > 0xEF) {
canBeShiftJIS = false;
} else if (value > 0xA0 && value < 0xE0) {
sjisKatakanaChars++;
sjisCurDoubleBytesWordLength = 0;
sjisCurKatakanaWordLength++;
if (sjisCurKatakanaWordLength > sjisMaxKatakanaWordLength) {
sjisMaxKatakanaWordLength = sjisCurKatakanaWordLength;
}
} else if (value > 0x7F) {
sjisBytesLeft++;
//sjisDoubleBytesChars++;
sjisCurKatakanaWordLength = 0;
sjisCurDoubleBytesWordLength++;
if (sjisCurDoubleBytesWordLength > sjisMaxDoubleBytesWordLength) {
sjisMaxDoubleBytesWordLength = sjisCurDoubleBytesWordLength;
}
} else {
//sjisLowChars++;
sjisCurKatakanaWordLength = 0;
sjisCurDoubleBytesWordLength = 0;
}
}
}
if (value >= 0xA1 && value <= 0xDF) { if (canBeUTF8 && utf8BytesLeft > 0) {
// count the number of characters that might be a Shift_JIS single-byte Katakana character
if (!lastWasPossibleDoubleByteStart) {
maybeSingleByteKatakanaCount++;
}
}
if (!lastWasPossibleDoubleByteStart &&
((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
canBeShiftJIS = false;
}
if ((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF)) {
// These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
// second byte.
if (lastWasPossibleDoubleByteStart) {
// If we just checked this and the last byte for being a valid double-byte
// char, don't check starting on this byte. If this and the last byte
// formed a valid pair, then this shouldn't be checked to see if it starts
// a double byte pair of course.
lastWasPossibleDoubleByteStart = false;
} else {
// ... otherwise do check to see if this plus the next byte form a valid
// double byte pair encoding a character.
lastWasPossibleDoubleByteStart = true;
if (i >= bytes.length - 1) {
canBeShiftJIS = false;
} else {
int nextValue = bytes[i + 1] & 0xFF;
if (nextValue < 0x40 || nextValue > 0xFC) {
canBeShiftJIS = false;
} else {
maybeDoubleByteCount++;
}
// There is some conflicting information out there about which bytes can follow which in
// double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
}
}
} else {
lastWasPossibleDoubleByteStart = false;
}
}
if (utf8BytesLeft > 0) {
canBeUTF8 = false; canBeUTF8 = false;
} }
if (canBeShiftJIS && sjisBytesLeft > 0) {
// Easy -- if assuming Shift_JIS and no evidence it can't be, done canBeShiftJIS = false;
if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
return SHIFT_JIS;
} }
if (canBeUTF8 && sawUTF8Start) {
// Easy -- if there is BOM or at least 1 valid not-single byte character (and no evidence it can't be UTF-8), done
if (canBeUTF8 && (utf8bom || utf2BytesChars + utf3BytesChars + utf4BytesChars > 0)) {
return UTF8; return UTF8;
} }
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is: // Easy -- if assuming Shift_JIS or at least 3 valid consecutive not-ascii characters (and no evidence it can't be), done
// - If we saw if (canBeShiftJIS && (ASSUME_SHIFT_JIS || sjisMaxKatakanaWordLength >= 3 || sjisMaxDoubleBytesWordLength >= 3)) {
// - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
// - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
// - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
return SHIFT_JIS; return SHIFT_JIS;
} }
// Otherwise, we default to ISO-8859-1 unless we know it can't be // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough for short words. The crude heuristic is:
if (!sawLatin1Supplement && canBeISO88591) { // - If we saw
// - only two consecutive katakana chars in the whole text, or
// - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
// - then we conclude Shift_JIS, else ISO-8859-1
if (canBeISO88591 && canBeShiftJIS) {
return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= length
? SHIFT_JIS : ISO88591;
}
// Otherwise, try in order ISO-8859-1, Shift JIS, UTF-8 and fall back to default platform encoding
if (canBeISO88591) {
return ISO88591; return ISO88591;
} }
if (canBeShiftJIS) {
return SHIFT_JIS;
}
if (canBeUTF8) {
return UTF8;
}
// Otherwise, we take a wild guess with platform encoding // Otherwise, we take a wild guess with platform encoding
return PLATFORM_DEFAULT_ENCODING; return PLATFORM_DEFAULT_ENCODING;
} }

View file

@ -0,0 +1,72 @@
/*
* Copyright 2012 ZXing authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.zxing.common;
import org.junit.Assert;
import org.junit.Test;
import java.nio.charset.Charset;
/**
 * Tests for the character-encoding guess made by {@code StringUtils.guessEncoding}
 * on short byte sequences.
 *
 * <p>Test inputs are written as explicit byte arrays (see {@link #main(String[])} for a
 * generator) so that the expectations do not depend on the encoding of this source file.
 */
public final class StringUtilsTestCase extends Assert {

  @Test
  public void testShortShiftJIS_1() {
    // Shift_JIS bytes for the two kanji U+91D1 U+9B5A ("goldfish")
    doTest(new byte[] { (byte) 0x8b, (byte) 0xe0, (byte) 0x8b, (byte) 0x9b, }, "SJIS");
  }

  @Test
  public void testShortISO88591_1() {
    // ISO-8859-1 bytes for 'b', U+00E5 (a with ring above), 'd'
    doTest(new byte[] { (byte) 0x62, (byte) 0xe5, (byte) 0x64, }, "ISO-8859-1");
  }

  @Test
  public void testMixedShiftJIS_1() {
    // ASCII "Hello " followed by the Shift_JIS kanji U+91D1 and an ASCII '!'
    doTest(new byte[] { (byte) 0x48, (byte) 0x65, (byte) 0x6c, (byte) 0x6c, (byte) 0x6f,
                        (byte) 0x20, (byte) 0x8b, (byte) 0xe0, (byte) 0x21, },
           "SJIS");
  }

  /**
   * Asserts that the encoding guessed for {@code bytes} resolves to the same
   * {@link Charset} as {@code charsetName}.
   *
   * <p>Both names are resolved through {@link Charset#forName(String)} and the resulting
   * {@code Charset} objects are compared, rather than comparing the name strings directly.
   *
   * @param bytes raw encoded text to hand to the guesser
   * @param charsetName name of the charset the guesser is expected to report
   */
  private static void doTest(byte[] bytes, String charsetName) {
    Charset charset = Charset.forName(charsetName);
    // No decode hints are supplied, so the guess is driven by the bytes alone.
    String guessedName = StringUtils.guessEncoding(bytes, null);
    Charset guessedEncoding = Charset.forName(guessedName);
    assertEquals(charset, guessedEncoding);
  }

  /**
   * Utility for printing out a string in given encoding as a Java statement, since it's better
   * to write that into the Java source file rather than risk character encoding issues in the
   * source file itself.
   *
   * @param args exactly two elements: the text to encode, then the name of the charset
   *     to encode it with
   * @throws IllegalArgumentException if {@code args} does not hold exactly two elements
   */
  public static void main(String[] args) {
    // Fail with a usage hint instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length != 2) {
      throw new IllegalArgumentException(
          "Usage: StringUtilsTestCase <text> <charset name>");
    }
    String text = args[0];
    Charset charset = Charset.forName(args[1]);
    StringBuilder declaration = new StringBuilder();
    declaration.append("new byte[] { ");
    for (byte b : text.getBytes(charset)) {
      declaration.append("(byte) 0x");
      // Mask to 0..255 so negative bytes are not printed as sign-extended ints.
      declaration.append(Integer.toHexString(b & 0xFF));
      // The separator is also emitted after the last element; a trailing comma is
      // legal in a Java array initializer, so the output can be pasted as-is.
      declaration.append(", ");
    }
    declaration.append('}');
    System.out.println(declaration);
  }

}