From 4f4aea47ca56a738c03e0e29b13c3df2d098e5a6 Mon Sep 17 00:00:00 2001 From: srowen Date: Mon, 2 Aug 2010 04:52:06 +0000 Subject: [PATCH] Issue 489 update the port git-svn-id: https://zxing.googlecode.com/svn/trunk@1501 59b500cc-1b3d-0410-9834-0bbf25fbcc57 --- .../qrcode/decoder/DecodedBitStreamParser.cpp | 104 ++++++++++++++---- 1 file changed, 85 insertions(+), 19 deletions(-) diff --git a/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp b/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp index 65390abd1..8c0fdd932 100644 --- a/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp +++ b/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp @@ -201,6 +201,9 @@ void DecodedBitStreamParser::decodeAlphanumericSegment(Ref bits, std: const char * DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) { + const bool ASSUME_SHIFT_JIS = false; + char const* const PLATFORM_DEFAULT_ENCODING="UTF-8"; + // Does it start with the UTF-8 byte order mark? then guess it's UTF-8 if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2] == (unsigned char)0xBF) { @@ -214,17 +217,56 @@ DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) { // If we see something else in that second byte, we'll make the risky guess // that it's UTF-8. 
bool canBeISO88591 = true; + bool canBeShiftJIS = true; + bool canBeUTF8 = true; + int utf8BytesLeft = 0; + int maybeDoubleByteCount = 0; + int maybeSingleByteKatakanaCount = 0; + bool sawLatin1Supplement = false; + bool sawUTF8Start = false; bool lastWasPossibleDoubleByteStart = false; - for (int i = 0; i < length; i++) { + for (int i = 0; + i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); + i++) { int value = bytes[i] & 0xFF; - if (value >= 0x80 && value <= 0x9F && i < length - 1) { - canBeISO88591 = false; - // ISO-8859-1 shouldn't use this, but before we decide it is Shift_JIS, - // just double check that it is followed by a byte that's valid in - // the Shift_JIS encoding + + // UTF-8 stuff + if (value >= 0x80 && value <= 0xBF) { + if (utf8BytesLeft > 0) { + utf8BytesLeft--; + } + } else { + if (utf8BytesLeft > 0) { + canBeUTF8 = false; + } + if (value >= 0xC0 && value <= 0xFD) { + sawUTF8Start = true; + int valueCopy = value; + while ((valueCopy & 0x40) != 0) { + utf8BytesLeft++; + valueCopy <<= 1; + } + } + } + + // Shift_JIS stuff + + if (value >= 0xA1 && value <= 0xDF) { + // count the number of characters that might be a Shift_JIS single-byte Katakana character + if (!lastWasPossibleDoubleByteStart) { + maybeSingleByteKatakanaCount++; + } + } + if (!lastWasPossibleDoubleByteStart && + ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) { + canBeShiftJIS = false; + } + if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) { + // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid + // second byte. if (lastWasPossibleDoubleByteStart) { // If we just checked this and the last byte for being a valid double-byte - // char, don't check starting on this byte. If the this and the last byte + // char, don't check starting on this byte. If this and the last byte // formed a valid pair, then this shouldn't be checked to see if it starts // a double byte pair of course. 
lastWasPossibleDoubleByteStart = false; @@ -232,24 +274,48 @@ DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) { // ... otherwise do check to see if this plus the next byte form a valid // double byte pair encoding a character. lastWasPossibleDoubleByteStart = true; - int nextValue = bytes[i + 1] & 0xFF; - if ((value & 0x1) == 0) { - // if even, next value should be in [0x9F,0xFC] - // if not, we'll guess UTF-8 - if (nextValue < 0x9F || nextValue > 0xFC) { - return UTF8; - } + if (i >= length - 1) { + canBeShiftJIS = false; } else { - // if odd, next value should be in [0x40,0x9E] - // if not, we'll guess UTF-8 - if (nextValue < 0x40 || nextValue > 0x9E) { - return UTF8; + int nextValue = bytes[i + 1] & 0xFF; + if (nextValue < 0x40 || nextValue > 0xFC) { + canBeShiftJIS = false; + } else { + maybeDoubleByteCount++; } + // There is some conflicting information out there about which bytes can follow which in + // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice. } } + } else { + lastWasPossibleDoubleByteStart = false; } } - return canBeISO88591 ? ISO88591 : SHIFT_JIS; + if (utf8BytesLeft > 0) { + canBeUTF8 = false; + } + + // Easy -- if assuming Shift_JIS and no evidence it can't be, done + if (canBeShiftJIS && ASSUME_SHIFT_JIS) { + return SHIFT_JIS; + } + if (canBeUTF8 && sawUTF8Start) { + return UTF8; + } + // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. 
The crude heuristic is: + // - If we saw + // - at least 3 bytes that start a double-byte value (bytes that are rare in ISO-8859-1), or + // - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1), + // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS + if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) { + return SHIFT_JIS; + } + // Otherwise, we default to ISO-8859-1 unless we know it can't be + if (!sawLatin1Supplement && canBeISO88591) { + return ISO88591; + } + // Otherwise, we take a wild guess with platform encoding + return PLATFORM_DEFAULT_ENCODING; } string DecodedBitStreamParser::decode(ArrayRef bytes, Version *version) {