formatting cleanup before trying to make the last failing C++ qr blackbox test pass

git-svn-id: https://zxing.googlecode.com/svn/trunk@1964 59b500cc-1b3d-0410-9834-0bbf25fbcc57
This commit is contained in:
smparkes@smparkes.net 2011-10-13 15:20:25 +00:00
parent 9558d83d71
commit beeef242b2
5 changed files with 329 additions and 321 deletions

View file

@ -1,3 +1,4 @@
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
/* /*
* BitMatrix.cpp * BitMatrix.cpp
* zxing * zxing
@ -24,34 +25,41 @@
#include <sstream> #include <sstream>
#include <string> #include <string>
namespace zxing { using std::numeric_limits;
using namespace std; using std::ostream;
using std::ostringstream;
unsigned int logDigits(unsigned digits) { using zxing::BitMatrix;
unsigned log = 0; using zxing::BitArray;
unsigned val = 1; using zxing::Ref;
while (val < digits) {
log++; namespace {
val <<= 1; unsigned int logDigits(unsigned digits) {
unsigned log = 0;
unsigned val = 1;
while (val < digits) {
log++;
val <<= 1;
}
return log;
} }
return log;
}
const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits; const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits;
const unsigned int logBits = logDigits(bitsPerWord); const unsigned int logBits = logDigits(bitsPerWord);
const unsigned int bitsMask = (1 << logBits) - 1; const unsigned int bitsMask = (1 << logBits) - 1;
static size_t wordsForSize(size_t width, size_t height) { size_t wordsForSize(size_t width, size_t height) {
size_t bits = width * height; size_t bits = width * height;
int arraySize = bits >> logBits; int arraySize = bits >> logBits;
if (bits - (arraySize << logBits) != 0) { if (bits - (arraySize << logBits) != 0) {
arraySize++; arraySize++;
}
return arraySize;
} }
return arraySize;
} }
BitMatrix::BitMatrix(size_t dimension) : BitMatrix::BitMatrix(size_t dimension) :
width_(dimension), height_(dimension), words_(0), bits_(NULL) { width_(dimension), height_(dimension), words_(0), bits_(NULL) {
words_ = wordsForSize(width_, height_); words_ = wordsForSize(width_, height_);
bits_ = new unsigned int[words_]; bits_ = new unsigned int[words_];
@ -59,7 +67,7 @@ BitMatrix::BitMatrix(size_t dimension) :
} }
BitMatrix::BitMatrix(size_t width, size_t height) : BitMatrix::BitMatrix(size_t width, size_t height) :
width_(width), height_(height), words_(0), bits_(NULL) { width_(width), height_(height), words_(0), bits_(NULL) {
words_ = wordsForSize(width_, height_); words_ = wordsForSize(width_, height_);
bits_ = new unsigned int[words_]; bits_ = new unsigned int[words_];
@ -160,19 +168,20 @@ unsigned int* BitMatrix::getBits() const {
return bits_; return bits_;
} }
ostream& operator<<(ostream &out, const BitMatrix &bm) { namespace zxing {
for (size_t y = 0; y < bm.height_; y++) { ostream& operator<<(ostream &out, const BitMatrix &bm) {
for (size_t x = 0; x < bm.width_; x++) { for (size_t y = 0; y < bm.height_; y++) {
out << (bm.get(x, y) ? "X " : " "); for (size_t x = 0; x < bm.width_; x++) {
out << (bm.get(x, y) ? "X " : " ");
}
out << "\n";
} }
out << "\n"; return out;
} }
return out;
} }
const char *BitMatrix::description() {
const char* BitMatrix::description() {
ostringstream out; ostringstream out;
out << *this; out << *this;
return out.str().c_str(); return out.str().c_str();
} }
}

View file

@ -1,3 +1,4 @@
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
#ifndef __BIT_MATRIX_H__ #ifndef __BIT_MATRIX_H__
#define __BIT_MATRIX_H__ #define __BIT_MATRIX_H__

View file

@ -1,3 +1,4 @@
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
/* /*
* QRCodeReader.cpp * QRCodeReader.cpp
* zxing * zxing

View file

@ -1,3 +1,4 @@
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
#ifndef __QR_CODE_READER_H__ #ifndef __QR_CODE_READER_H__
#define __QR_CODE_READER_H__ #define __QR_CODE_READER_H__

View file

@ -1,5 +1,5 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- // -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
* /*
* DecodedBitStreamParser.cpp * DecodedBitStreamParser.cpp
* zxing * zxing
* *
@ -35,329 +35,325 @@
#define ICONV_CONST /**/ #define ICONV_CONST /**/
#endif #endif
using namespace std;
using namespace zxing; using namespace zxing;
using namespace zxing::qrcode;
namespace zxing { const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] =
namespace qrcode { { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
};
using namespace std; const char *DecodedBitStreamParser::ASCII = "ASCII";
const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
const char *DecodedBitStreamParser::UTF8 = "UTF-8";
const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
};
const char *DecodedBitStreamParser::ASCII = "ASCII";
const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
const char *DecodedBitStreamParser::UTF8 = "UTF-8";
const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
#ifndef NO_ICONV #ifndef NO_ICONV
if (nIn == 0) { if (nIn == 0) {
return; return;
} }
iconv_t cd = iconv_open(UTF8, src); iconv_t cd = iconv_open(UTF8, src);
const int maxOut = 4 * nIn + 1; const int maxOut = 4 * nIn + 1;
unsigned char* bufOut = new unsigned char[maxOut]; unsigned char* bufOut = new unsigned char[maxOut];
ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn; ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
size_t nFrom = nIn; size_t nFrom = nIn;
char *toPtr = (char *)bufOut; char *toPtr = (char *)bufOut;
size_t nTo = maxOut; size_t nTo = maxOut;
while (nFrom > 0) { while (nFrom > 0) {
size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo); size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
if (oneway == (size_t)(-1)) { if (oneway == (size_t)(-1)) {
iconv_close(cd);
delete[] bufOut;
throw ReaderException("error converting characters");
}
}
iconv_close(cd); iconv_close(cd);
int nResult = maxOut - nTo;
bufOut[nResult] = '\0';
result.append((const char *)bufOut);
delete[] bufOut; delete[] bufOut;
throw ReaderException("error converting characters");
}
}
iconv_close(cd);
int nResult = maxOut - nTo;
bufOut[nResult] = '\0';
result.append((const char *)bufOut);
delete[] bufOut;
#else #else
result.append((const char *)bufIn, nIn); result.append((const char *)bufIn, nIn);
#endif #endif
}
void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
// Each character will require 2 bytes. Read the characters as 2-byte pairs
// and decode as Shift_JIS afterwards
size_t nBytes = 2 * count;
unsigned char* buffer = new unsigned char[nBytes];
int offset = 0;
while (count > 0) {
// Each 13 bits encodes a 2-byte character
int twoBytes = bits->readBits(13);
int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
if (assembledTwoBytes < 0x01F00) {
// In the 0x8140 to 0x9FFC range
assembledTwoBytes += 0x08140;
} else {
// In the 0xE040 to 0xEBBF range
assembledTwoBytes += 0x0C140;
} }
buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
buffer[offset + 1] = (unsigned char)assembledTwoBytes;
offset += 2;
count--;
}
void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) { append(result, buffer, nBytes, SHIFT_JIS);
// Each character will require 2 bytes. Read the characters as 2-byte pairs delete[] buffer;
// and decode as Shift_JIS afterwards }
size_t nBytes = 2 * count;
unsigned char* buffer = new unsigned char[nBytes];
int offset = 0;
while (count > 0) {
// Each 13 bits encodes a 2-byte character
int twoBytes = bits->readBits(13); void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0); int nBytes = count;
if (assembledTwoBytes < 0x01F00) { unsigned char* readBytes = new unsigned char[nBytes];
// In the 0x8140 to 0x9FFC range if (count << 3 > bits->available()) {
assembledTwoBytes += 0x08140; ostringstream s;
} else { s << "Count too large: " << count;
// In the 0xE040 to 0xEBBF range delete[] readBytes;
assembledTwoBytes += 0x0C140; throw ReaderException(s.str().c_str());
} }
buffer[offset] = (unsigned char)(assembledTwoBytes >> 8); for (int i = 0; i < count; i++) {
buffer[offset + 1] = (unsigned char)assembledTwoBytes; readBytes[i] = (unsigned char)bits->readBits(8);
offset += 2; }
count--; // The spec isn't clear on this mode; see
} // section 6.4.5: it does not say which encoding to assume
// upon decoding. I have seen ISO-8859-1 used as well as
// Shift_JIS -- without anything like an ECI designator to
// give a hint.
const char *encoding = guessEncoding(readBytes, nBytes);
append(result, readBytes, nBytes, encoding);
delete[] readBytes;
}
append(result, buffer, nBytes, SHIFT_JIS); void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
delete[] buffer; int nBytes = count;
unsigned char* bytes = new unsigned char[nBytes];
int i = 0;
// Read three digits at a time
while (count >= 3) {
// Each 10 bits encodes three digits
if (bits->available() < 10) {
throw ReaderException("format exception");
} }
int threeDigitsBits = bits->readBits(10);
void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) { if (threeDigitsBits >= 1000) {
int nBytes = count; ostringstream s;
unsigned char* readBytes = new unsigned char[nBytes]; s << "Illegal value for 3-digit unit: " << threeDigitsBits;
if (count << 3 > bits->available()) {
ostringstream s;
s << "Count too large: " << count;
delete[] readBytes;
throw ReaderException(s.str().c_str());
}
for (int i = 0; i < count; i++) {
readBytes[i] = (unsigned char)bits->readBits(8);
}
// The spec isn't clear on this mode; see
// section 6.4.5: it does not say which encoding to assume
// upon decoding. I have seen ISO-8859-1 used as well as
// Shift_JIS -- without anything like an ECI designator to
// give a hint.
const char *encoding = guessEncoding(readBytes, nBytes);
append(result, readBytes, nBytes, encoding);
delete[] readBytes;
}
void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
int nBytes = count;
unsigned char* bytes = new unsigned char[nBytes];
int i = 0;
// Read three digits at a time
while (count >= 3) {
// Each 10 bits encodes three digits
if (bits->available() < 10) {
throw ReaderException("format exception");
}
int threeDigitsBits = bits->readBits(10);
if (threeDigitsBits >= 1000) {
ostringstream s;
s << "Illegal value for 3-digit unit: " << threeDigitsBits;
delete[] bytes;
throw ReaderException(s.str().c_str());
}
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
count -= 3;
}
if (count == 2) {
if (bits->available() < 7) {
throw ReaderException("format exception");
}
// Two digits left over to read, encoded in 7 bits
int twoDigitsBits = bits->readBits(7);
if (twoDigitsBits >= 100) {
ostringstream s;
s << "Illegal value for 2-digit unit: " << twoDigitsBits;
delete[] bytes;
throw ReaderException(s.str().c_str());
}
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
} else if (count == 1) {
if (bits->available() < 4) {
throw ReaderException("format exception");
}
// One digit left over to read
int digitBits = bits->readBits(4);
if (digitBits >= 10) {
ostringstream s;
s << "Illegal value for digit unit: " << digitBits;
delete[] bytes;
throw ReaderException(s.str().c_str());
}
bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
}
append(result, bytes, nBytes, ASCII);
delete[] bytes; delete[] bytes;
throw ReaderException(s.str().c_str());
} }
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) { bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
int nBytes = count; bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
unsigned char* bytes = new unsigned char[nBytes]; count -= 3;
int i = 0; }
// Read two characters at a time if (count == 2) {
while (count > 1) { if (bits->available() < 7) {
int nextTwoCharsBits = bits->readBits(11); throw ReaderException("format exception");
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45]; }
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45]; // Two digits left over to read, encoded in 7 bits
count -= 2; int twoDigitsBits = bits->readBits(7);
} if (twoDigitsBits >= 100) {
if (count == 1) { ostringstream s;
bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)]; s << "Illegal value for 2-digit unit: " << twoDigitsBits;
}
append(result, bytes, nBytes, ASCII);
delete[] bytes; delete[] bytes;
throw ReaderException(s.str().c_str());
} }
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
} else if (count == 1) {
if (bits->available() < 4) {
throw ReaderException("format exception");
}
// One digit left over to read
int digitBits = bits->readBits(4);
if (digitBits >= 10) {
ostringstream s;
s << "Illegal value for digit unit: " << digitBits;
delete[] bytes;
throw ReaderException(s.str().c_str());
}
bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
}
append(result, bytes, nBytes, ASCII);
delete[] bytes;
}
const char * void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) { int nBytes = count;
const bool ASSUME_SHIFT_JIS = false; unsigned char* bytes = new unsigned char[nBytes];
char const* const PLATFORM_DEFAULT_ENCODING="UTF-8"; int i = 0;
// Read two characters at a time
while (count > 1) {
int nextTwoCharsBits = bits->readBits(11);
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
count -= 2;
}
if (count == 1) {
bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
}
append(result, bytes, nBytes, ASCII);
delete[] bytes;
}
// Does it start with the UTF-8 byte order mark? then guess it's UTF-8 const char *
if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2] DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
== (unsigned char)0xBF) { const bool ASSUME_SHIFT_JIS = false;
return UTF8; char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
}
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS, // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
// which should be by far the most common encodings. ISO-8859-1 if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
// should not have bytes in the 0x80 - 0x9F range, while Shift_JIS == (unsigned char)0xBF) {
// uses this as a first byte of a two-byte character. If we see this return UTF8;
// followed by a valid second byte in Shift_JIS, assume it is Shift_JIS. }
// If we see something else in that second byte, we'll make the risky guess // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
// that it's UTF-8. // which should be by far the most common encodings. ISO-8859-1
bool canBeISO88591 = true; // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
bool canBeShiftJIS = true; // uses this as a first byte of a two-byte character. If we see this
bool canBeUTF8 = true; // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
int utf8BytesLeft = 0; // If we see something else in that second byte, we'll make the risky guess
int maybeDoubleByteCount = 0; // that it's UTF-8.
int maybeSingleByteKatakanaCount = 0; bool canBeISO88591 = true;
bool sawLatin1Supplement = false; bool canBeShiftJIS = true;
bool sawUTF8Start = false; bool canBeUTF8 = true;
bool lastWasPossibleDoubleByteStart = false; int utf8BytesLeft = 0;
for (int i = 0; int maybeDoubleByteCount = 0;
i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); int maybeSingleByteKatakanaCount = 0;
i++) { bool sawLatin1Supplement = false;
int value = bytes[i] & 0xFF; bool sawUTF8Start = false;
bool lastWasPossibleDoubleByteStart = false;
// UTF-8 stuff for (int i = 0;
if (value >= 0x80 && value <= 0xBF) { i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
if (utf8BytesLeft > 0) { i++) {
utf8BytesLeft--; int value = bytes[i] & 0xFF;
}
} else { // UTF-8 stuff
if (utf8BytesLeft > 0) { if (value >= 0x80 && value <= 0xBF) {
canBeUTF8 = false; if (utf8BytesLeft > 0) {
} utf8BytesLeft--;
if (value >= 0xC0 && value <= 0xFD) {
sawUTF8Start = true;
int valueCopy = value;
while ((valueCopy & 0x40) != 0) {
utf8BytesLeft++;
valueCopy <<= 1;
}
}
}
// Shift_JIS stuff
if (value >= 0xA1 && value <= 0xDF) {
// count the number of characters that might be a Shift_JIS single-byte Katakana character
if (!lastWasPossibleDoubleByteStart) {
maybeSingleByteKatakanaCount++;
}
}
if (!lastWasPossibleDoubleByteStart &&
((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
canBeShiftJIS = false;
}
if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
// These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
// second byte.
if (lastWasPossibleDoubleByteStart) {
// If we just checked this and the last byte for being a valid double-byte
// char, don't check starting on this byte. If this and the last byte
// formed a valid pair, then this shouldn't be checked to see if it starts
// a double byte pair of course.
lastWasPossibleDoubleByteStart = false;
} else {
// ... otherwise do check to see if this plus the next byte form a valid
// double byte pair encoding a character.
lastWasPossibleDoubleByteStart = true;
if (i >= length - 1) {
canBeShiftJIS = false;
} else {
int nextValue = bytes[i + 1] & 0xFF;
if (nextValue < 0x40 || nextValue > 0xFC) {
canBeShiftJIS = false;
} else {
maybeDoubleByteCount++;
}
// There is some conflicting information out there about which bytes can follow which in
// double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
}
}
} else {
lastWasPossibleDoubleByteStart = false;
}
} }
} else {
if (utf8BytesLeft > 0) { if (utf8BytesLeft > 0) {
canBeUTF8 = false; canBeUTF8 = false;
} }
if (value >= 0xC0 && value <= 0xFD) {
// Easy -- if assuming Shift_JIS and no evidence it can't be, done sawUTF8Start = true;
if (canBeShiftJIS && ASSUME_SHIFT_JIS) { int valueCopy = value;
return SHIFT_JIS; while ((valueCopy & 0x40) != 0) {
utf8BytesLeft++;
valueCopy <<= 1;
}
} }
if (canBeUTF8 && sawUTF8Start) {
return UTF8;
}
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
// - If we saw
// - at least 3 bytes that start a double-byte value (bytes that are rare in ISO-8859-1), or
// - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
// - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
return SHIFT_JIS;
}
// Otherwise, we default to ISO-8859-1 unless we know it can't be
if (!sawLatin1Supplement && canBeISO88591) {
return ISO88591;
}
// Otherwise, we take a wild guess with platform encoding
return PLATFORM_DEFAULT_ENCODING;
} }
string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) { // Shift_JIS stuff
string result;
Ref<BitSource> bits(new BitSource(bytes)); if (value >= 0xA1 && value <= 0xDF) {
Mode *mode = &Mode::TERMINATOR; // count the number of characters that might be a Shift_JIS single-byte Katakana character
do { if (!lastWasPossibleDoubleByteStart) {
// While still another segment to read... maybeSingleByteKatakanaCount++;
if (bits->available() < 4) { }
// OK, assume we're done. Really, a TERMINATOR mode should have been recorded here }
mode = &Mode::TERMINATOR; if (!lastWasPossibleDoubleByteStart &&
((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
canBeShiftJIS = false;
}
if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
// These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
// second byte.
if (lastWasPossibleDoubleByteStart) {
// If we just checked this and the last byte for being a valid double-byte
// char, don't check starting on this byte. If this and the last byte
// formed a valid pair, then this shouldn't be checked to see if it starts
// a double byte pair of course.
lastWasPossibleDoubleByteStart = false;
} else {
// ... otherwise do check to see if this plus the next byte form a valid
// double byte pair encoding a character.
lastWasPossibleDoubleByteStart = true;
if (i >= length - 1) {
canBeShiftJIS = false;
} else { } else {
mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits int nextValue = bytes[i + 1] & 0xFF;
} if (nextValue < 0x40 || nextValue > 0xFC) {
if (mode != &Mode::TERMINATOR) { canBeShiftJIS = false;
// How many characters will follow, encoded in this mode?
int count = bits->readBits(mode->getCharacterCountBits(version));
if (mode == &Mode::NUMERIC) {
decodeNumericSegment(bits, result, count);
} else if (mode == &Mode::ALPHANUMERIC) {
decodeAlphanumericSegment(bits, result, count);
} else if (mode == &Mode::BYTE) {
decodeByteSegment(bits, result, count);
} else if (mode == &Mode::KANJI) {
decodeKanjiSegment(bits, result, count);
} else { } else {
throw ReaderException("Unsupported mode indicator"); maybeDoubleByteCount++;
} }
// There is some conflicting information out there about which bytes can follow which in
// double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
} }
} while (mode != &Mode::TERMINATOR); }
return result; } else {
lastWasPossibleDoubleByteStart = false;
} }
} }
if (utf8BytesLeft > 0) {
canBeUTF8 = false;
}
// Easy -- if assuming Shift_JIS and no evidence it can't be, done
if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
return SHIFT_JIS;
}
if (canBeUTF8 && sawUTF8Start) {
return UTF8;
}
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
// - If we saw
// - at least 3 bytes that start a double-byte value (bytes that are rare in ISO-8859-1), or
// - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
// - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
return SHIFT_JIS;
}
// Otherwise, we default to ISO-8859-1 unless we know it can't be
if (!sawLatin1Supplement && canBeISO88591) {
return ISO88591;
}
// Otherwise, we take a wild guess with platform encoding
return PLATFORM_DEFAULT_ENCODING;
}
// Decodes a QR code byte array into a text string.
//
// Wraps `bytes` in a BitSource and consumes it one segment at a time:
//   1. a 4-bit mode indicator, resolved through Mode::forBits();
//   2. a character count, whose bit width is given by
//      mode->getCharacterCountBits(version);
//   3. the segment payload, appended to the result by the decodeXxxSegment()
//      helper that matches the mode.
// The loop ends when a TERMINATOR mode is read, or when fewer than 4 bits
// remain (treated as an implicit terminator).
//
// Returns the concatenation of all decoded segments.
// Throws ReaderException("Unsupported mode indicator") when the mode is none
// of NUMERIC, ALPHANUMERIC, BYTE or KANJI.
// NOTE(review): the segment helpers and BitSource::readBits() may throw as
// well -- confirm against their definitions.
string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
string result;
Ref<BitSource> bits(new BitSource(bytes));
// Start from TERMINATOR; the first loop iteration always overwrites it.
Mode *mode = &Mode::TERMINATOR;
do {
// While still another segment to read...
if (bits->available() < 4) {
// OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
mode = &Mode::TERMINATOR;
} else {
mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
}
// Modes are compared by address.
// NOTE(review): presumably each Mode is a single shared instance -- confirm.
if (mode != &Mode::TERMINATOR) {
// How many characters will follow, encoded in this mode?
int count = bits->readBits(mode->getCharacterCountBits(version));
// Dispatch on the mode; each helper reads its payload from `bits` and
// appends the decoded text to `result`.
if (mode == &Mode::NUMERIC) {
decodeNumericSegment(bits, result, count);
} else if (mode == &Mode::ALPHANUMERIC) {
decodeAlphanumericSegment(bits, result, count);
} else if (mode == &Mode::BYTE) {
decodeByteSegment(bits, result, count);
} else if (mode == &Mode::KANJI) {
decodeKanjiSegment(bits, result, count);
} else {
// Any other mode returned by Mode::forBits() is not handled here.
throw ReaderException("Unsupported mode indicator");
}
}
} while (mode != &Mode::TERMINATOR);
return result;
} }