mirror of
https://github.com/zxing/zxing.git
synced 2025-03-05 20:48:51 -08:00
formatting cleanup before trying to make the last failing C++ qr blackbox test pass
git-svn-id: https://zxing.googlecode.com/svn/trunk@1964 59b500cc-1b3d-0410-9834-0bbf25fbcc57
This commit is contained in:
parent
9558d83d71
commit
beeef242b2
|
@ -1,3 +1,4 @@
|
||||||
|
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
|
||||||
/*
|
/*
|
||||||
* BitMatrix.cpp
|
* BitMatrix.cpp
|
||||||
* zxing
|
* zxing
|
||||||
|
@ -24,34 +25,41 @@
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace zxing {
|
using std::numeric_limits;
|
||||||
using namespace std;
|
using std::ostream;
|
||||||
|
using std::ostringstream;
|
||||||
|
|
||||||
unsigned int logDigits(unsigned digits) {
|
using zxing::BitMatrix;
|
||||||
unsigned log = 0;
|
using zxing::BitArray;
|
||||||
unsigned val = 1;
|
using zxing::Ref;
|
||||||
while (val < digits) {
|
|
||||||
log++;
|
namespace {
|
||||||
val <<= 1;
|
unsigned int logDigits(unsigned digits) {
|
||||||
|
unsigned log = 0;
|
||||||
|
unsigned val = 1;
|
||||||
|
while (val < digits) {
|
||||||
|
log++;
|
||||||
|
val <<= 1;
|
||||||
|
}
|
||||||
|
return log;
|
||||||
}
|
}
|
||||||
return log;
|
|
||||||
}
|
|
||||||
|
|
||||||
const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits;
|
const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits;
|
||||||
const unsigned int logBits = logDigits(bitsPerWord);
|
const unsigned int logBits = logDigits(bitsPerWord);
|
||||||
const unsigned int bitsMask = (1 << logBits) - 1;
|
const unsigned int bitsMask = (1 << logBits) - 1;
|
||||||
|
|
||||||
static size_t wordsForSize(size_t width, size_t height) {
|
size_t wordsForSize(size_t width, size_t height) {
|
||||||
size_t bits = width * height;
|
size_t bits = width * height;
|
||||||
int arraySize = bits >> logBits;
|
int arraySize = bits >> logBits;
|
||||||
if (bits - (arraySize << logBits) != 0) {
|
if (bits - (arraySize << logBits) != 0) {
|
||||||
arraySize++;
|
arraySize++;
|
||||||
|
}
|
||||||
|
return arraySize;
|
||||||
}
|
}
|
||||||
return arraySize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
BitMatrix::BitMatrix(size_t dimension) :
|
BitMatrix::BitMatrix(size_t dimension) :
|
||||||
width_(dimension), height_(dimension), words_(0), bits_(NULL) {
|
width_(dimension), height_(dimension), words_(0), bits_(NULL) {
|
||||||
|
|
||||||
words_ = wordsForSize(width_, height_);
|
words_ = wordsForSize(width_, height_);
|
||||||
bits_ = new unsigned int[words_];
|
bits_ = new unsigned int[words_];
|
||||||
|
@ -59,7 +67,7 @@ BitMatrix::BitMatrix(size_t dimension) :
|
||||||
}
|
}
|
||||||
|
|
||||||
BitMatrix::BitMatrix(size_t width, size_t height) :
|
BitMatrix::BitMatrix(size_t width, size_t height) :
|
||||||
width_(width), height_(height), words_(0), bits_(NULL) {
|
width_(width), height_(height), words_(0), bits_(NULL) {
|
||||||
|
|
||||||
words_ = wordsForSize(width_, height_);
|
words_ = wordsForSize(width_, height_);
|
||||||
bits_ = new unsigned int[words_];
|
bits_ = new unsigned int[words_];
|
||||||
|
@ -160,19 +168,20 @@ unsigned int* BitMatrix::getBits() const {
|
||||||
return bits_;
|
return bits_;
|
||||||
}
|
}
|
||||||
|
|
||||||
ostream& operator<<(ostream &out, const BitMatrix &bm) {
|
namespace zxing {
|
||||||
for (size_t y = 0; y < bm.height_; y++) {
|
ostream& operator<<(ostream &out, const BitMatrix &bm) {
|
||||||
for (size_t x = 0; x < bm.width_; x++) {
|
for (size_t y = 0; y < bm.height_; y++) {
|
||||||
out << (bm.get(x, y) ? "X " : " ");
|
for (size_t x = 0; x < bm.width_; x++) {
|
||||||
|
out << (bm.get(x, y) ? "X " : " ");
|
||||||
|
}
|
||||||
|
out << "\n";
|
||||||
}
|
}
|
||||||
out << "\n";
|
return out;
|
||||||
}
|
}
|
||||||
return out;
|
|
||||||
}
|
}
|
||||||
const char *BitMatrix::description() {
|
|
||||||
|
const char* BitMatrix::description() {
|
||||||
ostringstream out;
|
ostringstream out;
|
||||||
out << *this;
|
out << *this;
|
||||||
return out.str().c_str();
|
return out.str().c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
|
||||||
#ifndef __BIT_MATRIX_H__
|
#ifndef __BIT_MATRIX_H__
|
||||||
#define __BIT_MATRIX_H__
|
#define __BIT_MATRIX_H__
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
|
||||||
/*
|
/*
|
||||||
* QRCodeReader.cpp
|
* QRCodeReader.cpp
|
||||||
* zxing
|
* zxing
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
|
||||||
#ifndef __QR_CODE_READER_H__
|
#ifndef __QR_CODE_READER_H__
|
||||||
#define __QR_CODE_READER_H__
|
#define __QR_CODE_READER_H__
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
|
||||||
*
|
/*
|
||||||
* DecodedBitStreamParser.cpp
|
* DecodedBitStreamParser.cpp
|
||||||
* zxing
|
* zxing
|
||||||
*
|
*
|
||||||
|
@ -35,329 +35,325 @@
|
||||||
#define ICONV_CONST /**/
|
#define ICONV_CONST /**/
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
using namespace zxing;
|
using namespace zxing;
|
||||||
|
using namespace zxing::qrcode;
|
||||||
|
|
||||||
namespace zxing {
|
const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] =
|
||||||
namespace qrcode {
|
{ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
|
||||||
|
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
|
||||||
|
'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
|
||||||
|
'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
|
||||||
|
};
|
||||||
|
|
||||||
using namespace std;
|
const char *DecodedBitStreamParser::ASCII = "ASCII";
|
||||||
|
const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
|
||||||
|
const char *DecodedBitStreamParser::UTF8 = "UTF-8";
|
||||||
|
const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
|
||||||
|
const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
|
||||||
|
|
||||||
const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
|
void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
|
||||||
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
|
|
||||||
'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
|
|
||||||
};
|
|
||||||
|
|
||||||
const char *DecodedBitStreamParser::ASCII = "ASCII";
|
|
||||||
const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
|
|
||||||
const char *DecodedBitStreamParser::UTF8 = "UTF-8";
|
|
||||||
const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
|
|
||||||
const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
|
|
||||||
|
|
||||||
void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
|
|
||||||
#ifndef NO_ICONV
|
#ifndef NO_ICONV
|
||||||
if (nIn == 0) {
|
if (nIn == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
iconv_t cd = iconv_open(UTF8, src);
|
iconv_t cd = iconv_open(UTF8, src);
|
||||||
const int maxOut = 4 * nIn + 1;
|
const int maxOut = 4 * nIn + 1;
|
||||||
unsigned char* bufOut = new unsigned char[maxOut];
|
unsigned char* bufOut = new unsigned char[maxOut];
|
||||||
|
|
||||||
ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
|
ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
|
||||||
size_t nFrom = nIn;
|
size_t nFrom = nIn;
|
||||||
char *toPtr = (char *)bufOut;
|
char *toPtr = (char *)bufOut;
|
||||||
size_t nTo = maxOut;
|
size_t nTo = maxOut;
|
||||||
|
|
||||||
while (nFrom > 0) {
|
while (nFrom > 0) {
|
||||||
size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
|
size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
|
||||||
if (oneway == (size_t)(-1)) {
|
if (oneway == (size_t)(-1)) {
|
||||||
iconv_close(cd);
|
|
||||||
delete[] bufOut;
|
|
||||||
throw ReaderException("error converting characters");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
iconv_close(cd);
|
iconv_close(cd);
|
||||||
|
|
||||||
int nResult = maxOut - nTo;
|
|
||||||
bufOut[nResult] = '\0';
|
|
||||||
result.append((const char *)bufOut);
|
|
||||||
delete[] bufOut;
|
delete[] bufOut;
|
||||||
|
throw ReaderException("error converting characters");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
iconv_close(cd);
|
||||||
|
|
||||||
|
int nResult = maxOut - nTo;
|
||||||
|
bufOut[nResult] = '\0';
|
||||||
|
result.append((const char *)bufOut);
|
||||||
|
delete[] bufOut;
|
||||||
#else
|
#else
|
||||||
result.append((const char *)bufIn, nIn);
|
result.append((const char *)bufIn, nIn);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
|
||||||
|
// Each character will require 2 bytes. Read the characters as 2-byte pairs
|
||||||
|
// and decode as Shift_JIS afterwards
|
||||||
|
size_t nBytes = 2 * count;
|
||||||
|
unsigned char* buffer = new unsigned char[nBytes];
|
||||||
|
int offset = 0;
|
||||||
|
while (count > 0) {
|
||||||
|
// Each 13 bits encodes a 2-byte character
|
||||||
|
|
||||||
|
int twoBytes = bits->readBits(13);
|
||||||
|
int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
|
||||||
|
if (assembledTwoBytes < 0x01F00) {
|
||||||
|
// In the 0x8140 to 0x9FFC range
|
||||||
|
assembledTwoBytes += 0x08140;
|
||||||
|
} else {
|
||||||
|
// In the 0xE040 to 0xEBBF range
|
||||||
|
assembledTwoBytes += 0x0C140;
|
||||||
}
|
}
|
||||||
|
buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
|
||||||
|
buffer[offset + 1] = (unsigned char)assembledTwoBytes;
|
||||||
|
offset += 2;
|
||||||
|
count--;
|
||||||
|
}
|
||||||
|
|
||||||
void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
|
append(result, buffer, nBytes, SHIFT_JIS);
|
||||||
// Each character will require 2 bytes. Read the characters as 2-byte pairs
|
delete[] buffer;
|
||||||
// and decode as Shift_JIS afterwards
|
}
|
||||||
size_t nBytes = 2 * count;
|
|
||||||
unsigned char* buffer = new unsigned char[nBytes];
|
|
||||||
int offset = 0;
|
|
||||||
while (count > 0) {
|
|
||||||
// Each 13 bits encodes a 2-byte character
|
|
||||||
|
|
||||||
int twoBytes = bits->readBits(13);
|
void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
|
||||||
int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
|
int nBytes = count;
|
||||||
if (assembledTwoBytes < 0x01F00) {
|
unsigned char* readBytes = new unsigned char[nBytes];
|
||||||
// In the 0x8140 to 0x9FFC range
|
if (count << 3 > bits->available()) {
|
||||||
assembledTwoBytes += 0x08140;
|
ostringstream s;
|
||||||
} else {
|
s << "Count too large: " << count;
|
||||||
// In the 0xE040 to 0xEBBF range
|
delete[] readBytes;
|
||||||
assembledTwoBytes += 0x0C140;
|
throw ReaderException(s.str().c_str());
|
||||||
}
|
}
|
||||||
buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
|
for (int i = 0; i < count; i++) {
|
||||||
buffer[offset + 1] = (unsigned char)assembledTwoBytes;
|
readBytes[i] = (unsigned char)bits->readBits(8);
|
||||||
offset += 2;
|
}
|
||||||
count--;
|
// The spec isn't clear on this mode; see
|
||||||
}
|
// section 6.4.5: t does not say which encoding to assuming
|
||||||
|
// upon decoding. I have seen ISO-8859-1 used as well as
|
||||||
|
// Shift_JIS -- without anything like an ECI designator to
|
||||||
|
// give a hint.
|
||||||
|
const char *encoding = guessEncoding(readBytes, nBytes);
|
||||||
|
append(result, readBytes, nBytes, encoding);
|
||||||
|
delete[] readBytes;
|
||||||
|
}
|
||||||
|
|
||||||
append(result, buffer, nBytes, SHIFT_JIS);
|
void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
|
||||||
delete[] buffer;
|
int nBytes = count;
|
||||||
|
unsigned char* bytes = new unsigned char[nBytes];
|
||||||
|
int i = 0;
|
||||||
|
// Read three digits at a time
|
||||||
|
while (count >= 3) {
|
||||||
|
// Each 10 bits encodes three digits
|
||||||
|
if (bits->available() < 10) {
|
||||||
|
throw ReaderException("format exception");
|
||||||
}
|
}
|
||||||
|
int threeDigitsBits = bits->readBits(10);
|
||||||
void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
|
if (threeDigitsBits >= 1000) {
|
||||||
int nBytes = count;
|
ostringstream s;
|
||||||
unsigned char* readBytes = new unsigned char[nBytes];
|
s << "Illegal value for 3-digit unit: " << threeDigitsBits;
|
||||||
if (count << 3 > bits->available()) {
|
|
||||||
ostringstream s;
|
|
||||||
s << "Count too large: " << count;
|
|
||||||
delete[] readBytes;
|
|
||||||
throw ReaderException(s.str().c_str());
|
|
||||||
}
|
|
||||||
for (int i = 0; i < count; i++) {
|
|
||||||
readBytes[i] = (unsigned char)bits->readBits(8);
|
|
||||||
}
|
|
||||||
// The spec isn't clear on this mode; see
|
|
||||||
// section 6.4.5: t does not say which encoding to assuming
|
|
||||||
// upon decoding. I have seen ISO-8859-1 used as well as
|
|
||||||
// Shift_JIS -- without anything like an ECI designator to
|
|
||||||
// give a hint.
|
|
||||||
const char *encoding = guessEncoding(readBytes, nBytes);
|
|
||||||
append(result, readBytes, nBytes, encoding);
|
|
||||||
delete[] readBytes;
|
|
||||||
}
|
|
||||||
|
|
||||||
void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
|
|
||||||
int nBytes = count;
|
|
||||||
unsigned char* bytes = new unsigned char[nBytes];
|
|
||||||
int i = 0;
|
|
||||||
// Read three digits at a time
|
|
||||||
while (count >= 3) {
|
|
||||||
// Each 10 bits encodes three digits
|
|
||||||
if (bits->available() < 10) {
|
|
||||||
throw ReaderException("format exception");
|
|
||||||
}
|
|
||||||
int threeDigitsBits = bits->readBits(10);
|
|
||||||
if (threeDigitsBits >= 1000) {
|
|
||||||
ostringstream s;
|
|
||||||
s << "Illegal value for 3-digit unit: " << threeDigitsBits;
|
|
||||||
delete[] bytes;
|
|
||||||
throw ReaderException(s.str().c_str());
|
|
||||||
}
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
|
|
||||||
count -= 3;
|
|
||||||
}
|
|
||||||
if (count == 2) {
|
|
||||||
if (bits->available() < 7) {
|
|
||||||
throw ReaderException("format exception");
|
|
||||||
}
|
|
||||||
// Two digits left over to read, encoded in 7 bits
|
|
||||||
int twoDigitsBits = bits->readBits(7);
|
|
||||||
if (twoDigitsBits >= 100) {
|
|
||||||
ostringstream s;
|
|
||||||
s << "Illegal value for 2-digit unit: " << twoDigitsBits;
|
|
||||||
delete[] bytes;
|
|
||||||
throw ReaderException(s.str().c_str());
|
|
||||||
}
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
|
|
||||||
} else if (count == 1) {
|
|
||||||
if (bits->available() < 4) {
|
|
||||||
throw ReaderException("format exception");
|
|
||||||
}
|
|
||||||
// One digit left over to read
|
|
||||||
int digitBits = bits->readBits(4);
|
|
||||||
if (digitBits >= 10) {
|
|
||||||
ostringstream s;
|
|
||||||
s << "Illegal value for digit unit: " << digitBits;
|
|
||||||
delete[] bytes;
|
|
||||||
throw ReaderException(s.str().c_str());
|
|
||||||
}
|
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
|
|
||||||
}
|
|
||||||
append(result, bytes, nBytes, ASCII);
|
|
||||||
delete[] bytes;
|
delete[] bytes;
|
||||||
|
throw ReaderException(s.str().c_str());
|
||||||
}
|
}
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
|
||||||
void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
|
bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
|
||||||
int nBytes = count;
|
bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
|
||||||
unsigned char* bytes = new unsigned char[nBytes];
|
count -= 3;
|
||||||
int i = 0;
|
}
|
||||||
// Read two characters at a time
|
if (count == 2) {
|
||||||
while (count > 1) {
|
if (bits->available() < 7) {
|
||||||
int nextTwoCharsBits = bits->readBits(11);
|
throw ReaderException("format exception");
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
|
}
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
|
// Two digits left over to read, encoded in 7 bits
|
||||||
count -= 2;
|
int twoDigitsBits = bits->readBits(7);
|
||||||
}
|
if (twoDigitsBits >= 100) {
|
||||||
if (count == 1) {
|
ostringstream s;
|
||||||
bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
|
s << "Illegal value for 2-digit unit: " << twoDigitsBits;
|
||||||
}
|
|
||||||
append(result, bytes, nBytes, ASCII);
|
|
||||||
delete[] bytes;
|
delete[] bytes;
|
||||||
|
throw ReaderException(s.str().c_str());
|
||||||
}
|
}
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
|
||||||
|
} else if (count == 1) {
|
||||||
|
if (bits->available() < 4) {
|
||||||
|
throw ReaderException("format exception");
|
||||||
|
}
|
||||||
|
// One digit left over to read
|
||||||
|
int digitBits = bits->readBits(4);
|
||||||
|
if (digitBits >= 10) {
|
||||||
|
ostringstream s;
|
||||||
|
s << "Illegal value for digit unit: " << digitBits;
|
||||||
|
delete[] bytes;
|
||||||
|
throw ReaderException(s.str().c_str());
|
||||||
|
}
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
|
||||||
|
}
|
||||||
|
append(result, bytes, nBytes, ASCII);
|
||||||
|
delete[] bytes;
|
||||||
|
}
|
||||||
|
|
||||||
const char *
|
void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
|
||||||
DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
|
int nBytes = count;
|
||||||
const bool ASSUME_SHIFT_JIS = false;
|
unsigned char* bytes = new unsigned char[nBytes];
|
||||||
char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
|
int i = 0;
|
||||||
|
// Read two characters at a time
|
||||||
|
while (count > 1) {
|
||||||
|
int nextTwoCharsBits = bits->readBits(11);
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
|
||||||
|
count -= 2;
|
||||||
|
}
|
||||||
|
if (count == 1) {
|
||||||
|
bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
|
||||||
|
}
|
||||||
|
append(result, bytes, nBytes, ASCII);
|
||||||
|
delete[] bytes;
|
||||||
|
}
|
||||||
|
|
||||||
// Does it start with the UTF-8 byte order mark? then guess it's UTF-8
|
const char *
|
||||||
if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
|
DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
|
||||||
== (unsigned char)0xBF) {
|
const bool ASSUME_SHIFT_JIS = false;
|
||||||
return UTF8;
|
char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
|
||||||
}
|
|
||||||
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
|
// Does it start with the UTF-8 byte order mark? then guess it's UTF-8
|
||||||
// which should be by far the most common encodings. ISO-8859-1
|
if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
|
||||||
// should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
|
== (unsigned char)0xBF) {
|
||||||
// uses this as a first byte of a two-byte character. If we see this
|
return UTF8;
|
||||||
// followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
|
}
|
||||||
// If we see something else in that second byte, we'll make the risky guess
|
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
|
||||||
// that it's UTF-8.
|
// which should be by far the most common encodings. ISO-8859-1
|
||||||
bool canBeISO88591 = true;
|
// should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
|
||||||
bool canBeShiftJIS = true;
|
// uses this as a first byte of a two-byte character. If we see this
|
||||||
bool canBeUTF8 = true;
|
// followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
|
||||||
int utf8BytesLeft = 0;
|
// If we see something else in that second byte, we'll make the risky guess
|
||||||
int maybeDoubleByteCount = 0;
|
// that it's UTF-8.
|
||||||
int maybeSingleByteKatakanaCount = 0;
|
bool canBeISO88591 = true;
|
||||||
bool sawLatin1Supplement = false;
|
bool canBeShiftJIS = true;
|
||||||
bool sawUTF8Start = false;
|
bool canBeUTF8 = true;
|
||||||
bool lastWasPossibleDoubleByteStart = false;
|
int utf8BytesLeft = 0;
|
||||||
for (int i = 0;
|
int maybeDoubleByteCount = 0;
|
||||||
i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
|
int maybeSingleByteKatakanaCount = 0;
|
||||||
i++) {
|
bool sawLatin1Supplement = false;
|
||||||
int value = bytes[i] & 0xFF;
|
bool sawUTF8Start = false;
|
||||||
|
bool lastWasPossibleDoubleByteStart = false;
|
||||||
// UTF-8 stuff
|
for (int i = 0;
|
||||||
if (value >= 0x80 && value <= 0xBF) {
|
i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
|
||||||
if (utf8BytesLeft > 0) {
|
i++) {
|
||||||
utf8BytesLeft--;
|
int value = bytes[i] & 0xFF;
|
||||||
}
|
|
||||||
} else {
|
// UTF-8 stuff
|
||||||
if (utf8BytesLeft > 0) {
|
if (value >= 0x80 && value <= 0xBF) {
|
||||||
canBeUTF8 = false;
|
if (utf8BytesLeft > 0) {
|
||||||
}
|
utf8BytesLeft--;
|
||||||
if (value >= 0xC0 && value <= 0xFD) {
|
|
||||||
sawUTF8Start = true;
|
|
||||||
int valueCopy = value;
|
|
||||||
while ((valueCopy & 0x40) != 0) {
|
|
||||||
utf8BytesLeft++;
|
|
||||||
valueCopy <<= 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Shift_JIS stuff
|
|
||||||
|
|
||||||
if (value >= 0xA1 && value <= 0xDF) {
|
|
||||||
// count the number of characters that might be a Shift_JIS single-byte Katakana character
|
|
||||||
if (!lastWasPossibleDoubleByteStart) {
|
|
||||||
maybeSingleByteKatakanaCount++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!lastWasPossibleDoubleByteStart &&
|
|
||||||
((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
|
|
||||||
canBeShiftJIS = false;
|
|
||||||
}
|
|
||||||
if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
|
|
||||||
// These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
|
|
||||||
// second byte.
|
|
||||||
if (lastWasPossibleDoubleByteStart) {
|
|
||||||
// If we just checked this and the last byte for being a valid double-byte
|
|
||||||
// char, don't check starting on this byte. If this and the last byte
|
|
||||||
// formed a valid pair, then this shouldn't be checked to see if it starts
|
|
||||||
// a double byte pair of course.
|
|
||||||
lastWasPossibleDoubleByteStart = false;
|
|
||||||
} else {
|
|
||||||
// ... otherwise do check to see if this plus the next byte form a valid
|
|
||||||
// double byte pair encoding a character.
|
|
||||||
lastWasPossibleDoubleByteStart = true;
|
|
||||||
if (i >= length - 1) {
|
|
||||||
canBeShiftJIS = false;
|
|
||||||
} else {
|
|
||||||
int nextValue = bytes[i + 1] & 0xFF;
|
|
||||||
if (nextValue < 0x40 || nextValue > 0xFC) {
|
|
||||||
canBeShiftJIS = false;
|
|
||||||
} else {
|
|
||||||
maybeDoubleByteCount++;
|
|
||||||
}
|
|
||||||
// There is some conflicting information out there about which bytes can follow which in
|
|
||||||
// double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
lastWasPossibleDoubleByteStart = false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
if (utf8BytesLeft > 0) {
|
if (utf8BytesLeft > 0) {
|
||||||
canBeUTF8 = false;
|
canBeUTF8 = false;
|
||||||
}
|
}
|
||||||
|
if (value >= 0xC0 && value <= 0xFD) {
|
||||||
// Easy -- if assuming Shift_JIS and no evidence it can't be, done
|
sawUTF8Start = true;
|
||||||
if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
|
int valueCopy = value;
|
||||||
return SHIFT_JIS;
|
while ((valueCopy & 0x40) != 0) {
|
||||||
|
utf8BytesLeft++;
|
||||||
|
valueCopy <<= 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (canBeUTF8 && sawUTF8Start) {
|
|
||||||
return UTF8;
|
|
||||||
}
|
|
||||||
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
|
|
||||||
// - If we saw
|
|
||||||
// - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
|
|
||||||
// - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
|
|
||||||
// - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
|
|
||||||
if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
|
|
||||||
return SHIFT_JIS;
|
|
||||||
}
|
|
||||||
// Otherwise, we default to ISO-8859-1 unless we know it can't be
|
|
||||||
if (!sawLatin1Supplement && canBeISO88591) {
|
|
||||||
return ISO88591;
|
|
||||||
}
|
|
||||||
// Otherwise, we take a wild guess with platform encoding
|
|
||||||
return PLATFORM_DEFAULT_ENCODING;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
|
// Shift_JIS stuff
|
||||||
string result;
|
|
||||||
Ref<BitSource> bits(new BitSource(bytes));
|
if (value >= 0xA1 && value <= 0xDF) {
|
||||||
Mode *mode = &Mode::TERMINATOR;
|
// count the number of characters that might be a Shift_JIS single-byte Katakana character
|
||||||
do {
|
if (!lastWasPossibleDoubleByteStart) {
|
||||||
// While still another segment to read...
|
maybeSingleByteKatakanaCount++;
|
||||||
if (bits->available() < 4) {
|
}
|
||||||
// OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
|
}
|
||||||
mode = &Mode::TERMINATOR;
|
if (!lastWasPossibleDoubleByteStart &&
|
||||||
|
((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
|
||||||
|
canBeShiftJIS = false;
|
||||||
|
}
|
||||||
|
if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
|
||||||
|
// These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
|
||||||
|
// second byte.
|
||||||
|
if (lastWasPossibleDoubleByteStart) {
|
||||||
|
// If we just checked this and the last byte for being a valid double-byte
|
||||||
|
// char, don't check starting on this byte. If this and the last byte
|
||||||
|
// formed a valid pair, then this shouldn't be checked to see if it starts
|
||||||
|
// a double byte pair of course.
|
||||||
|
lastWasPossibleDoubleByteStart = false;
|
||||||
|
} else {
|
||||||
|
// ... otherwise do check to see if this plus the next byte form a valid
|
||||||
|
// double byte pair encoding a character.
|
||||||
|
lastWasPossibleDoubleByteStart = true;
|
||||||
|
if (i >= length - 1) {
|
||||||
|
canBeShiftJIS = false;
|
||||||
} else {
|
} else {
|
||||||
mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
|
int nextValue = bytes[i + 1] & 0xFF;
|
||||||
}
|
if (nextValue < 0x40 || nextValue > 0xFC) {
|
||||||
if (mode != &Mode::TERMINATOR) {
|
canBeShiftJIS = false;
|
||||||
// How many characters will follow, encoded in this mode?
|
|
||||||
int count = bits->readBits(mode->getCharacterCountBits(version));
|
|
||||||
if (mode == &Mode::NUMERIC) {
|
|
||||||
decodeNumericSegment(bits, result, count);
|
|
||||||
} else if (mode == &Mode::ALPHANUMERIC) {
|
|
||||||
decodeAlphanumericSegment(bits, result, count);
|
|
||||||
} else if (mode == &Mode::BYTE) {
|
|
||||||
decodeByteSegment(bits, result, count);
|
|
||||||
} else if (mode == &Mode::KANJI) {
|
|
||||||
decodeKanjiSegment(bits, result, count);
|
|
||||||
} else {
|
} else {
|
||||||
throw ReaderException("Unsupported mode indicator");
|
maybeDoubleByteCount++;
|
||||||
}
|
}
|
||||||
|
// There is some conflicting information out there about which bytes can follow which in
|
||||||
|
// double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
|
||||||
}
|
}
|
||||||
} while (mode != &Mode::TERMINATOR);
|
}
|
||||||
return result;
|
} else {
|
||||||
|
lastWasPossibleDoubleByteStart = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
if (utf8BytesLeft > 0) {
|
||||||
|
canBeUTF8 = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Easy -- if assuming Shift_JIS and no evidence it can't be, done
|
||||||
|
if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
|
||||||
|
return SHIFT_JIS;
|
||||||
|
}
|
||||||
|
if (canBeUTF8 && sawUTF8Start) {
|
||||||
|
return UTF8;
|
||||||
|
}
|
||||||
|
// Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
|
||||||
|
// - If we saw
|
||||||
|
// - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
|
||||||
|
// - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
|
||||||
|
// - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
|
||||||
|
if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
|
||||||
|
return SHIFT_JIS;
|
||||||
|
}
|
||||||
|
// Otherwise, we default to ISO-8859-1 unless we know it can't be
|
||||||
|
if (!sawLatin1Supplement && canBeISO88591) {
|
||||||
|
return ISO88591;
|
||||||
|
}
|
||||||
|
// Otherwise, we take a wild guess with platform encoding
|
||||||
|
return PLATFORM_DEFAULT_ENCODING;
|
||||||
|
}
|
||||||
|
|
||||||
|
string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
|
||||||
|
string result;
|
||||||
|
Ref<BitSource> bits(new BitSource(bytes));
|
||||||
|
Mode *mode = &Mode::TERMINATOR;
|
||||||
|
do {
|
||||||
|
// While still another segment to read...
|
||||||
|
if (bits->available() < 4) {
|
||||||
|
// OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
|
||||||
|
mode = &Mode::TERMINATOR;
|
||||||
|
} else {
|
||||||
|
mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
|
||||||
|
}
|
||||||
|
if (mode != &Mode::TERMINATOR) {
|
||||||
|
// How many characters will follow, encoded in this mode?
|
||||||
|
int count = bits->readBits(mode->getCharacterCountBits(version));
|
||||||
|
if (mode == &Mode::NUMERIC) {
|
||||||
|
decodeNumericSegment(bits, result, count);
|
||||||
|
} else if (mode == &Mode::ALPHANUMERIC) {
|
||||||
|
decodeAlphanumericSegment(bits, result, count);
|
||||||
|
} else if (mode == &Mode::BYTE) {
|
||||||
|
decodeByteSegment(bits, result, count);
|
||||||
|
} else if (mode == &Mode::KANJI) {
|
||||||
|
decodeKanjiSegment(bits, result, count);
|
||||||
|
} else {
|
||||||
|
throw ReaderException("Unsupported mode indicator");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (mode != &Mode::TERMINATOR);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue