From beeef242b2b7147299fa9f2b67a4b81af46be979 Mon Sep 17 00:00:00 2001
From: "smparkes@smparkes.net"
 <smparkes@smparkes.net@59b500cc-1b3d-0410-9834-0bbf25fbcc57>
Date: Thu, 13 Oct 2011 15:20:25 +0000
Subject: [PATCH] formatting cleanup before trying to make the last failing C++
 qr blackbox test pass

git-svn-id: https://zxing.googlecode.com/svn/trunk@1964 59b500cc-1b3d-0410-9834-0bbf25fbcc57
---
 cpp/core/src/zxing/common/BitMatrix.cpp       |  69 ++-
 cpp/core/src/zxing/common/BitMatrix.h         |   1 +
 cpp/core/src/zxing/qrcode/QRCodeReader.cpp    |   1 +
 cpp/core/src/zxing/qrcode/QRCodeReader.h      |   1 +
 .../qrcode/decoder/DecodedBitStreamParser.cpp | 578 +++++++++---------
 5 files changed, 329 insertions(+), 321 deletions(-)
diff --git a/cpp/core/src/zxing/common/BitMatrix.cpp b/cpp/core/src/zxing/common/BitMatrix.cpp
index c3e9f1ab9..326c05f67 100644
--- a/cpp/core/src/zxing/common/BitMatrix.cpp
+++ b/cpp/core/src/zxing/common/BitMatrix.cpp
@@ -1,3 +1,4 @@
+// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
 /*
  *  BitMatrix.cpp
  *  zxing
@@ -24,34 +25,41 @@
 #include <sstream>
 #include <string>
 
-namespace zxing {
-using namespace std;
+using std::numeric_limits;
+using std::ostream;
+using std::ostringstream;
 
-unsigned int logDigits(unsigned digits) {
-  unsigned log = 0;
-  unsigned val = 1;
-  while (val < digits) {
-    log++;
-    val <<= 1;
+using zxing::BitMatrix;
+using zxing::BitArray;
+using zxing::Ref;
+
+namespace {
+  unsigned int logDigits(unsigned digits) {
+    unsigned log = 0;
+    unsigned val = 1;
+    while (val < digits) {
+      log++;
+      val <<= 1;
+    }
+    return log;
   }
-  return log;
-}
 
-const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits;
-const unsigned int logBits = logDigits(bitsPerWord);
-const unsigned int bitsMask = (1 << logBits) - 1;
+  const unsigned int bitsPerWord = numeric_limits<unsigned int>::digits;
+  const unsigned int logBits = logDigits(bitsPerWord);
+  const unsigned int bitsMask = (1 << logBits) - 1;
 
-static size_t wordsForSize(size_t width, size_t height) {
-  size_t bits = width * height;
-  int arraySize = bits >> logBits;
-  if (bits - (arraySize << logBits) != 0) {
-    arraySize++;
+  size_t wordsForSize(size_t width, size_t height) {
+    size_t bits = width * height;
+    int arraySize = bits >> logBits;
+    if (bits - (arraySize << logBits) != 0) {
+      arraySize++;
+    }
+    return arraySize;
   }
-  return arraySize;
 }
 
 BitMatrix::BitMatrix(size_t dimension) :
-    width_(dimension), height_(dimension), words_(0), bits_(NULL) {
+  width_(dimension), height_(dimension), words_(0), bits_(NULL) {
 
   words_ = wordsForSize(width_, height_);
   bits_ = new unsigned int[words_];
@@ -59,7 +67,7 @@ BitMatrix::BitMatrix(size_t dimension) :
 }
 
 BitMatrix::BitMatrix(size_t width, size_t height) :
-    width_(width), height_(height), words_(0), bits_(NULL) {
+  width_(width), height_(height), words_(0), bits_(NULL) {
 
   words_ = wordsForSize(width_, height_);
   bits_ = new unsigned int[words_];
@@ -160,19 +168,20 @@ unsigned int* BitMatrix::getBits() const {
   return bits_;
 }
 
-ostream& operator<<(ostream &out, const BitMatrix &bm) {
-  for (size_t y = 0; y < bm.height_; y++) {
-    for (size_t x = 0; x < bm.width_; x++) {
-      out << (bm.get(x, y) ? "X " : "  ");
+namespace zxing {
+  ostream& operator<<(ostream &out, const BitMatrix &bm) {
+    for (size_t y = 0; y < bm.height_; y++) {
+      for (size_t x = 0; x < bm.width_; x++) {
+        out << (bm.get(x, y) ? "X " : "  ");
+      }
+      out << "\n";
     }
-    out << "\n";
+    return out;
   }
-  return out;
 }
-const char *BitMatrix::description() {
+
+const char* BitMatrix::description() {
   ostringstream out;
   out << *this;
   return out.str().c_str();
 }
-
-}
diff --git a/cpp/core/src/zxing/common/BitMatrix.h b/cpp/core/src/zxing/common/BitMatrix.h
index 2b1267b76..f73415274 100644
--- a/cpp/core/src/zxing/common/BitMatrix.h
+++ b/cpp/core/src/zxing/common/BitMatrix.h
@@ -1,3 +1,4 @@
+// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
 #ifndef __BIT_MATRIX_H__
 #define __BIT_MATRIX_H__
 
diff --git a/cpp/core/src/zxing/qrcode/QRCodeReader.cpp b/cpp/core/src/zxing/qrcode/QRCodeReader.cpp
index cc92e93ec..ef59f1ad8 100644
--- a/cpp/core/src/zxing/qrcode/QRCodeReader.cpp
+++ b/cpp/core/src/zxing/qrcode/QRCodeReader.cpp
@@ -1,3 +1,4 @@
+// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
 /*
  *  QRCodeReader.cpp
  *  zxing
diff --git a/cpp/core/src/zxing/qrcode/QRCodeReader.h b/cpp/core/src/zxing/qrcode/QRCodeReader.h
index d146ac529..0e4fa6961 100644
--- a/cpp/core/src/zxing/qrcode/QRCodeReader.h
+++ b/cpp/core/src/zxing/qrcode/QRCodeReader.h
@@ -1,3 +1,4 @@
+// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
 #ifndef __QR_CODE_READER_H__
 #define __QR_CODE_READER_H__
 
diff --git a/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp b/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp
index 35cad2bfc..a79007cce 100644
--- a/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp
+++ b/cpp/core/src/zxing/qrcode/decoder/DecodedBitStreamParser.cpp
@@ -1,5 +1,5 @@
-/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
- *
+// -*- mode:c++; tab-width:2; indent-tabs-mode:nil; c-basic-offset:2 -*-
+/*
  *  DecodedBitStreamParser.cpp
  *  zxing
  *
@@ -35,329 +35,325 @@
 #define ICONV_CONST /**/
 #endif
 
+using namespace std;
 using namespace zxing;
+using namespace zxing::qrcode;
 
-namespace zxing {
-  namespace qrcode {
+const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] =
+{ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
+  'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
+  'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+  'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
+};
 
-    using namespace std;
+const char *DecodedBitStreamParser::ASCII = "ASCII";
+const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
+const char *DecodedBitStreamParser::UTF8 = "UTF-8";
+const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
+const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
 
-    const char DecodedBitStreamParser::ALPHANUMERIC_CHARS[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B',
-                                                                'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
-                                                                'Y', 'Z', ' ', '$', '%', '*', '+', '-', '.', '/', ':'
-    };
-
-    const char *DecodedBitStreamParser::ASCII = "ASCII";
-    const char *DecodedBitStreamParser::ISO88591 = "ISO-8859-1";
-    const char *DecodedBitStreamParser::UTF8 = "UTF-8";
-    const char *DecodedBitStreamParser::SHIFT_JIS = "SHIFT_JIS";
-    const char *DecodedBitStreamParser::EUC_JP = "EUC-JP";
-
-    void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
+void DecodedBitStreamParser::append(std::string &result, const unsigned char *bufIn, size_t nIn, const char *src) {
 #ifndef NO_ICONV
-      if (nIn == 0) {
-        return;
-      }
+  if (nIn == 0) {
+    return;
+  }
 
-      iconv_t cd = iconv_open(UTF8, src);
-      const int maxOut = 4 * nIn + 1;
-      unsigned char* bufOut = new unsigned char[maxOut];
+  iconv_t cd = iconv_open(UTF8, src);
+  const int maxOut = 4 * nIn + 1;
+  unsigned char* bufOut = new unsigned char[maxOut];
 
-      ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
-      size_t nFrom = nIn;
-      char *toPtr = (char *)bufOut;
-      size_t nTo = maxOut;
+  ICONV_CONST char *fromPtr = (ICONV_CONST char *)bufIn;
+  size_t nFrom = nIn;
+  char *toPtr = (char *)bufOut;
+  size_t nTo = maxOut;
 
-      while (nFrom > 0) {
-        size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
-        if (oneway == (size_t)(-1)) {
-          iconv_close(cd);
-          delete[] bufOut;
-          throw ReaderException("error converting characters");
-        }
-      }
+  while (nFrom > 0) {
+    size_t oneway = iconv(cd, &fromPtr, &nFrom, &toPtr, &nTo);
+    if (oneway == (size_t)(-1)) {
       iconv_close(cd);
-
-      int nResult = maxOut - nTo;
-      bufOut[nResult] = '\0';
-      result.append((const char *)bufOut);
       delete[] bufOut;
+      throw ReaderException("error converting characters");
+    }
+  }
+  iconv_close(cd);
+
+  int nResult = maxOut - nTo;
+  bufOut[nResult] = '\0';
+  result.append((const char *)bufOut);
+  delete[] bufOut;
 #else
-      result.append((const char *)bufIn, nIn);
+  result.append((const char *)bufIn, nIn);
 #endif
+}
+
+void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
+  // Each character will require 2 bytes. Read the characters as 2-byte pairs
+  // and decode as Shift_JIS afterwards
+  size_t nBytes = 2 * count;
+  unsigned char* buffer = new unsigned char[nBytes];
+  int offset = 0;
+  while (count > 0) {
+    // Each 13 bits encodes a 2-byte character
+
+    int twoBytes = bits->readBits(13);
+    int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
+    if (assembledTwoBytes < 0x01F00) {
+      // In the 0x8140 to 0x9FFC range
+      assembledTwoBytes += 0x08140;
+    } else {
+      // In the 0xE040 to 0xEBBF range
+      assembledTwoBytes += 0x0C140;
     }
+    buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
+    buffer[offset + 1] = (unsigned char)assembledTwoBytes;
+    offset += 2;
+    count--;
+  }
 
-    void DecodedBitStreamParser::decodeKanjiSegment(Ref<BitSource> bits, std::string &result, int count) {
-      // Each character will require 2 bytes. Read the characters as 2-byte pairs
-      // and decode as Shift_JIS afterwards
-      size_t nBytes = 2 * count;
-      unsigned char* buffer = new unsigned char[nBytes];
-      int offset = 0;
-      while (count > 0) {
-        // Each 13 bits encodes a 2-byte character
+  append(result, buffer, nBytes, SHIFT_JIS);
+  delete[] buffer;
+}
 
-        int twoBytes = bits->readBits(13);
-        int assembledTwoBytes = ((twoBytes / 0x0C0) << 8) | (twoBytes % 0x0C0);
-        if (assembledTwoBytes < 0x01F00) {
-          // In the 0x8140 to 0x9FFC range
-          assembledTwoBytes += 0x08140;
-        } else {
-          // In the 0xE040 to 0xEBBF range
-          assembledTwoBytes += 0x0C140;
-        }
-        buffer[offset] = (unsigned char)(assembledTwoBytes >> 8);
-        buffer[offset + 1] = (unsigned char)assembledTwoBytes;
-        offset += 2;
-        count--;
-      }
+void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
+  int nBytes = count;
+  unsigned char* readBytes = new unsigned char[nBytes];
+  if (count << 3 > bits->available()) {
+    ostringstream s;
+    s << "Count too large: " << count;
+    delete[] readBytes;
+    throw ReaderException(s.str().c_str());
+  }
+  for (int i = 0; i < count; i++) {
+    readBytes[i] = (unsigned char)bits->readBits(8);
+  }
+  // The spec isn't clear on this mode; see
+  // section 6.4.5: t does not say which encoding to assuming
+  // upon decoding. I have seen ISO-8859-1 used as well as
+  // Shift_JIS -- without anything like an ECI designator to
+  // give a hint.
+  const char *encoding = guessEncoding(readBytes, nBytes);
+  append(result, readBytes, nBytes, encoding);
+  delete[] readBytes;
+}
 
-      append(result, buffer, nBytes, SHIFT_JIS);
-      delete[] buffer;
+void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
+  int nBytes = count;
+  unsigned char* bytes = new unsigned char[nBytes];
+  int i = 0;
+  // Read three digits at a time
+  while (count >= 3) {
+    // Each 10 bits encodes three digits
+    if (bits->available() < 10) {
+      throw ReaderException("format exception");
     }
-
-    void DecodedBitStreamParser::decodeByteSegment(Ref<BitSource> bits, std::string &result, int count) {
-      int nBytes = count;
-      unsigned char* readBytes = new unsigned char[nBytes];
-      if (count << 3 > bits->available()) {
-        ostringstream s;
-        s << "Count too large: " << count;
-        delete[] readBytes;
-        throw ReaderException(s.str().c_str());
-      }
-      for (int i = 0; i < count; i++) {
-        readBytes[i] = (unsigned char)bits->readBits(8);
-      }
-      // The spec isn't clear on this mode; see
-      // section 6.4.5: t does not say which encoding to assuming
-      // upon decoding. I have seen ISO-8859-1 used as well as
-      // Shift_JIS -- without anything like an ECI designator to
-      // give a hint.
-      const char *encoding = guessEncoding(readBytes, nBytes);
-      append(result, readBytes, nBytes, encoding);
-      delete[] readBytes;
-    }
-
-    void DecodedBitStreamParser::decodeNumericSegment(Ref<BitSource> bits, std::string &result, int count) {
-      int nBytes = count;
-      unsigned char* bytes = new unsigned char[nBytes];
-      int i = 0;
-      // Read three digits at a time
-      while (count >= 3) {
-        // Each 10 bits encodes three digits
-        if (bits->available() < 10) {
-          throw ReaderException("format exception");
-        }
-        int threeDigitsBits = bits->readBits(10);
-        if (threeDigitsBits >= 1000) {
-          ostringstream s;
-          s << "Illegal value for 3-digit unit: " << threeDigitsBits;
-          delete[] bytes;
-          throw ReaderException(s.str().c_str());
-        }
-        bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
-        bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
-        bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
-        count -= 3;
-      }
-      if (count == 2) {
-        if (bits->available() < 7) {
-          throw ReaderException("format exception");
-        }
-        // Two digits left over to read, encoded in 7 bits
-        int twoDigitsBits = bits->readBits(7);
-        if (twoDigitsBits >= 100) {
-          ostringstream s;
-          s << "Illegal value for 2-digit unit: " << twoDigitsBits;
-          delete[] bytes;
-          throw ReaderException(s.str().c_str());
-        }
-        bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
-        bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
-      } else if (count == 1) {
-        if (bits->available() < 4) {
-          throw ReaderException("format exception");
-        }
-        // One digit left over to read
-        int digitBits = bits->readBits(4);
-        if (digitBits >= 10) {
-          ostringstream s;
-          s << "Illegal value for digit unit: " << digitBits;
-          delete[] bytes;
-          throw ReaderException(s.str().c_str());
-        }
-        bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
-      }
-      append(result, bytes, nBytes, ASCII);
+    int threeDigitsBits = bits->readBits(10);
+    if (threeDigitsBits >= 1000) {
+      ostringstream s;
+      s << "Illegal value for 3-digit unit: " << threeDigitsBits;
       delete[] bytes;
+      throw ReaderException(s.str().c_str());
     }
-
-    void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
-      int nBytes = count;
-      unsigned char* bytes = new unsigned char[nBytes];
-      int i = 0;
-      // Read two characters at a time
-      while (count > 1) {
-        int nextTwoCharsBits = bits->readBits(11);
-        bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
-        bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
-        count -= 2;
-      }
-      if (count == 1) {
-        bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
-      }
-      append(result, bytes, nBytes, ASCII);
+    bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits / 100];
+    bytes[i++] = ALPHANUMERIC_CHARS[(threeDigitsBits / 10) % 10];
+    bytes[i++] = ALPHANUMERIC_CHARS[threeDigitsBits % 10];
+    count -= 3;
+  }
+  if (count == 2) {
+    if (bits->available() < 7) {
+      throw ReaderException("format exception");
+    }
+    // Two digits left over to read, encoded in 7 bits
+    int twoDigitsBits = bits->readBits(7);
+    if (twoDigitsBits >= 100) {
+      ostringstream s;
+      s << "Illegal value for 2-digit unit: " << twoDigitsBits;
       delete[] bytes;
+      throw ReaderException(s.str().c_str());
     }
+    bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits / 10];
+    bytes[i++] = ALPHANUMERIC_CHARS[twoDigitsBits % 10];
+  } else if (count == 1) {
+    if (bits->available() < 4) {
+      throw ReaderException("format exception");
+    }
+    // One digit left over to read
+    int digitBits = bits->readBits(4);
+    if (digitBits >= 10) {
+      ostringstream s;
+      s << "Illegal value for digit unit: " << digitBits;
+      delete[] bytes;
+      throw ReaderException(s.str().c_str());
+    }
+    bytes[i++] = ALPHANUMERIC_CHARS[digitBits];
+  }
+  append(result, bytes, nBytes, ASCII);
+  delete[] bytes;
+}
 
-    const char *
-    DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
-      const bool ASSUME_SHIFT_JIS = false;
-      char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
+void DecodedBitStreamParser::decodeAlphanumericSegment(Ref<BitSource> bits, std::string &result, int count) {
+  int nBytes = count;
+  unsigned char* bytes = new unsigned char[nBytes];
+  int i = 0;
+  // Read two characters at a time
+  while (count > 1) {
+    int nextTwoCharsBits = bits->readBits(11);
+    bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits / 45];
+    bytes[i++] = ALPHANUMERIC_CHARS[nextTwoCharsBits % 45];
+    count -= 2;
+  }
+  if (count == 1) {
+    bytes[i++] = ALPHANUMERIC_CHARS[bits->readBits(6)];
+  }
+  append(result, bytes, nBytes, ASCII);
+  delete[] bytes;
+}
 
-      // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
-      if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
-          == (unsigned char)0xBF) {
-        return UTF8;
-      }
-      // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
-      // which should be by far the most common encodings. ISO-8859-1
-      // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
-      // uses this as a first byte of a two-byte character. If we see this
-      // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
-      // If we see something else in that second byte, we'll make the risky guess
-      // that it's UTF-8.
-      bool canBeISO88591 = true;
-      bool canBeShiftJIS = true;
-      bool canBeUTF8 = true;
-      int utf8BytesLeft = 0;
-      int maybeDoubleByteCount = 0;
-      int maybeSingleByteKatakanaCount = 0;
-      bool sawLatin1Supplement = false;
-      bool sawUTF8Start = false;
-      bool lastWasPossibleDoubleByteStart = false;
-      for (int i = 0;
-           i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
-           i++) {
-        int value = bytes[i] & 0xFF;
-
-        // UTF-8 stuff
-        if (value >= 0x80 && value <= 0xBF) {
-          if (utf8BytesLeft > 0) {
-            utf8BytesLeft--;
-          }
-        } else {
-          if (utf8BytesLeft > 0) {
-            canBeUTF8 = false;
-          }
-          if (value >= 0xC0 && value <= 0xFD) {
-            sawUTF8Start = true;
-            int valueCopy = value;
-            while ((valueCopy & 0x40) != 0) {
-              utf8BytesLeft++;
-              valueCopy <<= 1;
-            }
-          }
-        }
-
-        // Shift_JIS stuff
-
-        if (value >= 0xA1 && value <= 0xDF) {
-          // count the number of characters that might be a Shift_JIS single-byte Katakana character
-          if (!lastWasPossibleDoubleByteStart) {
-            maybeSingleByteKatakanaCount++;
-          }
-        }
-        if (!lastWasPossibleDoubleByteStart &&
-            ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
-          canBeShiftJIS = false;
-        }
-        if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
-          // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
-          // second byte.
-          if (lastWasPossibleDoubleByteStart) {
-            // If we just checked this and the last byte for being a valid double-byte
-            // char, don't check starting on this byte. If this and the last byte
-            // formed a valid pair, then this shouldn't be checked to see if it starts
-            // a double byte pair of course.
-            lastWasPossibleDoubleByteStart = false;
-          } else {
-            // ... otherwise do check to see if this plus the next byte form a valid
-            // double byte pair encoding a character.
-            lastWasPossibleDoubleByteStart = true;
-            if (i >= length - 1) {
-              canBeShiftJIS = false;
-            } else {
-              int nextValue = bytes[i + 1] & 0xFF;
-              if (nextValue < 0x40 || nextValue > 0xFC) {
-                canBeShiftJIS = false;
-              } else {
-                maybeDoubleByteCount++;
-              }
-              // There is some conflicting information out there about which bytes can follow which in
-              // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
-            }
-          }
-        } else {
-          lastWasPossibleDoubleByteStart = false;
-        }
+const char *
+DecodedBitStreamParser::guessEncoding(unsigned char *bytes, int length) {
+  const bool ASSUME_SHIFT_JIS = false;
+  char const* const PLATFORM_DEFAULT_ENCODING="UTF-8";
+
+  // Does it start with the UTF-8 byte order mark? then guess it's UTF-8
+  if (length > 3 && bytes[0] == (unsigned char)0xEF && bytes[1] == (unsigned char)0xBB && bytes[2]
+      == (unsigned char)0xBF) {
+    return UTF8;
+  }
+  // For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
+  // which should be by far the most common encodings. ISO-8859-1
+  // should not have bytes in the 0x80 - 0x9F range, while Shift_JIS
+  // uses this as a first byte of a two-byte character. If we see this
+  // followed by a valid second byte in Shift_JIS, assume it is Shift_JIS.
+  // If we see something else in that second byte, we'll make the risky guess
+  // that it's UTF-8.
+  bool canBeISO88591 = true;
+  bool canBeShiftJIS = true;
+  bool canBeUTF8 = true;
+  int utf8BytesLeft = 0;
+  int maybeDoubleByteCount = 0;
+  int maybeSingleByteKatakanaCount = 0;
+  bool sawLatin1Supplement = false;
+  bool sawUTF8Start = false;
+  bool lastWasPossibleDoubleByteStart = false;
+  for (int i = 0;
+       i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8);
+       i++) {
+    int value = bytes[i] & 0xFF;
+
+    // UTF-8 stuff
+    if (value >= 0x80 && value <= 0xBF) {
+      if (utf8BytesLeft > 0) {
+        utf8BytesLeft--;
       }
+    } else {
       if (utf8BytesLeft > 0) {
         canBeUTF8 = false;
       }
-
-      // Easy -- if assuming Shift_JIS and no evidence it can't be, done
-      if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
-        return SHIFT_JIS;
+      if (value >= 0xC0 && value <= 0xFD) {
+        sawUTF8Start = true;
+        int valueCopy = value;
+        while ((valueCopy & 0x40) != 0) {
+          utf8BytesLeft++;
+          valueCopy <<= 1;
+        }
       }
-      if (canBeUTF8 && sawUTF8Start) {
-        return UTF8;
-      }
-      // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
-      // - If we saw
-      //   - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
-      //   - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
-      // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
-      if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
-        return SHIFT_JIS;
-      }
-      // Otherwise, we default to ISO-8859-1 unless we know it can't be
-      if (!sawLatin1Supplement && canBeISO88591) {
-        return ISO88591;
-      }
-      // Otherwise, we take a wild guess with platform encoding
-      return PLATFORM_DEFAULT_ENCODING;
     }
 
-    string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
-      string result;
-      Ref<BitSource> bits(new BitSource(bytes));
-      Mode *mode = &Mode::TERMINATOR;
-      do {
-        // While still another segment to read...
-        if (bits->available() < 4) {
-          // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
-          mode = &Mode::TERMINATOR;
+    // Shift_JIS stuff
+
+    if (value >= 0xA1 && value <= 0xDF) {
+      // count the number of characters that might be a Shift_JIS single-byte Katakana character
+      if (!lastWasPossibleDoubleByteStart) {
+        maybeSingleByteKatakanaCount++;
+      }
+    }
+    if (!lastWasPossibleDoubleByteStart &&
+        ((value >= 0xF0 && value <= 0xFF) || value == 0x80 || value == 0xA0)) {
+      canBeShiftJIS = false;
+    }
+    if (((value >= 0x81 && value <= 0x9F) || (value >= 0xE0 && value <= 0xEF))) {
+      // These start double-byte characters in Shift_JIS. Let's see if it's followed by a valid
+      // second byte.
+      if (lastWasPossibleDoubleByteStart) {
+        // If we just checked this and the last byte for being a valid double-byte
+        // char, don't check starting on this byte. If this and the last byte
+        // formed a valid pair, then this shouldn't be checked to see if it starts
+        // a double byte pair of course.
+        lastWasPossibleDoubleByteStart = false;
+      } else {
+        // ... otherwise do check to see if this plus the next byte form a valid
+        // double byte pair encoding a character.
+        lastWasPossibleDoubleByteStart = true;
+        if (i >= length - 1) {
+          canBeShiftJIS = false;
         } else {
-          mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
-        }
-        if (mode != &Mode::TERMINATOR) {
-          // How many characters will follow, encoded in this mode?
-          int count = bits->readBits(mode->getCharacterCountBits(version));
-          if (mode == &Mode::NUMERIC) {
-            decodeNumericSegment(bits, result, count);
-          } else if (mode == &Mode::ALPHANUMERIC) {
-            decodeAlphanumericSegment(bits, result, count);
-          } else if (mode == &Mode::BYTE) {
-            decodeByteSegment(bits, result, count);
-          } else if (mode == &Mode::KANJI) {
-            decodeKanjiSegment(bits, result, count);
+          int nextValue = bytes[i + 1] & 0xFF;
+          if (nextValue < 0x40 || nextValue > 0xFC) {
+            canBeShiftJIS = false;
           } else {
-            throw ReaderException("Unsupported mode indicator");
+            maybeDoubleByteCount++;
           }
+          // There is some conflicting information out there about which bytes can follow which in
+          // double-byte Shift_JIS characters. The rule above seems to be the one that matches practice.
         }
-      } while (mode != &Mode::TERMINATOR);
-      return result;
+      }
+    } else {
+      lastWasPossibleDoubleByteStart = false;
     }
-
   }
+  if (utf8BytesLeft > 0) {
+    canBeUTF8 = false;
+  }
+
+  // Easy -- if assuming Shift_JIS and no evidence it can't be, done
+  if (canBeShiftJIS && ASSUME_SHIFT_JIS) {
+    return SHIFT_JIS;
+  }
+  if (canBeUTF8 && sawUTF8Start) {
+    return UTF8;
+  }
+  // Distinguishing Shift_JIS and ISO-8859-1 can be a little tough. The crude heuristic is:
+  // - If we saw
+  //   - at least 3 bytes that starts a double-byte value (bytes that are rare in ISO-8859-1), or
+  //   - over 5% of bytes could be single-byte Katakana (also rare in ISO-8859-1),
+  // - and, saw no sequences that are invalid in Shift_JIS, then we conclude Shift_JIS
+  if (canBeShiftJIS && (maybeDoubleByteCount >= 3 || 20 * maybeSingleByteKatakanaCount > length)) {
+    return SHIFT_JIS;
+  }
+  // Otherwise, we default to ISO-8859-1 unless we know it can't be
+  if (!sawLatin1Supplement && canBeISO88591) {
+    return ISO88591;
+  }
+  // Otherwise, we take a wild guess with platform encoding
+  return PLATFORM_DEFAULT_ENCODING;
+}
+
+string DecodedBitStreamParser::decode(ArrayRef<unsigned char> bytes, Version *version) {
+  string result;
+  Ref<BitSource> bits(new BitSource(bytes));
+  Mode *mode = &Mode::TERMINATOR;
+  do {
+    // While still another segment to read...
+    if (bits->available() < 4) {
+      // OK, assume we're done. Really, a TERMINATOR mode should have been recorded here
+      mode = &Mode::TERMINATOR;
+    } else {
+      mode = &Mode::forBits(bits->readBits(4)); // mode is encoded by 4 bits
+    }
+    if (mode != &Mode::TERMINATOR) {
+      // How many characters will follow, encoded in this mode?
+      int count = bits->readBits(mode->getCharacterCountBits(version));
+      if (mode == &Mode::NUMERIC) {
+        decodeNumericSegment(bits, result, count);
+      } else if (mode == &Mode::ALPHANUMERIC) {
+        decodeAlphanumericSegment(bits, result, count);
+      } else if (mode == &Mode::BYTE) {
+        decodeByteSegment(bits, result, count);
+      } else if (mode == &Mode::KANJI) {
+        decodeKanjiSegment(bits, result, count);
+      } else {
+        throw ReaderException("Unsupported mode indicator");
+      }
+    }
+  } while (mode != &Mode::TERMINATOR);
+  return result;
 }