Compact QR-Code: Support more character sets (in particular cp1256 (Arabic) and Shift-JIS) (#1453)

* - added code to suppress the terminator if the capacity of the version is less than 4 bits
- added test case

* - Removed code in MinimalEncoder that added Mode.TERMINATOR (is taken care of in Encoder.terminateBits)
- Removed the corresponding test case

* Updated test cases

* Improved documentation

* Changed documentation to not use an example with an unsupported character encoding

* Improved wording of comment

* - Simplified code
- Added space after comma in several places

* Added support for more character sets supported by CharacterSetECI

* Syntactic enhancements

* Changed instantiation of generic types to diamond style

* Updated documentation of the QR_COMPACT hint to explain the impact of setting the CHARACTER_SET hint.

* Changed whitespace

* Removed comment

* Fixed typos in comments

* Added test cases for KANJI and Shift_JIS encoding

* Improved comments on Japanese language test cases
This commit is contained in:
AlexGeller1 2021-10-21 17:10:28 +02:00 committed by GitHub
parent c729abe393
commit 2e22d09479
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 62 deletions

View file

@ -113,8 +113,10 @@ public enum EncodeHintType {
/**
* Specifies whether to use compact mode for QR code (type {@link Boolean}, or "true" or "false"
* When compaction is performed the value for {@link #CHARACTER_SET} is ignored.
* {@link String} value).
* Please note that when compaction is performed, the most compact character encoding is chosen
* for characters in the input that are not in the ISO-8859-1 character set. Based on experience,
* some scanners do not support encodings like cp-1256 (Arabic). In such cases the encoding can
* be forced to UTF-8 by means of the {@link #CHARACTER_SET} encoding hint.
*/
QR_COMPACT,

View file

@ -107,7 +107,7 @@ public final class Encoder {
} else {
// Pick an encoding mode appropriate for the content. Note that this will not attempt to use
// multiple modes / segments even if that were more efficient. Twould be nice.
// multiple modes / segments even if that were more efficient.
mode = chooseMode(content, encoding);
// This will store the header information, like mode and

View file

@ -27,6 +27,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.nio.charset.UnsupportedCharsetException;
@ -52,7 +53,7 @@ import java.nio.charset.UnsupportedCharsetException;
* In multi language content the algorithm selects the most compact representation using ECI modes.
* For example the most compact representation of the string "\u0150\u015C" (O-double-acute, S-circumflex) is
* ECI(UTF-8), BYTE(\u0150\u015C) while prepending one or more times the same leading character as in
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
* ECI(ISO-8859-2), BYTE(\u0150\u0150), ECI(ISO-8859-3), BYTE(\u015C).
*
* @author Alex Geller
@ -75,6 +76,45 @@ final class MinimalEncoder {
}
}
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
// ISO-8859-1 itself is handled separately (see the constructor), so it is not listed here.
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
static {
// Candidate single-byte-capable charsets, in preference order. Note there is no ISO-8859-12.
// Shift_JIS is included because it encodes halfwidth Katakana as single bytes.
final String[] names = { "ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-11",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"Shift_JIS" };
for (String name : names) {
// Only charsets that have a CharacterSetECI designator can be signaled in a QR code.
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
try {
ENCODERS.add(Charset.forName(name).newEncoder());
} catch (UnsupportedCharsetException e) {
// continue: the JVM may not support every candidate charset; skip silently.
}
}
}
}
private final String stringToEncode;
private final boolean isGS1;
private final CharsetEncoder[] encoders;
@ -100,72 +140,42 @@ final class MinimalEncoder {
this.isGS1 = isGS1;
this.ecLevel = ecLevel;
CharsetEncoder[] isoEncoders = new CharsetEncoder[15]; // room for the 15 ISO-8859 charsets 1 through 16.
isoEncoders[0] = StandardCharsets.ISO_8859_1.newEncoder();
List<CharsetEncoder> neededEncoders = new ArrayList<>();
neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
for (int i = 0; i < stringToEncode.length(); i++) {
int cnt = 0;
int j;
for (j = 0; j < 15; j++) {
if (isoEncoders[j] != null) {
cnt++;
if (isoEncoders[j].canEncode(stringToEncode.charAt(i))) {
boolean canEncode = false;
for (CharsetEncoder encoder : neededEncoders) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
canEncode = true;
break;
}
}
if (!canEncode) {
for (CharsetEncoder encoder : ENCODERS) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
neededEncoders.add(encoder);
canEncode = true;
break;
}
}
}
if (cnt == 14) { // we need all. Can stop looking further.
break;
}
if (j >= 15) { // no encoder found
for (j = 0; j < 15; j++) {
if (j != 11 && isoEncoders[j] == null) { // ISO-8859-12 doesn't exist
try {
CharsetEncoder ce = Charset.forName("ISO-8859-" + (j + 1)).newEncoder();
if (ce.canEncode(stringToEncode.charAt(i))) {
isoEncoders[j] = ce;
break;
}
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
if (j >= 15) {
if (!StandardCharsets.UTF_16BE.newEncoder().canEncode(stringToEncode.charAt(i))) {
throw new WriterException("Can not encode character \\u" +
String.format("%04X", (int) stringToEncode.charAt(i)) + " at position " + i +
" in input \"" + stringToEncode + "\"");
}
needUnicodeEncoder = true;
}
if (!canEncode) {
needUnicodeEncoder = true;
}
}
int numberOfEncoders = 0;
for (int j = 0; j < 15; j++) {
if (isoEncoders[j] != null) {
if (CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
numberOfEncoders++;
} else {
needUnicodeEncoder = true;
}
}
}
if (numberOfEncoders == 1 && !needUnicodeEncoder) {
if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
encoders = new CharsetEncoder[1];
encoders[0] = isoEncoders[0];
encoders[0] = neededEncoders.get(0);
} else {
encoders = new CharsetEncoder[numberOfEncoders + 2];
encoders = new CharsetEncoder[neededEncoders.size() + 2];
int index = 0;
for (int j = 0; j < 15; j++) {
if (isoEncoders[j] != null && CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
encoders[index++] = isoEncoders[j];
}
for (CharsetEncoder encoder : neededEncoders) {
encoders[index++] = encoder;
}
encoders[index] = StandardCharsets.UTF_8.newEncoder();
@ -304,7 +314,7 @@ final class MinimalEncoder {
}
}
void addEdge(ArrayList<Edge>[][][] edges, int position, Edge edge) {
void addEdge(List<Edge>[][][] edges, int position, Edge edge) {
int vertexIndex = position + edge.characterLength;
if (edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] == null) {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] = new ArrayList<>();
@ -312,7 +322,7 @@ final class MinimalEncoder {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)].add(edge);
}
void addEdges(Version version, ArrayList<Edge>[][][] edges, int from, Edge previous) {
void addEdges(Version version, List<Edge>[][][] edges, int from, Edge previous) {
int start = 0;
int end = encoders.length;
if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) {
@ -345,7 +355,7 @@ final class MinimalEncoder {
ResultList encodeSpecificVersion(Version version) throws WriterException {
@SuppressWarnings("checkstyle:lineLength")
/* A vertex represents a tuple of a position in the input, a mode and an a character encoding where position 0
/* A vertex represents a tuple of a position in the input, a mode and a character encoding where position 0
* denotes the position left of the first character, 1 the position left of the second character and so on.
* Likewise the end vertices are located after the last character at position stringToEncode.length().
*
@ -463,7 +473,7 @@ final class MinimalEncoder {
// The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the
// function getCompactedOrdinal(Mode)
@SuppressWarnings("unchecked")
ArrayList<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
List<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
addEdges(version, edges, 0, null);
for (int i = 1; i <= inputLength; i++) {
@ -471,7 +481,7 @@ final class MinimalEncoder {
for (int k = 0; k < 4; k++) {
Edge minimalEdge;
if (edges[i][j][k] != null) {
ArrayList<Edge> localEdges = edges[i][j][k];
List<Edge> localEdges = edges[i][j][k];
int minimalIndex = -1;
int minimalSize = Integer.MAX_VALUE;
for (int l = 0; l < localEdges.size(); l++) {
@ -499,7 +509,7 @@ final class MinimalEncoder {
for (int j = 0; j < encoders.length; j++) {
for (int k = 0; k < 4; k++) {
if (edges[inputLength][j][k] != null) {
ArrayList<Edge> localEdges = edges[inputLength][j][k];
List<Edge> localEdges = edges[inputLength][j][k];
assert localEdges.size() == 1;
Edge edge = localEdges.get(0);
if (edge.cachedTotalSize < minimalSize) {

View file

@ -882,6 +882,29 @@ public final class EncoderTestCase extends Assert {
true);
}
@Test
public void testMinimalEncoder42() throws Exception {
// Halfwidth Katakana characters (here U+FF66) are single-byte encoded in Shift_JIS, so the
// minimal encoding is expected to choose an ECI(Shift_JIS) BYTE segment over KANJI mode.
verifyMinimalEncoding("Katakana:\uFF66\uFF66\uFF66\uFF66\uFF66\uFF66", "ECI(Shift_JIS),BYTE(Katakana:......)", null
, false);
}
@Test
public void testMinimalEncoder43() throws Exception {
// The character \u30A2 (fullwidth Katakana A) encodes as a double byte in Shift_JIS,
// so KANJI mode is more compact in this case and is expected to be chosen.
verifyMinimalEncoding("Katakana:\u30A2\u30A2\u30A2\u30A2\u30A2\u30A2", "BYTE(Katakana:),KANJI(......)", null,
false);
}
@Test
public void testMinimalEncoder44() throws Exception {
// The character \u30A2 encodes as a double byte in Shift_JIS but KANJI is not more compact here
// because KANJI only pays off when it encodes runs of double-byte characters. With mixed
// text (alternating \u30A2 and ASCII 'a'), a single ECI(Shift_JIS) BYTE segment is expected
// to be more compact than switching modes, as this example demonstrates.
verifyMinimalEncoding("Katakana:\u30A2a\u30A2a\u30A2a\u30A2a\u30A2a\u30A2", "ECI(Shift_JIS),BYTE(Katakana:.a.a.a" +
".a.a.)", null, false);
}
static void verifyMinimalEncoding(String input, String expectedResult, Charset priorityCharset, boolean isGS1)
throws Exception {
MinimalEncoder.ResultList result = MinimalEncoder.encode(input, null, priorityCharset, isGS1,