Minimal encoding for Data Matrix with multi ECI and GS1-FNC1 support (2nd) (#1479)

* Added Data Matrix minimal encoding with multi-ECI and GS1-FNC1 support.

* Added support for MACRO-5 and MACRO-6

* Improved comment
AlexGeller1 2022-01-12 18:58:24 +01:00 committed by GitHub
parent 491006ac3b
commit ab23778d4b
7 changed files with 1683 additions and 121 deletions


@@ -44,6 +44,20 @@ public enum EncodeHintType {
*/
DATA_MATRIX_SHAPE,
/**
* Specifies whether to use compact mode for Data Matrix (type {@link Boolean}, or "true" or "false"
* {@link String} value).
* The compact encoding mode also supports the encoding of characters that are not in the ISO-8859-1
* character set via ECIs.
* Please note that in that case, the most compact character encoding is chosen for characters in
* the input that are not in the ISO-8859-1 character set. Based on experience, some scanners do not
* support encodings like cp-1256 (Arabic). In such cases the encoding can be forced to UTF-8 by
* means of the {@link #CHARACTER_SET} encoding hint.
* Compact encoding also provides GS1-FNC1 support when {@link #GS1_FORMAT} is selected. In this case,
* the group-separator character (ASCII 29 decimal) can be used to encode the positions of FNC1 codewords
* for the purpose of delimiting AIs.
*/
DATA_MATRIX_COMPACT,
/**
* Specifies a minimum barcode size (type {@link Dimension}). Only applicable to Data Matrix now.
*

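A minimal usage sketch of the new hint (a sketch, not part of the commit): it relies only on the public DataMatrixWriter and EncodeHintType API shown in this diff; the class name and the GS1 element string (AIs 10 and 21, separated by the GS character) are illustrative assumptions.

import com.google.zxing.BarcodeFormat;
import com.google.zxing.EncodeHintType;
import com.google.zxing.common.BitMatrix;
import com.google.zxing.datamatrix.DataMatrixWriter;
import java.util.EnumMap;
import java.util.Map;

public final class CompactDataMatrixExample {
  public static void main(String[] args) throws Exception {
    Map<EncodeHintType, Object> hints = new EnumMap<>(EncodeHintType.class);
    hints.put(EncodeHintType.DATA_MATRIX_COMPACT, true); // enable the minimal (compact) encoder
    hints.put(EncodeHintType.GS1_FORMAT, true);          // GS (ASCII 29) in the input marks FNC1 positions

    // Illustrative GS1 element string: AI 10 (batch) terminated by GS, then AI 21 (serial).
    String contents = "10" + "ABC123" + "\u001D" + "21" + "12345";

    BitMatrix matrix = new DataMatrixWriter().encode(contents, BarcodeFormat.DATA_MATRIX, 0, 0, hints);
    System.out.println(matrix.getWidth() + "x" + matrix.getHeight());
  }
}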

@@ -0,0 +1,200 @@
/*
* Copyright 2021 ZXing authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.zxing.common;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
/**
* Set of CharsetEncoders for a given input string
*
* Invariants:
* - The list contains only encoders from CharacterSetECI (this list is shorter than the list of encoders available on
* the platform for which ECI values are defined).
* - The list contains at least one encoder for every character in the input.
* - The first encoder in the list is always the ISO-8859-1 encoder, even if no character in the input can be encoded
* by it.
* - If the input contains a character that is not in ISO-8859-1 then the last two entries in the list will be the
* UTF-8 encoder and the UTF-16BE encoder.
*
* @author Alex Geller
*/
public final class ECIEncoderSet {
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
static {
final String[] names = { "IBM437",
"ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-11",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1256",
"Shift_JIS" };
for (String name : names) {
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
try {
ENCODERS.add(Charset.forName(name).newEncoder());
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
}
private final CharsetEncoder[] encoders;
private final int priorityEncoderIndex;
/**
* Constructs an encoder set
*
* @param stringToEncode the string that needs to be encoded
* @param priorityCharset The preferred {@link Charset} or null.
* @param fnc1 denotes the character in the input that represents the FNC1 character, or -1 for a non-GS1
* barcode. When specified, it is considered an error to pass it as an argument to the methods canEncode() or encode().
*/
public ECIEncoderSet(String stringToEncode, Charset priorityCharset, int fnc1) {
List<CharsetEncoder> neededEncoders = new ArrayList<>();
//we always need the ISO-8859-1 encoder. It is the default encoding
neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
//Walk over the input string and see if all characters can be encoded with the list of encoders
for (int i = 0; i < stringToEncode.length(); i++) {
boolean canEncode = false;
for (CharsetEncoder encoder : neededEncoders) {
char c = stringToEncode.charAt(i);
if (c == fnc1 || encoder.canEncode(c)) {
canEncode = true;
break;
}
}
if (!canEncode) {
//for the character at position i we don't yet have an encoder in the list
for (CharsetEncoder encoder : ENCODERS) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
//Good, we found an encoder that can encode the character. We add it to the list and continue scanning
//the input
neededEncoders.add(encoder);
canEncode = true;
break;
}
}
}
if (!canEncode) {
//The character is not encodable by any of the single-byte encoders, so we remember that we will need a
//Unicode encoder.
needUnicodeEncoder = true;
}
}
if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
//the entire input can be encoded by the ISO-8859-1 encoder
encoders = new CharsetEncoder[] { neededEncoders.get(0) };
} else {
// we need more than one single byte encoder or we need a Unicode encoder.
// In this case we append a UTF-8 and UTF-16 encoder to the list
encoders = new CharsetEncoder[neededEncoders.size() + 2];
int index = 0;
for (CharsetEncoder encoder : neededEncoders) {
encoders[index++] = encoder;
}
encoders[index] = StandardCharsets.UTF_8.newEncoder();
encoders[index + 1] = StandardCharsets.UTF_16BE.newEncoder();
}
//Compute priorityEncoderIndex by looking up priorityCharset in encoders
int priorityEncoderIndexValue = -1;
if (priorityCharset != null) {
for (int i = 0; i < encoders.length; i++) {
if (encoders[i] != null && priorityCharset.name().equals(encoders[i].charset().name())) {
priorityEncoderIndexValue = i;
break;
}
}
}
priorityEncoderIndex = priorityEncoderIndexValue;
//invariants
assert encoders.length > 0;
assert encoders[0].charset().equals(StandardCharsets.ISO_8859_1);
}
public int length() {
return encoders.length;
}
public String getCharsetName(int index) {
assert index < length();
return encoders[index].charset().name();
}
public Charset getCharset(int index) {
assert index < length();
return encoders[index].charset();
}
public int getECIValue(int encoderIndex) {
return CharacterSetECI.getCharacterSetECI(encoders[encoderIndex].charset()).getValue();
}
/*
* returns -1 if no priority charset was defined
*/
public int getPriorityEncoderIndex() {
return priorityEncoderIndex;
}
public boolean canEncode(char c, int encoderIndex) {
assert encoderIndex < length();
CharsetEncoder encoder = encoders[encoderIndex];
return encoder.canEncode("" + c);
}
public byte[] encode(char c, int encoderIndex) {
assert encoderIndex < length();
CharsetEncoder encoder = encoders[encoderIndex];
assert encoder.canEncode("" + c);
return ("" + c).getBytes(encoder.charset());
}
public byte[] encode(String s, int encoderIndex) {
assert encoderIndex < length();
CharsetEncoder encoder = encoders[encoderIndex];
return s.getBytes(encoder.charset());
}
}
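
To make the API of the new helper concrete, here is a small sketch (not part of the commit) that builds an ECIEncoderSet for mixed Latin/Polish input and inspects the resulting encoders. It uses only the constructor and accessors defined above; the expected charset choice assumes a standard JRE.

import com.google.zxing.common.ECIEncoderSet;

public final class ECIEncoderSetDemo {
  public static void main(String[] args) {
    // No priority charset, non-GS1 input (fnc1 = -1).
    ECIEncoderSet encoders = new ECIEncoderSet("abc\u0105", null, -1);

    // Per the invariants: index 0 is always ISO-8859-1; the 'a with ogonek' forces an additional
    // single-byte encoder (ISO-8859-2 on a standard JRE) plus the trailing UTF-8/UTF-16BE pair.
    for (int i = 0; i < encoders.length(); i++) {
      System.out.println(encoders.getCharsetName(i)
          + " canEncode(\u0105): " + encoders.canEncode('\u0105', i));
    }
  }
}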


@@ -24,11 +24,13 @@ import com.google.zxing.datamatrix.encoder.DefaultPlacement;
import com.google.zxing.Dimension;
import com.google.zxing.datamatrix.encoder.ErrorCorrection;
import com.google.zxing.datamatrix.encoder.HighLevelEncoder;
import com.google.zxing.datamatrix.encoder.MinimalEncoder;
import com.google.zxing.datamatrix.encoder.SymbolInfo;
import com.google.zxing.datamatrix.encoder.SymbolShapeHint;
import com.google.zxing.qrcode.encoder.ByteMatrix;
import java.util.Map;
import java.nio.charset.Charset;
/**
* This object renders a Data Matrix code as a BitMatrix 2D array of greyscale values.
@@ -81,7 +83,24 @@ public final class DataMatrixWriter implements Writer {
//1. step: Data encodation
String encoded = HighLevelEncoder.encodeHighLevel(contents, shape, minSize, maxSize);
String encoded;
boolean hasCompactionHint = hints != null && hints.containsKey(EncodeHintType.DATA_MATRIX_COMPACT) &&
Boolean.parseBoolean(hints.get(EncodeHintType.DATA_MATRIX_COMPACT).toString());
if (hasCompactionHint) {
boolean hasGS1FormatHint = hints.containsKey(EncodeHintType.GS1_FORMAT) &&
Boolean.parseBoolean(hints.get(EncodeHintType.GS1_FORMAT).toString());
Charset charset = null;
boolean hasEncodingHint = hints.containsKey(EncodeHintType.CHARACTER_SET);
if (hasEncodingHint) {
charset = Charset.forName(hints.get(EncodeHintType.CHARACTER_SET).toString());
}
encoded = MinimalEncoder.encodeHighLevel(contents, charset, hasGS1FormatHint ? 0x1D : -1, shape);
} else {
encoded = HighLevelEncoder.encodeHighLevel(contents, shape, minSize, maxSize);
}
SymbolInfo symbolInfo = SymbolInfo.lookup(encoded.length(), shape, minSize, maxSize, true);
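
Related to the charset handling above: as the DATA_MATRIX_COMPACT documentation suggests, the automatically chosen single-byte ECI can be overridden by forcing UTF-8 through the CHARACTER_SET hint. A hedged sketch under that assumption (class name and sample text are illustrative):

import com.google.zxing.BarcodeFormat;
import com.google.zxing.EncodeHintType;
import com.google.zxing.common.BitMatrix;
import com.google.zxing.datamatrix.DataMatrixWriter;
import java.util.EnumMap;
import java.util.Map;

public final class ForcedUtf8DataMatrixExample {
  public static void main(String[] args) throws Exception {
    Map<EncodeHintType, Object> hints = new EnumMap<>(EncodeHintType.class);
    hints.put(EncodeHintType.DATA_MATRIX_COMPACT, true);
    // Without this hint the minimal encoder may choose a compact single-byte ECI such as
    // windows-1256 for Arabic input; forcing UTF-8 trades a few codewords for scanner compatibility.
    hints.put(EncodeHintType.CHARACTER_SET, "UTF-8");

    BitMatrix matrix = new DataMatrixWriter()
        .encode("\u0645\u0631\u062D\u0628\u0627", BarcodeFormat.DATA_MATRIX, 0, 0, hints);
    System.out.println(matrix.getWidth() + "x" + matrix.getHeight());
  }
}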


@@ -91,15 +91,15 @@ public final class HighLevelEncoder {
/**
* 05 Macro header
*/
private static final String MACRO_05_HEADER = "[)>\u001E05\u001D";
static final String MACRO_05_HEADER = "[)>\u001E05\u001D";
/**
* 06 Macro header
*/
private static final String MACRO_06_HEADER = "[)>\u001E06\u001D";
static final String MACRO_06_HEADER = "[)>\u001E06\u001D";
/**
* Macro trailer
*/
private static final String MACRO_TRAILER = "\u001E\u0004";
static final String MACRO_TRAILER = "\u001E\u0004";
static final int ASCII_ENCODATION = 0;
static final int C40_ENCODATION = 1;
@@ -406,15 +406,15 @@ public final class HighLevelEncoder {
return ch >= 128 && ch <= 255;
}
private static boolean isNativeC40(char ch) {
static boolean isNativeC40(char ch) {
return (ch == ' ') || (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z');
}
private static boolean isNativeText(char ch) {
static boolean isNativeText(char ch) {
return (ch == ' ') || (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'z');
}
private static boolean isNativeX12(char ch) {
static boolean isNativeX12(char ch) {
return isX12TermSep(ch) || (ch == ' ') || (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z');
}
@@ -424,7 +424,7 @@ public final class HighLevelEncoder {
|| (ch == '>');
}
private static boolean isNativeEDIFACT(char ch) {
static boolean isNativeEDIFACT(char ch) {
return ch >= ' ' && ch <= '^';
}
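
The macro header and trailer constants above were made package-visible, presumably so the new MinimalEncoder can reuse them. A hedged sketch of what the added MACRO-05 support enables (the literals mirror MACRO_05_HEADER and MACRO_TRAILER; class name and payload are illustrative):

import com.google.zxing.BarcodeFormat;
import com.google.zxing.EncodeHintType;
import com.google.zxing.common.BitMatrix;
import com.google.zxing.datamatrix.DataMatrixWriter;
import java.util.EnumMap;
import java.util.Map;

public final class Macro05Example {
  public static void main(String[] args) throws Exception {
    // "[)>" RS "05" GS ... RS EOT, i.e. MACRO_05_HEADER + payload + MACRO_TRAILER.
    String contents = "[)>\u001E05\u001D" + "ABCDEFG" + "\u001E\u0004";

    Map<EncodeHintType, Object> hints = new EnumMap<>(EncodeHintType.class);
    hints.put(EncodeHintType.DATA_MATRIX_COMPACT, true);

    // With compaction enabled, the header/trailer pair should be collapsed into the dedicated
    // 05 Macro codeword rather than being encoded character by character.
    BitMatrix matrix = new DataMatrixWriter().encode(contents, BarcodeFormat.DATA_MATRIX, 0, 0, hints);
    System.out.println(matrix.getWidth() + "x" + matrix.getHeight());
  }
}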

File diff suppressed because it is too large


@@ -19,17 +19,14 @@ package com.google.zxing.qrcode.encoder;
import com.google.zxing.qrcode.decoder.Mode;
import com.google.zxing.qrcode.decoder.Version;
import com.google.zxing.common.BitArray;
import com.google.zxing.common.CharacterSetECI;
import com.google.zxing.common.ECIEncoderSet;
import com.google.zxing.WriterException;
import com.google.zxing.qrcode.decoder.ErrorCorrectionLevel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.nio.charset.UnsupportedCharsetException;
/**
* Encoder that encodes minimally
@@ -76,49 +73,9 @@ final class MinimalEncoder {
}
}
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
static {
final String[] names = { "ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-11",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"Shift_JIS" };
for (String name : names) {
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
try {
ENCODERS.add(Charset.forName(name).newEncoder());
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
}
private final String stringToEncode;
private final boolean isGS1;
private final CharsetEncoder[] encoders;
private final int priorityEncoderIndex;
private final ECIEncoderSet encoders;
private final ErrorCorrectionLevel ecLevel;
/**
@@ -138,59 +95,8 @@ final class MinimalEncoder {
this.stringToEncode = stringToEncode;
this.isGS1 = isGS1;
this.encoders = new ECIEncoderSet(stringToEncode, priorityCharset, -1);
this.ecLevel = ecLevel;
List<CharsetEncoder> neededEncoders = new ArrayList<>();
neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
for (int i = 0; i < stringToEncode.length(); i++) {
boolean canEncode = false;
for (CharsetEncoder encoder : neededEncoders) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
canEncode = true;
break;
}
}
if (!canEncode) {
for (CharsetEncoder encoder : ENCODERS) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
neededEncoders.add(encoder);
canEncode = true;
break;
}
}
}
if (!canEncode) {
needUnicodeEncoder = true;
}
}
if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
encoders = new CharsetEncoder[] { neededEncoders.get(0) };
} else {
encoders = new CharsetEncoder[neededEncoders.size() + 2];
int index = 0;
for (CharsetEncoder encoder : neededEncoders) {
encoders[index++] = encoder;
}
encoders[index] = StandardCharsets.UTF_8.newEncoder();
encoders[index + 1] = StandardCharsets.UTF_16BE.newEncoder();
}
int priorityEncoderIndexValue = -1;
if (priorityCharset != null) {
for (int i = 0; i < encoders.length; i++) {
if (encoders[i] != null && priorityCharset.name().equals(encoders[i].charset().name())) {
priorityEncoderIndexValue = i;
break;
}
}
}
priorityEncoderIndex = priorityEncoderIndexValue;
}
/**
@@ -315,14 +221,15 @@ final class MinimalEncoder {
void addEdges(Version version, Edge[][][] edges, int from, Edge previous) {
int start = 0;
int end = encoders.length;
if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) {
int end = encoders.length();
int priorityEncoderIndex = encoders.getPriorityEncoderIndex();
if (priorityEncoderIndex >= 0 && encoders.canEncode(stringToEncode.charAt(from), priorityEncoderIndex)) {
start = priorityEncoderIndex;
end = priorityEncoderIndex + 1;
}
for (int i = start; i < end; i++) {
if (encoders[i].canEncode(stringToEncode.charAt(from))) {
if (encoders.canEncode(stringToEncode.charAt(from), i)) {
addEdge(edges, from, new Edge(Mode.BYTE, from, i, 1, previous, version));
}
}
@@ -464,11 +371,11 @@ final class MinimalEncoder {
// The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the
// function getCompactedOrdinal(Mode)
@SuppressWarnings("unchecked")
Edge[][][] edges = new Edge[inputLength + 1][encoders.length][4];
Edge[][][] edges = new Edge[inputLength + 1][encoders.length()][4];
addEdges(version, edges, 0, null);
for (int i = 1; i <= inputLength; i++) {
for (int j = 0; j < encoders.length; j++) {
for (int j = 0; j < encoders.length(); j++) {
for (int k = 0; k < 4; k++) {
if (edges[i][j][k] != null && i < inputLength) {
addEdges(version, edges, i, edges[i][j][k]);
@@ -480,7 +387,7 @@ final class MinimalEncoder {
int minimalJ = -1;
int minimalK = -1;
int minimalSize = Integer.MAX_VALUE;
for (int j = 0; j < encoders.length; j++) {
for (int j = 0; j < encoders.length(); j++) {
for (int k = 0; k < 4; k++) {
if (edges[inputLength][j][k] != null) {
Edge edge = edges[inputLength][j][k];
@@ -535,8 +442,8 @@ final class MinimalEncoder {
size += characterLength == 1 ? 4 : characterLength == 2 ? 7 : 10;
break;
case BYTE:
size += 8 * stringToEncode.substring(fromPosition, fromPosition + characterLength).getBytes(
encoders[charsetEncoderIndex].charset()).length;
size += 8 * encoders.encode(stringToEncode.substring(fromPosition, fromPosition + characterLength),
charsetEncoderIndex).length;
if (needECI) {
size += 4 + 8; // the ECI assignment numbers for ISO-8859-x, UTF-8 and UTF-16 are all 8 bit long
}
@@ -712,8 +619,9 @@ final class MinimalEncoder {
* for multi byte encoded characters)
*/
private int getCharacterCountIndicator() {
return mode == Mode.BYTE ? stringToEncode.substring(fromPosition, fromPosition + characterLength).getBytes(
encoders[charsetEncoderIndex].charset()).length : characterLength;
return mode == Mode.BYTE ?
encoders.encode(stringToEncode.substring(fromPosition, fromPosition + characterLength),
charsetEncoderIndex).length : characterLength;
}
/**
@@ -726,11 +634,11 @@ final class MinimalEncoder {
bits.appendBits(length, mode.getCharacterCountBits(version));
}
if (mode == Mode.ECI) {
bits.appendBits(CharacterSetECI.getCharacterSetECI(encoders[charsetEncoderIndex].charset()).getValue(), 8);
bits.appendBits(encoders.getECIValue(charsetEncoderIndex), 8);
} else if (characterLength > 0) {
// append data
Encoder.appendBytes(stringToEncode.substring(fromPosition, fromPosition + characterLength), mode, bits,
encoders[charsetEncoderIndex].charset());
encoders.getCharset(charsetEncoderIndex));
}
}
@@ -738,7 +646,7 @@ final class MinimalEncoder {
StringBuilder result = new StringBuilder();
result.append(mode).append('(');
if (mode == Mode.ECI) {
result.append(encoders[charsetEncoderIndex].charset().displayName());
result.append(encoders.getCharset(charsetEncoderIndex).displayName());
} else {
result.append(makePrintable(stringToEncode.substring(fromPosition, fromPosition + characterLength)));
}


@@ -19,9 +19,10 @@ package com.google.zxing.datamatrix.encoder;
import junit.framework.ComparisonFailure;
import org.junit.Assert;
import org.junit.Test;
import java.nio.charset.StandardCharsets;
/**
* Tests for {@link HighLevelEncoder}.
* Tests for {@link HighLevelEncoder} and {@link MinimalEncoder}.
*/
public final class HighLevelEncodeTestCase extends Assert {
@@ -111,11 +112,11 @@ public final class HighLevelEncodeTestCase extends Assert {
//with the 16x48 symbol (47 data codewords)
useTestSymbols();
String visualized = encodeHighLevel("AIMAIMAIMAIMAIMAIM");
String visualized = encodeHighLevel("AIMAIMAIMAIMAIMAIM", false);
assertEquals("230 91 11 91 11 91 11 91 11 91 11 91 11", visualized);
//case "a": Unlatch is not required
visualized = encodeHighLevel("AIMAIMAIMAIMAIMAI");
visualized = encodeHighLevel("AIMAIMAIMAIMAIMAI", false);
assertEquals("230 91 11 91 11 91 11 91 11 91 11 90 241", visualized);
//case "b": Add trailing shift 0 and Unlatch is not required
@@ -379,9 +380,153 @@ public final class HighLevelEncodeTestCase extends Assert {
"191 89 191 89 191 254 66 66", visualized);
}
@Test
public void testSizes() {
int[] sizes = new int[2];
encodeHighLevel("A", sizes);
assertEquals(3, sizes[0]);
assertEquals(1, sizes[1]);
encodeHighLevel("AB", sizes);
assertEquals(3, sizes[0]);
assertEquals(2, sizes[1]);
encodeHighLevel("ABC", sizes);
assertEquals(3, sizes[0]);
assertEquals(3, sizes[1]);
encodeHighLevel("ABCD", sizes);
assertEquals(5, sizes[0]);
assertEquals(4, sizes[1]);
encodeHighLevel("ABCDE", sizes);
assertEquals(5, sizes[0]);
assertEquals(5, sizes[1]);
encodeHighLevel("ABCDEF", sizes);
assertEquals(5, sizes[0]);
assertEquals(5, sizes[1]);
encodeHighLevel("ABCDEFG", sizes);
assertEquals(8, sizes[0]);
assertEquals(7, sizes[1]);
encodeHighLevel("ABCDEFGH", sizes);
assertEquals(8, sizes[0]);
assertEquals(7, sizes[1]);
encodeHighLevel("ABCDEFGHI", sizes);
assertEquals(8, sizes[0]);
assertEquals(8, sizes[1]);
encodeHighLevel("ABCDEFGHIJ", sizes);
assertEquals(8, sizes[0]);
assertEquals(8, sizes[1]);
encodeHighLevel("a", sizes);
assertEquals(3, sizes[0]);
assertEquals(1, sizes[1]);
encodeHighLevel("ab", sizes);
assertEquals(3, sizes[0]);
assertEquals(2, sizes[1]);
encodeHighLevel("abc", sizes);
assertEquals(3, sizes[0]);
assertEquals(3, sizes[1]);
encodeHighLevel("abcd", sizes);
assertEquals(5, sizes[0]);
assertEquals(4, sizes[1]);
encodeHighLevel("abcdef", sizes);
assertEquals(5, sizes[0]);
assertEquals(5, sizes[1]);
encodeHighLevel("abcdefg", sizes);
assertEquals(8, sizes[0]);
assertEquals(7, sizes[1]);
encodeHighLevel("abcdefgh", sizes);
assertEquals(8, sizes[0]);
assertEquals(8, sizes[1]);
encodeHighLevel("+", sizes);
assertEquals(3, sizes[0]);
assertEquals(1, sizes[1]);
encodeHighLevel("++", sizes);
assertEquals(3, sizes[0]);
assertEquals(2, sizes[1]);
encodeHighLevel("+++", sizes);
assertEquals(3, sizes[0]);
assertEquals(3, sizes[1]);
encodeHighLevel("++++", sizes);
assertEquals(5, sizes[0]);
assertEquals(4, sizes[1]);
encodeHighLevel("+++++", sizes);
assertEquals(5, sizes[0]);
assertEquals(5, sizes[1]);
encodeHighLevel("++++++", sizes);
assertEquals(8, sizes[0]);
assertEquals(6, sizes[1]);
encodeHighLevel("+++++++", sizes);
assertEquals(8, sizes[0]);
assertEquals(7, sizes[1]);
encodeHighLevel("++++++++", sizes);
assertEquals(8, sizes[0]);
assertEquals(7, sizes[1]);
encodeHighLevel("+++++++++", sizes);
assertEquals(8, sizes[0]);
assertEquals(8, sizes[1]);
encodeHighLevel("\u00F0\u00F0" +
"ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEF", sizes);
assertEquals(114, sizes[0]);
assertEquals(62, sizes[1]);
}
@Test
public void testECIs() {
String visualized = visualize(MinimalEncoder.encodeHighLevel("that particularly stands out to me is \u0625\u0650" +
"\u062C\u064E\u0651\u0627\u0635 (\u02BE\u0101\u1E63) \"pear\", suggested to have originated from Hebrew " +
"\u05D0\u05B7\u05D2\u05B8\u05BC\u05E1 (ag\u00E1s)"));
assertEquals("239 209 151 206 214 92 122 140 35 158 144 162 52 205 55 171 137 23 67 206 218 175 147 113 15 254" +
" 116 33 241 25 231 186 14 212 64 253 151 252 159 33 41 241 27 231 83 171 53 209 35 25 134 6 42 33 35 239 184" +
" 31 193 234 7 252 205 101 127 241 209 34 24 5 22 23 221 148 179 239 128 140 92 187 106 204 198 59 19 25 114" +
" 248 118 36 254 231 106 196 19 239 101 27 107 69 189 112 236 156 252 16 174 125 24 10 125 116 42", visualized);
visualized = visualize(MinimalEncoder.encodeHighLevel("that particularly stands out to me is \u0625\u0650" +
"\u062C\u064E\u0651\u0627\u0635 (\u02BE\u0101\u1E63) \"pear\", suggested to have originated from Hebrew " +
"\u05D0\u05B7\u05D2\u05B8\u05BC\u05E1 (ag\u00E1s)", StandardCharsets.UTF_8, -1 , SymbolShapeHint.FORCE_NONE));
assertEquals("241 27 239 209 151 206 214 92 122 140 35 158 144 162 52 205 55 171 137 23 67 206 218 175 147 113" +
" 15 254 116 33 231 202 33 131 77 154 119 225 163 238 206 28 249 93 36 150 151 53 108 246 145 228 217 71" +
" 199 42 33 35 239 184 31 193 234 7 252 205 101 127 241 209 34 24 5 22 23 221 148 179 239 128 140 92 187 106" +
" 204 198 59 19 25 114 248 118 36 254 231 43 133 212 175 38 220 44 6 125 49 172 93 189 209 111 61 217 203 62" +
" 116 42", visualized);
}
private static void encodeHighLevel(String msg, int[] sizes) {
sizes[0] = HighLevelEncoder.encodeHighLevel(msg).length();
sizes[1] = MinimalEncoder.encodeHighLevel(msg).length();
}
private static String encodeHighLevel(String msg) {
return encodeHighLevel(msg, true);
}
private static String encodeHighLevel(String msg, boolean compareSizeToMinimalEncoder) {
CharSequence encoded = HighLevelEncoder.encodeHighLevel(msg);
CharSequence encoded2 = MinimalEncoder.encodeHighLevel(msg);
assert !compareSizeToMinimalEncoder || encoded2.length() <= encoded.length();
//DecodeHighLevel.decode(encoded);
return visualize(encoded);
}