Compact QR-Code: Support more character sets (in particular cp1256 (Arabic) and Shift-JIS) (#1453)

* - added code to suppress the terminator if the capacity of the version is fewer than 4 bits
- added test case
- added test case

* - Removed code in MinimalEncoder that added Mode.TERMINATOR (is taken care of in Encoder.terminateBits)
- Removed the corresponding test case

* Updated test cases

* Improved documentation

* Changed documentation to not use an example with an unsupported character encoding

* Improved wording of comment

* - Simplified code
- Added space after comma in several places

* Added support for more character sets supported by CharacterSetECI

* Syntactic enhancements

* Changed instantiation of generic types to diamond style

* Updated documentation of the QR_COMPACT hint to explain the impact of setting the CHARACTER_SET hint.

* Changed whitespace

* Removed comment

* Fixed typos in comments

* Added test cases for KANJI and Shift_JIS encoding

* Improved comments on Japanese language test cases
This commit is contained in:
AlexGeller1 2021-10-21 17:10:28 +02:00 committed by GitHub
parent c729abe393
commit 2e22d09479
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 62 deletions

View file

@ -113,8 +113,10 @@ public enum EncodeHintType {
/** /**
* Specifies whether to use compact mode for QR code (type {@link Boolean}, or "true" or "false" * Specifies whether to use compact mode for QR code (type {@link Boolean}, or "true" or "false"
* When compaction is performed the value for {@link #CHARACTER_SET} is ignored. * Please note that when compaction is performed, the most compact character encoding is chosen
* {@link String} value). * for characters in the input that are not in the ISO-8859-1 character set. Based on experience,
* some scanners do not support encodings like cp-1256 (Arabic). In such cases the encoding can
* be forced to UTF-8 by means of the {@link #CHARACTER_SET} encoding hint.
*/ */
QR_COMPACT, QR_COMPACT,

View file

@ -107,7 +107,7 @@ public final class Encoder {
} else { } else {
// Pick an encoding mode appropriate for the content. Note that this will not attempt to use // Pick an encoding mode appropriate for the content. Note that this will not attempt to use
// multiple modes / segments even if that were more efficient. Twould be nice. // multiple modes / segments even if that were more efficient.
mode = chooseMode(content, encoding); mode = chooseMode(content, encoding);
// This will store the header information, like mode and // This will store the header information, like mode and

View file

@ -27,6 +27,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder; import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
import java.nio.charset.UnsupportedCharsetException; import java.nio.charset.UnsupportedCharsetException;
@ -52,7 +53,7 @@ import java.nio.charset.UnsupportedCharsetException;
* In multi language content the algorithm selects the most compact representation using ECI modes. * In multi language content the algorithm selects the most compact representation using ECI modes.
* For example the most compact representation of the string "\u0150\u015C" (O-double-acute, S-circumflex) is * For example the most compact representation of the string "\u0150\u015C" (O-double-acute, S-circumflex) is
* ECI(UTF-8), BYTE(\u0150\u015C) while prepending one or more times the same leading character as in * ECI(UTF-8), BYTE(\u0150\u015C) while prepending one or more times the same leading character as in
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as * "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
* ECI(ISO-8859-2), BYTE(\u0150\u0150), ECI(ISO-8859-3), BYTE(\u015C). * ECI(ISO-8859-2), BYTE(\u0150\u0150), ECI(ISO-8859-3), BYTE(\u015C).
* *
* @author Alex Geller * @author Alex Geller
@ -75,6 +76,45 @@ final class MinimalEncoder {
} }
} }
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
static {
final String[] names = { "ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-11",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"Shift_JIS" };
for (String name : names) {
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
try {
ENCODERS.add(Charset.forName(name).newEncoder());
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
}
private final String stringToEncode; private final String stringToEncode;
private final boolean isGS1; private final boolean isGS1;
private final CharsetEncoder[] encoders; private final CharsetEncoder[] encoders;
@ -100,72 +140,42 @@ final class MinimalEncoder {
this.isGS1 = isGS1; this.isGS1 = isGS1;
this.ecLevel = ecLevel; this.ecLevel = ecLevel;
CharsetEncoder[] isoEncoders = new CharsetEncoder[15]; // room for the 15 ISO-8859 charsets 1 through 16. List<CharsetEncoder> neededEncoders = new ArrayList<>();
isoEncoders[0] = StandardCharsets.ISO_8859_1.newEncoder(); neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF"); boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
for (int i = 0; i < stringToEncode.length(); i++) { for (int i = 0; i < stringToEncode.length(); i++) {
int cnt = 0; boolean canEncode = false;
int j; for (CharsetEncoder encoder : neededEncoders) {
for (j = 0; j < 15; j++) { if (encoder.canEncode(stringToEncode.charAt(i))) {
if (isoEncoders[j] != null) { canEncode = true;
cnt++; break;
if (isoEncoders[j].canEncode(stringToEncode.charAt(i))) { }
}
if (!canEncode) {
for (CharsetEncoder encoder : ENCODERS) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
neededEncoders.add(encoder);
canEncode = true;
break; break;
} }
} }
} }
if (cnt == 14) { // we need all. Can stop looking further. if (!canEncode) {
break; needUnicodeEncoder = true;
}
if (j >= 15) { // no encoder found
for (j = 0; j < 15; j++) {
if (j != 11 && isoEncoders[j] == null) { // ISO-8859-12 doesn't exist
try {
CharsetEncoder ce = Charset.forName("ISO-8859-" + (j + 1)).newEncoder();
if (ce.canEncode(stringToEncode.charAt(i))) {
isoEncoders[j] = ce;
break;
}
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
if (j >= 15) {
if (!StandardCharsets.UTF_16BE.newEncoder().canEncode(stringToEncode.charAt(i))) {
throw new WriterException("Can not encode character \\u" +
String.format("%04X", (int) stringToEncode.charAt(i)) + " at position " + i +
" in input \"" + stringToEncode + "\"");
}
needUnicodeEncoder = true;
}
} }
} }
int numberOfEncoders = 0; if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
for (int j = 0; j < 15; j++) {
if (isoEncoders[j] != null) {
if (CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
numberOfEncoders++;
} else {
needUnicodeEncoder = true;
}
}
}
if (numberOfEncoders == 1 && !needUnicodeEncoder) {
encoders = new CharsetEncoder[1]; encoders = new CharsetEncoder[1];
encoders[0] = isoEncoders[0]; encoders[0] = neededEncoders.get(0);
} else { } else {
encoders = new CharsetEncoder[numberOfEncoders + 2]; encoders = new CharsetEncoder[neededEncoders.size() + 2];
int index = 0; int index = 0;
for (int j = 0; j < 15; j++) { for (CharsetEncoder encoder : neededEncoders) {
if (isoEncoders[j] != null && CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) { encoders[index++] = encoder;
encoders[index++] = isoEncoders[j];
}
} }
encoders[index] = StandardCharsets.UTF_8.newEncoder(); encoders[index] = StandardCharsets.UTF_8.newEncoder();
@ -304,7 +314,7 @@ final class MinimalEncoder {
} }
} }
void addEdge(ArrayList<Edge>[][][] edges, int position, Edge edge) { void addEdge(List<Edge>[][][] edges, int position, Edge edge) {
int vertexIndex = position + edge.characterLength; int vertexIndex = position + edge.characterLength;
if (edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] == null) { if (edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] == null) {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] = new ArrayList<>(); edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] = new ArrayList<>();
@ -312,7 +322,7 @@ final class MinimalEncoder {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)].add(edge); edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)].add(edge);
} }
void addEdges(Version version, ArrayList<Edge>[][][] edges, int from, Edge previous) { void addEdges(Version version, List<Edge>[][][] edges, int from, Edge previous) {
int start = 0; int start = 0;
int end = encoders.length; int end = encoders.length;
if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) { if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) {
@ -345,7 +355,7 @@ final class MinimalEncoder {
ResultList encodeSpecificVersion(Version version) throws WriterException { ResultList encodeSpecificVersion(Version version) throws WriterException {
@SuppressWarnings("checkstyle:lineLength") @SuppressWarnings("checkstyle:lineLength")
/* A vertex represents a tuple of a position in the input, a mode and an a character encoding where position 0 /* A vertex represents a tuple of a position in the input, a mode and a character encoding where position 0
* denotes the position left of the first character, 1 the position left of the second character and so on. * denotes the position left of the first character, 1 the position left of the second character and so on.
* Likewise the end vertices are located after the last character at position stringToEncode.length(). * Likewise the end vertices are located after the last character at position stringToEncode.length().
* *
@ -463,7 +473,7 @@ final class MinimalEncoder {
// The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the // The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the
// function getCompactedOrdinal(Mode) // function getCompactedOrdinal(Mode)
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
ArrayList<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4]; List<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
addEdges(version, edges, 0, null); addEdges(version, edges, 0, null);
for (int i = 1; i <= inputLength; i++) { for (int i = 1; i <= inputLength; i++) {
@ -471,7 +481,7 @@ final class MinimalEncoder {
for (int k = 0; k < 4; k++) { for (int k = 0; k < 4; k++) {
Edge minimalEdge; Edge minimalEdge;
if (edges[i][j][k] != null) { if (edges[i][j][k] != null) {
ArrayList<Edge> localEdges = edges[i][j][k]; List<Edge> localEdges = edges[i][j][k];
int minimalIndex = -1; int minimalIndex = -1;
int minimalSize = Integer.MAX_VALUE; int minimalSize = Integer.MAX_VALUE;
for (int l = 0; l < localEdges.size(); l++) { for (int l = 0; l < localEdges.size(); l++) {
@ -499,7 +509,7 @@ final class MinimalEncoder {
for (int j = 0; j < encoders.length; j++) { for (int j = 0; j < encoders.length; j++) {
for (int k = 0; k < 4; k++) { for (int k = 0; k < 4; k++) {
if (edges[inputLength][j][k] != null) { if (edges[inputLength][j][k] != null) {
ArrayList<Edge> localEdges = edges[inputLength][j][k]; List<Edge> localEdges = edges[inputLength][j][k];
assert localEdges.size() == 1; assert localEdges.size() == 1;
Edge edge = localEdges.get(0); Edge edge = localEdges.get(0);
if (edge.cachedTotalSize < minimalSize) { if (edge.cachedTotalSize < minimalSize) {

View file

@ -882,6 +882,29 @@ public final class EncoderTestCase extends Assert {
true); true);
} }
@Test
public void testMinimalEncoder42() throws Exception {
// test halfwidth Katakana character (they are single byte encoded in Shift_JIS)
verifyMinimalEncoding("Katakana:\uFF66\uFF66\uFF66\uFF66\uFF66\uFF66", "ECI(Shift_JIS),BYTE(Katakana:......)", null
, false);
}
@Test
public void testMinimalEncoder43() throws Exception {
// The character \u30A2 encodes as double byte in Shift_JIS so KANJI is more compact in this case
verifyMinimalEncoding("Katakana:\u30A2\u30A2\u30A2\u30A2\u30A2\u30A2", "BYTE(Katakana:),KANJI(......)", null,
false);
}
@Test
public void testMinimalEncoder44() throws Exception {
// The character \u30A2 encodes as double byte in Shift_JIS but KANJI is not more compact in this case because
// KANJI is only more compact when it encodes pairs of characters. In the case of mixed text it can however be
// that Shift_JIS encoding is more compact as in this example
verifyMinimalEncoding("Katakana:\u30A2a\u30A2a\u30A2a\u30A2a\u30A2a\u30A2", "ECI(Shift_JIS),BYTE(Katakana:.a.a.a" +
".a.a.)", null, false);
}
static void verifyMinimalEncoding(String input, String expectedResult, Charset priorityCharset, boolean isGS1) static void verifyMinimalEncoding(String input, String expectedResult, Charset priorityCharset, boolean isGS1)
throws Exception { throws Exception {
MinimalEncoder.ResultList result = MinimalEncoder.encode(input, null, priorityCharset, isGS1, MinimalEncoder.ResultList result = MinimalEncoder.encode(input, null, priorityCharset, isGS1,