Compact QR-Code: Support more character sets (in particular cp1256 (Arabic) and Shift-JIS) (#1453)

* - added code to suppress the terminator if the capacity of the version is less than 4 bits
- added test case

* - Removed code in MinimalEncoder that added Mode.TERMINATOR (is taken care of in Encoder.terminateBits)
- Removed the corresponding test case

* Updated test cases

* Improved documentation

* Changed documentation to not use an example with an unsupported character encoding

* Improved wording of comment

* - Simplified code
- Added space after comma in several places

* Added support for more character sets supported by CharacterSetECI

* Syntactic enhancements

* Changed instantiation of generic types to diamond style

* Updated documentation of the QR_COMPACT hint to explain the impact of setting the CHARACTER_SET hint.

* Changed whitespace

* Removed comment

* Fixed typos in comments

* Added test cases for KANJI and Shift_JIS encoding

* Improved comments on Japanese language test cases
This commit is contained in:
AlexGeller1 2021-10-21 17:10:28 +02:00 committed by GitHub
parent c729abe393
commit 2e22d09479
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 62 deletions

View file

@ -113,8 +113,10 @@ public enum EncodeHintType {
/**
* Specifies whether to use compact mode for QR code (type {@link Boolean}, or "true" or "false"
* When compaction is performed the value for {@link #CHARACTER_SET} is ignored.
* {@link String} value).
* Please note that when compaction is performed, the most compact character encoding is chosen
* for characters in the input that are not in the ISO-8859-1 character set. Based on experience,
* some scanners do not support encodings like cp-1256 (Arabic). In such cases the encoding can
* be forced to UTF-8 by means of the {@link #CHARACTER_SET} encoding hint.
*/
QR_COMPACT,

View file

@ -107,7 +107,7 @@ public final class Encoder {
} else {
// Pick an encoding mode appropriate for the content. Note that this will not attempt to use
// multiple modes / segments even if that were more efficient. Twould be nice.
// multiple modes / segments even if that were more efficient.
mode = chooseMode(content, encoding);
// This will store the header information, like mode and

View file

@ -27,6 +27,7 @@ import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.nio.charset.UnsupportedCharsetException;
@ -52,7 +53,7 @@ import java.nio.charset.UnsupportedCharsetException;
* In multi language content the algorithm selects the most compact representation using ECI modes.
* For example the most compact representation of the string "\u0150\u015C" (O-double-acute, S-circumflex) is
* ECI(UTF-8), BYTE(\u0150\u015C) while prepending one or more times the same leading character as in
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
* ECI(ISO-8859-2), BYTE(\u0150\u0150), ECI(ISO-8859-3), BYTE(\u015C).
*
* @author Alex Geller
@ -75,6 +76,45 @@ final class MinimalEncoder {
}
}
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
// ISO-8859-1 itself is handled separately (see the constructor), so it is not listed here.
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
static {
// Candidate single-byte-capable charsets, in preference order. Note there is no ISO-8859-12.
// Shift_JIS is included because it encodes halfwidth Katakana as single bytes.
final String[] names = { "ISO-8859-2",
"ISO-8859-3",
"ISO-8859-4",
"ISO-8859-5",
"ISO-8859-6",
"ISO-8859-7",
"ISO-8859-8",
"ISO-8859-9",
"ISO-8859-10",
"ISO-8859-11",
"ISO-8859-13",
"ISO-8859-14",
"ISO-8859-15",
"ISO-8859-16",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"Shift_JIS" };
for (String name : names) {
// Only charsets that have a CharacterSetECI designator can be signaled in a QR code.
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
try {
ENCODERS.add(Charset.forName(name).newEncoder());
} catch (UnsupportedCharsetException e) {
// continue: the JVM may not support every candidate charset; skip silently.
}
}
}
}
private final String stringToEncode;
private final boolean isGS1;
private final CharsetEncoder[] encoders;
@ -100,72 +140,42 @@ final class MinimalEncoder {
this.isGS1 = isGS1;
this.ecLevel = ecLevel;
CharsetEncoder[] isoEncoders = new CharsetEncoder[15]; // room for the 15 ISO-8859 charsets 1 through 16.
isoEncoders[0] = StandardCharsets.ISO_8859_1.newEncoder();
List<CharsetEncoder> neededEncoders = new ArrayList<>();
neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
for (int i = 0; i < stringToEncode.length(); i++) {
int cnt = 0;
int j;
for (j = 0; j < 15; j++) {
if (isoEncoders[j] != null) {
cnt++;
if (isoEncoders[j].canEncode(stringToEncode.charAt(i))) {
boolean canEncode = false;
for (CharsetEncoder encoder : neededEncoders) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
canEncode = true;
break;
}
}
if (!canEncode) {
for (CharsetEncoder encoder : ENCODERS) {
if (encoder.canEncode(stringToEncode.charAt(i))) {
neededEncoders.add(encoder);
canEncode = true;
break;
}
}
}
if (cnt == 14) { // we need all. Can stop looking further.
break;
}
if (j >= 15) { // no encoder found
for (j = 0; j < 15; j++) {
if (j != 11 && isoEncoders[j] == null) { // ISO-8859-12 doesn't exist
try {
CharsetEncoder ce = Charset.forName("ISO-8859-" + (j + 1)).newEncoder();
if (ce.canEncode(stringToEncode.charAt(i))) {
isoEncoders[j] = ce;
break;
}
} catch (UnsupportedCharsetException e) {
// continue
}
}
}
if (j >= 15) {
if (!StandardCharsets.UTF_16BE.newEncoder().canEncode(stringToEncode.charAt(i))) {
throw new WriterException("Can not encode character \\u" +
String.format("%04X", (int) stringToEncode.charAt(i)) + " at position " + i +
" in input \"" + stringToEncode + "\"");
}
needUnicodeEncoder = true;
}
if (!canEncode) {
needUnicodeEncoder = true;
}
}
int numberOfEncoders = 0;
for (int j = 0; j < 15; j++) {
if (isoEncoders[j] != null) {
if (CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
numberOfEncoders++;
} else {
needUnicodeEncoder = true;
}
}
}
if (numberOfEncoders == 1 && !needUnicodeEncoder) {
if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
encoders = new CharsetEncoder[1];
encoders[0] = isoEncoders[0];
encoders[0] = neededEncoders.get(0);
} else {
encoders = new CharsetEncoder[numberOfEncoders + 2];
encoders = new CharsetEncoder[neededEncoders.size() + 2];
int index = 0;
for (int j = 0; j < 15; j++) {
if (isoEncoders[j] != null && CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
encoders[index++] = isoEncoders[j];
}
for (CharsetEncoder encoder : neededEncoders) {
encoders[index++] = encoder;
}
encoders[index] = StandardCharsets.UTF_8.newEncoder();
@ -304,7 +314,7 @@ final class MinimalEncoder {
}
}
void addEdge(ArrayList<Edge>[][][] edges, int position, Edge edge) {
void addEdge(List<Edge>[][][] edges, int position, Edge edge) {
int vertexIndex = position + edge.characterLength;
if (edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] == null) {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] = new ArrayList<>();
@ -312,7 +322,7 @@ final class MinimalEncoder {
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)].add(edge);
}
void addEdges(Version version, ArrayList<Edge>[][][] edges, int from, Edge previous) {
void addEdges(Version version, List<Edge>[][][] edges, int from, Edge previous) {
int start = 0;
int end = encoders.length;
if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) {
@ -345,7 +355,7 @@ final class MinimalEncoder {
ResultList encodeSpecificVersion(Version version) throws WriterException {
@SuppressWarnings("checkstyle:lineLength")
/* A vertex represents a tuple of a position in the input, a mode and an a character encoding where position 0
/* A vertex represents a tuple of a position in the input, a mode and a character encoding where position 0
* denotes the position left of the first character, 1 the position left of the second character and so on.
* Likewise the end vertices are located after the last character at position stringToEncode.length().
*
@ -463,7 +473,7 @@ final class MinimalEncoder {
// The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the
// function getCompactedOrdinal(Mode)
@SuppressWarnings("unchecked")
ArrayList<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
List<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
addEdges(version, edges, 0, null);
for (int i = 1; i <= inputLength; i++) {
@ -471,7 +481,7 @@ final class MinimalEncoder {
for (int k = 0; k < 4; k++) {
Edge minimalEdge;
if (edges[i][j][k] != null) {
ArrayList<Edge> localEdges = edges[i][j][k];
List<Edge> localEdges = edges[i][j][k];
int minimalIndex = -1;
int minimalSize = Integer.MAX_VALUE;
for (int l = 0; l < localEdges.size(); l++) {
@ -499,7 +509,7 @@ final class MinimalEncoder {
for (int j = 0; j < encoders.length; j++) {
for (int k = 0; k < 4; k++) {
if (edges[inputLength][j][k] != null) {
ArrayList<Edge> localEdges = edges[inputLength][j][k];
List<Edge> localEdges = edges[inputLength][j][k];
assert localEdges.size() == 1;
Edge edge = localEdges.get(0);
if (edge.cachedTotalSize < minimalSize) {

View file

@ -882,6 +882,29 @@ public final class EncoderTestCase extends Assert {
true);
}
@Test
public void testMinimalEncoder42() throws Exception {
// Halfwidth Katakana characters (here U+FF66) are single-byte encoded in Shift_JIS, so the
// minimal encoding is expected to choose an ECI(Shift_JIS) BYTE segment over KANJI mode.
verifyMinimalEncoding("Katakana:\uFF66\uFF66\uFF66\uFF66\uFF66\uFF66", "ECI(Shift_JIS),BYTE(Katakana:......)", null
, false);
}
@Test
public void testMinimalEncoder43() throws Exception {
// The character \u30A2 (fullwidth Katakana A) encodes as a double byte in Shift_JIS,
// so KANJI mode is more compact in this case and is expected to be chosen.
verifyMinimalEncoding("Katakana:\u30A2\u30A2\u30A2\u30A2\u30A2\u30A2", "BYTE(Katakana:),KANJI(......)", null,
false);
}
@Test
public void testMinimalEncoder44() throws Exception {
// The character \u30A2 encodes as a double byte in Shift_JIS but KANJI is not more compact here
// because KANJI only pays off when it encodes runs of double-byte characters. With mixed
// text (alternating \u30A2 and ASCII 'a'), a single ECI(Shift_JIS) BYTE segment is expected
// to be more compact than switching modes, as this example demonstrates.
verifyMinimalEncoding("Katakana:\u30A2a\u30A2a\u30A2a\u30A2a\u30A2a\u30A2", "ECI(Shift_JIS),BYTE(Katakana:.a.a.a" +
".a.a.)", null, false);
}
static void verifyMinimalEncoding(String input, String expectedResult, Charset priorityCharset, boolean isGS1)
throws Exception {
MinimalEncoder.ResultList result = MinimalEncoder.encode(input, null, priorityCharset, isGS1,