mirror of
https://github.com/zxing/zxing.git
synced 2025-03-05 20:48:51 -08:00
Compact QR-Code: Support more character sets (in particular cp1256 (Arabic) and Shift-JIS) (#1453)
* - Added code to suppress the terminator if the capacity of the version is less than 4 bits - Added test case * - Removed code in MinimalEncoder that added Mode.TERMINATOR (this is taken care of in Encoder.terminateBits) - Removed the corresponding test case * Updated test cases * Improved documentation * Changed documentation to not use an example with an unsupported character encoding * Improved wording of comment * - Simplified code - Added space after comma in several places * Added support for more character sets supported by CharacterSetECI * Syntactic enhancements * Changed instantiation of generic types to diamond style * Updated documentation of the QR_COMPACT hint to explain the impact of setting the CHARACTER_SET hint. * Changed whitespace * Removed comment * Fixed typos in comments * Added test cases for KANJI and Shift_JIS encoding * Improved comments on Japanese-language test cases
This commit is contained in:
parent
c729abe393
commit
2e22d09479
|
@ -113,8 +113,10 @@ public enum EncodeHintType {
|
|||
|
||||
/**
|
||||
* Specifies whether to use compact mode for QR code (type {@link Boolean}, or "true" or "false"
|
||||
* When compaction is performed the value for {@link #CHARACTER_SET} is ignored.
|
||||
* {@link String} value).
|
||||
* Please note that when compaction is performed, the most compact character encoding is chosen
|
||||
* for characters in the input that are not in the ISO-8859-1 character set. Based on experience,
|
||||
* some scanners do not support encodings like cp-1256 (Arabic). In such cases the encoding can
|
||||
* be forced to UTF-8 by means of the {@link #CHARACTER_SET} encoding hint.
|
||||
*/
|
||||
QR_COMPACT,
|
||||
|
||||
|
|
|
@ -107,7 +107,7 @@ public final class Encoder {
|
|||
} else {
|
||||
|
||||
// Pick an encoding mode appropriate for the content. Note that this will not attempt to use
|
||||
// multiple modes / segments even if that were more efficient. Twould be nice.
|
||||
// multiple modes / segments even if that were more efficient.
|
||||
mode = chooseMode(content, encoding);
|
||||
|
||||
// This will store the header information, like mode and
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.nio.charset.Charset;
|
|||
import java.nio.charset.CharsetEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import java.nio.charset.UnsupportedCharsetException;
|
||||
|
||||
|
@ -52,7 +53,7 @@ import java.nio.charset.UnsupportedCharsetException;
|
|||
* In multi language content the algorithm selects the most compact representation using ECI modes.
|
||||
* For example the most compact representation of the string "\u0150\u015C" (O-double-acute, S-circumflex) is
|
||||
* ECI(UTF-8), BYTE(\u0150\u015C) while prepending one or more times the same leading character as in
|
||||
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
|
||||
* "\u0150\u0150\u015C", the most compact representation uses two ECIs so that the string is encoded as
|
||||
* ECI(ISO-8859-2), BYTE(\u0150\u0150), ECI(ISO-8859-3), BYTE(\u015C).
|
||||
*
|
||||
* @author Alex Geller
|
||||
|
@ -75,6 +76,45 @@ final class MinimalEncoder {
|
|||
}
|
||||
}
|
||||
|
||||
// List of encoders that potentially encode characters not in ISO-8859-1 in one byte.
|
||||
private static final List<CharsetEncoder> ENCODERS = new ArrayList<>();
|
||||
static {
|
||||
final String[] names = { "ISO-8859-2",
|
||||
"ISO-8859-3",
|
||||
"ISO-8859-4",
|
||||
"ISO-8859-5",
|
||||
"ISO-8859-6",
|
||||
"ISO-8859-7",
|
||||
"ISO-8859-8",
|
||||
"ISO-8859-9",
|
||||
"ISO-8859-10",
|
||||
"ISO-8859-11",
|
||||
"ISO-8859-13",
|
||||
"ISO-8859-14",
|
||||
"ISO-8859-15",
|
||||
"ISO-8859-16",
|
||||
"windows-1250",
|
||||
"windows-1251",
|
||||
"windows-1252",
|
||||
"windows-1253",
|
||||
"windows-1254",
|
||||
"windows-1255",
|
||||
"windows-1256",
|
||||
"windows-1257",
|
||||
"windows-1258",
|
||||
"Shift_JIS" };
|
||||
for (String name : names) {
|
||||
if (CharacterSetECI.getCharacterSetECIByName(name) != null) {
|
||||
try {
|
||||
ENCODERS.add(Charset.forName(name).newEncoder());
|
||||
} catch (UnsupportedCharsetException e) {
|
||||
// continue
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private final String stringToEncode;
|
||||
private final boolean isGS1;
|
||||
private final CharsetEncoder[] encoders;
|
||||
|
@ -100,72 +140,42 @@ final class MinimalEncoder {
|
|||
this.isGS1 = isGS1;
|
||||
this.ecLevel = ecLevel;
|
||||
|
||||
CharsetEncoder[] isoEncoders = new CharsetEncoder[15]; // room for the 15 ISO-8859 charsets 1 through 16.
|
||||
isoEncoders[0] = StandardCharsets.ISO_8859_1.newEncoder();
|
||||
List<CharsetEncoder> neededEncoders = new ArrayList<>();
|
||||
neededEncoders.add(StandardCharsets.ISO_8859_1.newEncoder());
|
||||
boolean needUnicodeEncoder = priorityCharset != null && priorityCharset.name().startsWith("UTF");
|
||||
|
||||
for (int i = 0; i < stringToEncode.length(); i++) {
|
||||
int cnt = 0;
|
||||
int j;
|
||||
for (j = 0; j < 15; j++) {
|
||||
if (isoEncoders[j] != null) {
|
||||
cnt++;
|
||||
if (isoEncoders[j].canEncode(stringToEncode.charAt(i))) {
|
||||
boolean canEncode = false;
|
||||
for (CharsetEncoder encoder : neededEncoders) {
|
||||
if (encoder.canEncode(stringToEncode.charAt(i))) {
|
||||
canEncode = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!canEncode) {
|
||||
for (CharsetEncoder encoder : ENCODERS) {
|
||||
if (encoder.canEncode(stringToEncode.charAt(i))) {
|
||||
neededEncoders.add(encoder);
|
||||
canEncode = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (cnt == 14) { // we need all. Can stop looking further.
|
||||
break;
|
||||
}
|
||||
|
||||
if (j >= 15) { // no encoder found
|
||||
for (j = 0; j < 15; j++) {
|
||||
if (j != 11 && isoEncoders[j] == null) { // ISO-8859-12 doesn't exist
|
||||
try {
|
||||
CharsetEncoder ce = Charset.forName("ISO-8859-" + (j + 1)).newEncoder();
|
||||
if (ce.canEncode(stringToEncode.charAt(i))) {
|
||||
isoEncoders[j] = ce;
|
||||
break;
|
||||
}
|
||||
} catch (UnsupportedCharsetException e) {
|
||||
// continue
|
||||
}
|
||||
}
|
||||
}
|
||||
if (j >= 15) {
|
||||
if (!StandardCharsets.UTF_16BE.newEncoder().canEncode(stringToEncode.charAt(i))) {
|
||||
throw new WriterException("Can not encode character \\u" +
|
||||
String.format("%04X", (int) stringToEncode.charAt(i)) + " at position " + i +
|
||||
" in input \"" + stringToEncode + "\"");
|
||||
}
|
||||
needUnicodeEncoder = true;
|
||||
}
|
||||
if (!canEncode) {
|
||||
needUnicodeEncoder = true;
|
||||
}
|
||||
}
|
||||
|
||||
int numberOfEncoders = 0;
|
||||
for (int j = 0; j < 15; j++) {
|
||||
if (isoEncoders[j] != null) {
|
||||
if (CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
|
||||
numberOfEncoders++;
|
||||
} else {
|
||||
needUnicodeEncoder = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numberOfEncoders == 1 && !needUnicodeEncoder) {
|
||||
if (neededEncoders.size() == 1 && !needUnicodeEncoder) {
|
||||
encoders = new CharsetEncoder[1];
|
||||
encoders[0] = isoEncoders[0];
|
||||
encoders[0] = neededEncoders.get(0);
|
||||
} else {
|
||||
encoders = new CharsetEncoder[numberOfEncoders + 2];
|
||||
encoders = new CharsetEncoder[neededEncoders.size() + 2];
|
||||
int index = 0;
|
||||
for (int j = 0; j < 15; j++) {
|
||||
if (isoEncoders[j] != null && CharacterSetECI.getCharacterSetECI(isoEncoders[j].charset()) != null) {
|
||||
encoders[index++] = isoEncoders[j];
|
||||
}
|
||||
for (CharsetEncoder encoder : neededEncoders) {
|
||||
encoders[index++] = encoder;
|
||||
}
|
||||
|
||||
encoders[index] = StandardCharsets.UTF_8.newEncoder();
|
||||
|
@ -304,7 +314,7 @@ final class MinimalEncoder {
|
|||
}
|
||||
}
|
||||
|
||||
void addEdge(ArrayList<Edge>[][][] edges, int position, Edge edge) {
|
||||
void addEdge(List<Edge>[][][] edges, int position, Edge edge) {
|
||||
int vertexIndex = position + edge.characterLength;
|
||||
if (edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] == null) {
|
||||
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)] = new ArrayList<>();
|
||||
|
@ -312,7 +322,7 @@ final class MinimalEncoder {
|
|||
edges[vertexIndex][edge.charsetEncoderIndex][getCompactedOrdinal(edge.mode)].add(edge);
|
||||
}
|
||||
|
||||
void addEdges(Version version, ArrayList<Edge>[][][] edges, int from, Edge previous) {
|
||||
void addEdges(Version version, List<Edge>[][][] edges, int from, Edge previous) {
|
||||
int start = 0;
|
||||
int end = encoders.length;
|
||||
if (priorityEncoderIndex >= 0 && encoders[priorityEncoderIndex].canEncode(stringToEncode.charAt(from))) {
|
||||
|
@ -345,7 +355,7 @@ final class MinimalEncoder {
|
|||
ResultList encodeSpecificVersion(Version version) throws WriterException {
|
||||
|
||||
@SuppressWarnings("checkstyle:lineLength")
|
||||
/* A vertex represents a tuple of a position in the input, a mode and an a character encoding where position 0
|
||||
/* A vertex represents a tuple of a position in the input, a mode and a character encoding where position 0
|
||||
* denotes the position left of the first character, 1 the position left of the second character and so on.
|
||||
* Likewise the end vertices are located after the last character at position stringToEncode.length().
|
||||
*
|
||||
|
@ -463,7 +473,7 @@ final class MinimalEncoder {
|
|||
// The last dimension in the array below encodes the 4 modes KANJI, ALPHANUMERIC, NUMERIC and BYTE via the
|
||||
// function getCompactedOrdinal(Mode)
|
||||
@SuppressWarnings("unchecked")
|
||||
ArrayList<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
|
||||
List<Edge>[][][] edges = new ArrayList[inputLength + 1][encoders.length][4];
|
||||
addEdges(version, edges, 0, null);
|
||||
|
||||
for (int i = 1; i <= inputLength; i++) {
|
||||
|
@ -471,7 +481,7 @@ final class MinimalEncoder {
|
|||
for (int k = 0; k < 4; k++) {
|
||||
Edge minimalEdge;
|
||||
if (edges[i][j][k] != null) {
|
||||
ArrayList<Edge> localEdges = edges[i][j][k];
|
||||
List<Edge> localEdges = edges[i][j][k];
|
||||
int minimalIndex = -1;
|
||||
int minimalSize = Integer.MAX_VALUE;
|
||||
for (int l = 0; l < localEdges.size(); l++) {
|
||||
|
@ -499,7 +509,7 @@ final class MinimalEncoder {
|
|||
for (int j = 0; j < encoders.length; j++) {
|
||||
for (int k = 0; k < 4; k++) {
|
||||
if (edges[inputLength][j][k] != null) {
|
||||
ArrayList<Edge> localEdges = edges[inputLength][j][k];
|
||||
List<Edge> localEdges = edges[inputLength][j][k];
|
||||
assert localEdges.size() == 1;
|
||||
Edge edge = localEdges.get(0);
|
||||
if (edge.cachedTotalSize < minimalSize) {
|
||||
|
|
|
@ -882,6 +882,29 @@ public final class EncoderTestCase extends Assert {
|
|||
true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMinimalEncoder42() throws Exception {
|
||||
// test halfwidth Katakana character (they are single byte encoded in Shift_JIS)
|
||||
verifyMinimalEncoding("Katakana:\uFF66\uFF66\uFF66\uFF66\uFF66\uFF66", "ECI(Shift_JIS),BYTE(Katakana:......)", null
|
||||
, false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMinimalEncoder43() throws Exception {
|
||||
// The character \u30A2 encodes as double byte in Shift_JIS so KANJI is more compact in this case
|
||||
verifyMinimalEncoding("Katakana:\u30A2\u30A2\u30A2\u30A2\u30A2\u30A2", "BYTE(Katakana:),KANJI(......)", null,
|
||||
false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMinimalEncoder44() throws Exception {
|
||||
// The character \u30A2 encodes as double byte in Shift_JIS but KANJI is not more compact in this case because
|
||||
// KANJI is only more compact when it encodes pairs of characters. In the case of mixed text it can however be
|
||||
// that Shift_JIS encoding is more compact as in this example
|
||||
verifyMinimalEncoding("Katakana:\u30A2a\u30A2a\u30A2a\u30A2a\u30A2a\u30A2", "ECI(Shift_JIS),BYTE(Katakana:.a.a.a" +
|
||||
".a.a.)", null, false);
|
||||
}
|
||||
|
||||
static void verifyMinimalEncoding(String input, String expectedResult, Charset priorityCharset, boolean isGS1)
|
||||
throws Exception {
|
||||
MinimalEncoder.ResultList result = MinimalEncoder.encode(input, null, priorityCharset, isGS1,
|
||||
|
|
Loading…
Reference in a new issue