Hi Gerd, the attached patch makes the Format6Encoder transliterator default to upper case. mkgmap unconditionally makes the call that sets the required mode, but the test environment doesn't.
Ticker On Fri, 2021-11-26 at 18:40 +0000, Gerd Petermann wrote: > Hi Ticker, > > result looks ok, but unit test CodeFunctionsTest fails. Maybe it was > intended that codepage 0 ignores the --lower-case option? > > Gerd
Index: src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java =================================================================== --- src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java (revision 4818) +++ src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java (working copy) @@ -28,7 +28,7 @@ * @author Steve Ratcliffe */ public class BaseEncoder { - private static final Logger log = Logger.getLogger(BaseEncoder.class); + protected static final Logger log = Logger.getLogger(BaseEncoder.class); public static final EncodedText NO_TEXT = new EncodedText(null, 0, null); Index: src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java =================================================================== --- src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java (revision 4818) +++ src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java (working copy) @@ -85,22 +85,10 @@ if (symbol) { symbol = false; c = Format6Encoder.SYMBOLS.charAt(b); - } - else if(lowerCaseOrSeparator) { + } else if (lowerCaseOrSeparator) { lowerCaseOrSeparator = false; - if(b == 0x2b || b == 0x2c) { - c = (char)(b - 0x10); // "thin" separator - } - else if(Character.isLetter(b)) { - // lower case letter - c = Character.toLowerCase(Format6Encoder.LETTERS.charAt(b)); - } - else { - // not a letter so just use as is (could be a digit) - c = Format6Encoder.LETTERS.charAt(b); - } - } - else { + c = Format6Encoder.LOWERCASE.charAt(b); + } else { switch(b) { case 0x1B: // next char is lower case or a separator @@ -112,13 +100,6 @@ symbol = true; return; - case 0x1D: - case 0x1E: - case 0x1F: - // these are separators - use as is - c = (char)b; - break; - default: c = Format6Encoder.LETTERS.charAt(b); break; Index: src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java =================================================================== --- src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java (revision 4818) +++ src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java (working copy) @@ 
-20,9 +20,12 @@ import java.util.Locale; /** - * Format according to the '6 bit' .img format. The text is first upper - * cased. Any letter with a diacritic or accent is replaced with its base - * letter. + * Format according to the '6 bit' .img format. + * Any letter with a diacritic or accent is replaced with its base letter. + * Characters from other alphabets are transliterated if resources/chars/ascii/ data exists. + * Unless set/forceUpperCase, the text is upper-cased by the transliterator. + * NB lower-case is supported but each lower-case letter needs 12 bits, so, with typical OSM data, + * almost any other code-page will be more compact * * For example Körnerstraße would become KORNERSTRASSE, * Řípovská would become RIPOVSKA etc. @@ -35,22 +38,33 @@ */ public class Format6Encoder extends BaseEncoder implements CharacterEncoder { - // This is 0x1b is the source document, but the accompanying code uses - // the value 0x1c, which seems to work. - private static final int SYMBOL_SHIFT = 0x1c; + // Following are swapped in the above John Mechalas document, but this is what works: + private static final int LOWERCASE_SHIFT = 0x1b; + private static final int SYMBOL_SHIFT = 0x1c; public static final String LETTERS = " ABCDEFGHIJKLMNO" + // 0x00-0x0F - "PQRSTUVWXYZxx " + // 0x10-0x1F - "0123456789\u0001\u0002\u0003\u0004\u0005\u0006"; // 0x20-0x2F + "PQRSTUVWXYZxx\u001d\u001e\u001f" + // 0x10-0x1F xx are above SHIFTs. prefix/suffix indicators + "0123456789\u0001\u0002\u0003\u0004\u0005\u0006"; // 0x20-0x2F digits + shields public static final String SYMBOLS = "@!\"#$%&'()*+,-./" + // 0x00-0x0F - "xxxxxxxxxx:;<=>?" + // 0x10-0x1F - "xxxxxxxxxxx[\\]^_"; // 0x20-0x2F + " :;<=>?" + // 0x10-0x1F + "° [\\]^_"; // 0x20-0x2F + // ^ looks like degree (\u00b0) on MapSource/eTrex. 
Won't happen as transliterated to "deg" + // 0123456789abcdef + public static final String LOWERCASE = + "`abcdefghijklmno" + // 0x00-0x0F back-tick + "pqrstuvwxyz{|}~ " + // 0x10-0x1F + " \u001b\u001c "; // 0x20-0x2F more prefix/suffix indicators - private final Transliterator transliterator = new TableTransliterator("ascii"); + private final Transliterator transliterator; + public Format6Encoder(/*String cs, Transliterator transliterator*/) { + this.transliterator = new TableTransliterator("ascii"); + this.transliterator.forceUppercase(true); // for test environment. LBLFile.java will set correctly + } + /** * Encode the text into the 6 bit format. See the class level notes. * @@ -62,7 +76,7 @@ if (text == null || text.isEmpty()) return NO_TEXT; String normalisedText = Normalizer.normalize(text, Normalizer.Form.NFC); - String s = transliterator.transliterate(normalisedText).toUpperCase(Locale.ENGLISH); + String s = transliterator.transliterate(normalisedText); // it does the upper if forceUpper // Allocate more than enough space on average for the label. 
// if you overdo it then it will waste a lot of space , but @@ -78,8 +92,8 @@ put6(buf, off++, c - 'A' + 1); } else if (c >= '0' && c <= '9') { put6(buf, off++, c - '0' + 0x20); - } else if (c == 0x1b || c == 0x1c) { - put6(buf, off++, 0x1b); + } else if (c == 0x1b || c == 0x1c) { // shiftedLowerCase() does same thing + put6(buf, off++, LOWERCASE_SHIFT); put6(buf, off++, c + 0x10); } else if (c >= 0x1d && c <= 0x1f) { put6(buf, off++, c); @@ -86,8 +100,14 @@ } else if (c >= 1 && c <= 6) { // Highway shields put6(buf, off++, 0x29 + c); + } else if (c >= 'a' && c <= 'z') { + put6(buf, off++, LOWERCASE_SHIFT); + put6(buf, off++, c - 'a' + 1); } else { + int rememberOff = off; off = shiftedSymbol(buf, off, c); + if (off == rememberOff) + off = shiftedLowerCase(buf, off, c); } } @@ -119,6 +139,16 @@ return off; } + private int shiftedLowerCase(byte[] buf, int startOffset, char c) { + int off = startOffset; + int ind = LOWERCASE.indexOf(c); + if (ind >= 0) { + put6(buf, off++, LOWERCASE_SHIFT); + put6(buf, off++, ind); + } + return off; + } + /** * Each character is packed into 6 bits. This keeps track of everything so * that the character can be put into the right place in the byte array. @@ -149,4 +179,9 @@ return buf; } + + public void setUpperCase(boolean upperCase) { + super.setUpperCase(upperCase); + transliterator.forceUppercase(upperCase); + } }
_______________________________________________ mkgmap-dev mailing list mkgmap-dev@lists.mkgmap.org.uk https://www.mkgmap.org.uk/mailman/listinfo/mkgmap-dev