Re: [mkgmap-dev] Format6Encoder/Decoder

Ticker Berkin Fri, 26 Nov 2021 13:19:10 -0800

Hi Gerd

The attached patch makes the Format6Encoder transliterator default to UPPER case.
mkgmap always sets the required mode explicitly, but the test environment
doesn't, so the unit test needs this default.


Ticker

On Fri, 2021-11-26 at 18:40 +0000, Gerd Petermann wrote:
> Hi Ticker,
> 
> result looks ok, but unit test CodeFunctionsTest fails. Maybe it was
> intended that codepage 0 ignores the --lower-case option?
> 
> Gerd

Index: src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java	(revision 4818)
+++ src/uk/me/parabola/imgfmt/app/labelenc/BaseEncoder.java	(working copy)
@@ -28,7 +28,7 @@
  * @author Steve Ratcliffe
  */
 public class BaseEncoder {
-	private static final Logger log = Logger.getLogger(BaseEncoder.class);
+	protected static final Logger log = Logger.getLogger(BaseEncoder.class);
 
 	public static final EncodedText NO_TEXT = new EncodedText(null, 0, null);
 
Index: src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java	(revision 4818)
+++ src/uk/me/parabola/imgfmt/app/labelenc/Format6Decoder.java	(working copy)
@@ -85,22 +85,10 @@
 		if (symbol) {
 			symbol = false;
 			c = Format6Encoder.SYMBOLS.charAt(b);
-		}
-		else if(lowerCaseOrSeparator) {
+		} else if (lowerCaseOrSeparator) {
 			lowerCaseOrSeparator = false;
-			if(b == 0x2b || b == 0x2c) {
-				c = (char)(b - 0x10); // "thin" separator
-			}
-			else if(Character.isLetter(b)) {
-				// lower case letter
-				c = Character.toLowerCase(Format6Encoder.LETTERS.charAt(b));
-			}
-			else {
-				// not a letter so just use as is (could be a digit)
-				c = Format6Encoder.LETTERS.charAt(b);
-			}
-		}
-		else {
+			c = Format6Encoder.LOWERCASE.charAt(b);
+		} else {
 			switch(b) {
 			case 0x1B:
 				// next char is lower case or a separator
@@ -112,13 +100,6 @@
 				symbol = true;
 				return;
 
-			case 0x1D:
-			case 0x1E:
-			case 0x1F:
-				// these are separators - use as is
-				c = (char)b;
-				break;
-
 			default:
 				c = Format6Encoder.LETTERS.charAt(b);
 				break;
Index: src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java	(revision 4818)
+++ src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java	(working copy)
@@ -20,9 +20,12 @@
 import java.util.Locale;
 
 /**
- * Format according to the '6 bit' .img format.  The text is first upper
- * cased.  Any letter with a diacritic or accent is replaced with its base
- * letter.
+ * Format according to the '6 bit' .img format.
+ * Any letter with a diacritic or accent is replaced with its base letter.
+ * Characters from other alphabets are transliterated if resources/chars/ascii/ data exists.
+ * Unless set/forceUpperCase, the text is upper-cased by the transliterator.
+ * NB lower-case is supported but each lower-case letter needs 12 bits, so, with typical OSM data,
+ * almost any other code-page will be more compact
  *
  * For example Körnerstraße would become KORNERSTRASSE,
  * Řípovská would become RIPOVSKA etc.
@@ -35,22 +38,33 @@
  */
 public class Format6Encoder extends BaseEncoder implements CharacterEncoder {
 
-	// This is 0x1b is the source document, but the accompanying code uses
-	// the value 0x1c, which seems to work.
-	private static final int SYMBOL_SHIFT = 0x1c;
+	// Following are swapped in the above John Mechalas document, but this is what works:
+	private static final int LOWERCASE_SHIFT = 0x1b;
+	private static final int SYMBOL_SHIFT    = 0x1c;
 
 	public static final String LETTERS =
 		" ABCDEFGHIJKLMNO" +	// 0x00-0x0F
-		"PQRSTUVWXYZxx   " +	// 0x10-0x1F
-		"0123456789\u0001\u0002\u0003\u0004\u0005\u0006";	// 0x20-0x2F
+		"PQRSTUVWXYZxx\u001d\u001e\u001f" +	// 0x10-0x1F  xx are above SHIFTs. prefix/suffix indicators
+		"0123456789\u0001\u0002\u0003\u0004\u0005\u0006";	// 0x20-0x2F  digits + shields
 
 	public static final String SYMBOLS =
 		"@!\"#$%&'()*+,-./" +	// 0x00-0x0F
-		"xxxxxxxxxx:;<=>?" +	// 0x10-0x1F
-		"xxxxxxxxxxx[\\]^_";	// 0x20-0x2F
+		"          :;<=>?" +	// 0x10-0x1F
+		"°          [\\]^_";	// 0x20-0x2F
+	//   ^ looks like degree (\u00b0) on MapSource/eTrex. Won't happen as transliterated to "deg"
+	//   0123456789abcdef
+	public static final String LOWERCASE =
+		"`abcdefghijklmno" +	// 0x00-0x0F  back-tick
+		"pqrstuvwxyz{|}~ " +	// 0x10-0x1F
+		"           \u001b\u001c   ";     // 0x20-0x2F  more prefix/suffix indicators
 
-	private final Transliterator transliterator = new TableTransliterator("ascii");
+	private final Transliterator transliterator;
 
+	public Format6Encoder(/*String cs, Transliterator transliterator*/) {
+		this.transliterator = new TableTransliterator("ascii");
+		this.transliterator.forceUppercase(true); // for test environment. LBLFile.java will set correctly
+	}
+
 	/**
 	 * Encode the text into the 6 bit format.  See the class level notes.
 	 *
@@ -62,7 +76,7 @@
 		if (text == null || text.isEmpty())
 			return NO_TEXT;
 		String normalisedText = Normalizer.normalize(text, Normalizer.Form.NFC);
-		String s = transliterator.transliterate(normalisedText).toUpperCase(Locale.ENGLISH);
+		String s = transliterator.transliterate(normalisedText);  // it does the upper if forceUpper
 
 		// Allocate more than enough space on average for the label.
 		// if you overdo it then it will waste a lot of space , but
@@ -78,8 +92,8 @@
 				put6(buf, off++, c - 'A' + 1);
 			} else if (c >= '0' && c <= '9') {
 				put6(buf, off++, c - '0' + 0x20);
-			} else if (c == 0x1b || c == 0x1c) {
-				put6(buf, off++, 0x1b);
+			} else if (c == 0x1b || c == 0x1c) {  // shiftedLowerCase() does same thing
+				put6(buf, off++, LOWERCASE_SHIFT);
 				put6(buf, off++, c + 0x10);
 			} else if (c >= 0x1d && c <= 0x1f) {
 				put6(buf, off++, c);
@@ -86,8 +100,14 @@
 			} else if (c >= 1 && c <= 6) {
 				// Highway shields
 				put6(buf, off++, 0x29 + c);
+			} else if (c >= 'a' && c <= 'z') {
+				put6(buf, off++, LOWERCASE_SHIFT);
+				put6(buf, off++, c - 'a' + 1);
 			} else {
+				int rememberOff = off;
 				off = shiftedSymbol(buf, off, c);
+				if (off == rememberOff)
+					off = shiftedLowerCase(buf, off, c);
 			}
 		}
 
@@ -119,6 +139,16 @@
 		return off;
 	}
 
+	private int shiftedLowerCase(byte[] buf, int startOffset, char c) {
+		int off = startOffset;
+		int ind = LOWERCASE.indexOf(c);
+		if (ind >= 0) {
+			put6(buf, off++, LOWERCASE_SHIFT);
+			put6(buf, off++, ind);
+		}
+		return off;
+	}
+
 	/**
 	 * Each character is packed into 6 bits.  This keeps track of everything so
 	 * that the character can be put into the right place in the byte array.
@@ -149,4 +179,9 @@
 
 		return buf;
 	}
+
+	public void setUpperCase(boolean upperCase) {
+		super.setUpperCase(upperCase);
+		transliterator.forceUppercase(upperCase);
+	}
 }

_______________________________________________
mkgmap-dev mailing list
mkgmap-dev@lists.mkgmap.org.uk
https://www.mkgmap.org.uk/mailman/listinfo/mkgmap-dev

Re: [mkgmap-dev] Format6Encoder/Decoder

Reply via email to