Re: [mkgmap-dev] Index with Arabic names

Steve Ratcliffe Mon, 06 Jun 2011 13:28:51 -0700

Hi

For example, the Arabic name of Damascus International Airport is:
مطار دمشق الدولي
but it is written in the file osmmap_mdr.img as:
أطار دأشق اءدئءح


Yes, you are right, thanks for reporting it.

When names are read out of the .img file, the code page is ignored.
I'm in the middle of making changes to the character set code so I've
attached a new version of the patch which includes a fix.

If you just want to try the fix separately, the relevant part of the patch is:


@@ -89,6 +100,7 @@
                        funcs.setEncoder(new Simple8Encoder());
                } else {
                        funcs.setEncodingType(ENCODING_FORMAT9);
+                       funcs.setDecoder(new AnyCharsetDecoder(charset));
                        funcs.setEncoder(new AnyCharsetEncoder(charset));
                        guessCodepage(funcs, charset);
                }

The patch is the second version of the previous translit_first patch
(which was for problems first noted in Greek names).

There is still the problem that the names will not be sorted quite correctly, although they should still be findable, as they will be sorted consistently.


..Steve

Index: src/uk/me/parabola/imgfmt/app/labelenc/LatinEncoder.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/LatinEncoder.java	(revision 1650)
+++ src/uk/me/parabola/imgfmt/app/labelenc/LatinEncoder.java	(revision )
@@ -20,7 +20,6 @@
  * @author Steve Ratcliffe
  */
 public class LatinEncoder extends BaseEncoder implements CharacterEncoder {
-	private final Transliterator trans = new TableTransliterator("latin1");
 	private final Charset latinCharset = Charset.forName("latin1");
 
 	public EncodedText encodeText(String t) {
@@ -32,8 +31,7 @@
 		// Need to add a null character at the end of the string for this format.
 		String zText = text + "\000";
 
-		String s = trans.transliterate(zText);
-		byte[] chars = s.getBytes(latinCharset);
+		byte[] chars = zText.getBytes(latinCharset);
 		return new EncodedText(chars, chars.length);
 	}
 }
Index: src/uk/me/parabola/imgfmt/app/labelenc/NullTransliterator.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/NullTransliterator.java	(revision )
+++ src/uk/me/parabola/imgfmt/app/labelenc/NullTransliterator.java	(revision )
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 or
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+package uk.me.parabola.imgfmt.app.labelenc;
+
+/**
+ * A transliterator that does not touch the string at all.
+ *
+ * @author Steve Ratcliffe
+ */
+public class NullTransliterator implements Transliterator {
+
+	/**
+	 * Return the string unchanged.
+	 * @return The original string, not a copy.
+	 */
+	public String transliterate(String s) {
+		return s;
+	}
+
+	/**
+	 * Do not ever upper case.
+	 * @param uc Ignored parameter.
+	 */
+	public void forceUppercase(boolean uc) {
+	}
+}
Index: src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java	(revision 1650)
+++ src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java	(revision )
@@ -29,6 +29,7 @@
 	private int encodingType;
 	private CharacterEncoder encoder;
 	private CharacterDecoder decoder;
+	private Transliterator transliterator = new TableTransliterator("ascii");
 
 	protected void setEncoder(CharacterEncoder encoder) {
 		this.encoder = encoder;
@@ -62,6 +63,14 @@
 		this.codepage = codepage;
 	}
 
+	public Transliterator getTransliterator() {
+		return transliterator;
+	}
+
+	public void setTransliterator(Transliterator transliterator) {
+		this.transliterator = transliterator;
+	}
+
 	/**
 	 * Create a CharacterEncoder for the given charset option.  Note that this
 	 * routine also writes to the lblHeader parameter to set the encoding type.
@@ -74,10 +83,12 @@
 		if ("ascii".equals(charset)) {
 			funcs.setEncodingType(ENCODING_FORMAT6);
 			funcs.setEncoder(new Format6Encoder());
+			funcs.setTransliterator(getDefaultTransliterator());
 			funcs.setDecoder(new Format6Decoder());
 		} else if ("latin1".equals(charset)) {
 			funcs.setEncodingType(ENCODING_FORMAT9);
 			funcs.setEncoder(new LatinEncoder());
+			funcs.setTransliterator(new TableTransliterator("latin1"));
 			funcs.setDecoder(new AnyCharsetDecoder("cp1252"));
 			funcs.setCodepage(1252);
 		} else if ("unicode".equals(charset)) {
@@ -89,6 +100,7 @@
 			funcs.setEncoder(new Simple8Encoder());
 		} else {
 			funcs.setEncodingType(ENCODING_FORMAT9);
+			funcs.setDecoder(new AnyCharsetDecoder(charset));
 			funcs.setEncoder(new AnyCharsetEncoder(charset));
 			guessCodepage(funcs, charset);
 		}
@@ -104,6 +116,7 @@
 	 */
 	private static void guessCodepage(CodeFunctions funcs, String charset) {
 		String cs = charset.toLowerCase();
+		Transliterator transliterator = new NullTransliterator();
 		if (cs.startsWith("cp")) {
 			try {
 				funcs.setCodepage(Integer.parseInt(charset.substring(2)));
@@ -118,7 +131,9 @@
 			}
 		} else if (cs.equals("latin1")) {
 			funcs.setCodepage(1252);
+			transliterator = new TableTransliterator("latin1");
 		}
+		funcs.setTransliterator(transliterator);
 	}
 
 	/**
@@ -152,4 +167,10 @@
 	public static CharacterDecoder getDefaultDecoder() {
 		return new Format6Decoder();
 	}
+
+	public static Transliterator getDefaultTransliterator() {
+		TableTransliterator ascii = new TableTransliterator("ascii");
+		ascii.forceUppercase(true);
+		return ascii;
-}
+	}
+}
Index: test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java
===================================================================
--- test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java	(revision 1519)
+++ test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java	(revision )
@@ -58,7 +58,8 @@
 		CodeFunctions functions = CodeFunctions.createEncoderForLBL(6);
 
 		CharacterEncoder encoder = functions.getEncoder();
-		EncodedText text = encoder.encodeText("KÃ¶rnerstraÃe, VelkomezeÅÃÄskÃ¡, SkÃ³lavÃ¶rÃ°ustigur");
+		Transliterator transliterator = functions.getTransliterator();
+		EncodedText text = encoder.encodeText(transliterator.transliterate("KÃ¶rnerstraÃe, VelkomezeÅÃÄskÃ¡, SkÃ³lavÃ¶rÃ°ustigur"));
 
 		CharacterDecoder decoder = functions.getDecoder();
 		byte[] ctext = text.getCtext();
@@ -79,7 +80,8 @@
 		CodeFunctions functions = CodeFunctions.createEncoderForLBL("latin1");
 
 		CharacterEncoder encoder = functions.getEncoder();
-		EncodedText text = encoder.encodeText("KÃ¶rnerstraÃe, VelkomezeÅÃÄskÃ¡, SkÃ³lavÃ¶rÃ°ustigur");
+		Transliterator transliterator = functions.getTransliterator();
+		EncodedText text = encoder.encodeText(transliterator.transliterate("KÃ¶rnerstraÃe, VelkomezeÅÃÄskÃ¡, SkÃ³lavÃ¶rÃ°ustigur"));
 
 		CharacterDecoder decoder = functions.getDecoder();
 		byte[] ctext = text.getCtext();
Index: src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java	(revision 1652)
+++ src/uk/me/parabola/imgfmt/app/labelenc/Format6Encoder.java	(revision )
@@ -48,8 +48,6 @@
 		"xxxxxxxxxx:;<=>?" +	// 0x10-0x1F
 		"xxxxxxxxxxx[\\]^_";	// 0x20-0x2F
 
-	private final Transliterator transliterator = new TableTransliterator("ascii");
-
 	/**
 	 * Encode the text into the 6 bit format.  See the class level notes.
 	 *
@@ -69,7 +67,7 @@
 		byte[] buf = new byte[2 * s.length() + 4];
 		int off = 0;
 
-		for (char c : transliterator.transliterate(s).toCharArray()) {
+		for (char c : s.toCharArray()) {
 
 			if (c == ' ') {
 				put6(buf, off++, 0);
Index: src/uk/me/parabola/imgfmt/app/labelenc/Transliterator.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/Transliterator.java	(revision 1519)
+++ src/uk/me/parabola/imgfmt/app/labelenc/Transliterator.java	(revision )
@@ -19,11 +19,18 @@
  */
 public interface Transliterator {
 	/**
-	 * Convert a string into a string that uses only ascii characters.
+	 * Convert a string into a string that uses only ascii or latin1 characters.
 	 *
-	 * @param s The original string.  It can use any unicode character.
+	 * @param s The original string.  It can use any unicode character. Can be null in which
+	 * case null will be returned.
 	 * @return A string that uses a restricted subset of characters (ascii or
 	 * latin) that is a transliterated form of the input string.
 	 */
 	public String transliterate(String s);
+
+	/**
+	 * Force the use of uppercase in this transliterator.
+	 * Note that it is normal to set this.
+	 */
+	public void forceUppercase(boolean uc);
 }
Index: src/uk/me/parabola/imgfmt/app/labelenc/TableTransliterator.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/TableTransliterator.java	(revision 1751)
+++ src/uk/me/parabola/imgfmt/app/labelenc/TableTransliterator.java	(revision )
@@ -18,6 +18,7 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.Arrays;
+import java.util.Locale;
 
 import uk.me.parabola.log.Logger;
 
@@ -35,6 +36,7 @@
 
 	private final String[][] rows = new String[256][];
 	private final boolean useLatin;
+	private boolean forceUppercase;
 
 	public TableTransliterator(String targetCharset) {
 		if (targetCharset.equals("latin1"))
@@ -46,11 +48,15 @@
 	/**
 	 * Convert a string into a string that uses only ascii characters.
 	 *
-	 * @param s The original string.  It can use any unicode character.
+	 * @param s The original string.  It can use any unicode character. Can be null in which case null will
+	 * be returned.
 	 * @return A string that uses only ascii characters that is a transcription or
 	 *         transliteration of the original string.
 	 */
 	public String transliterate(String s) {
+		if (s == null)
+			return null;
+
 		StringBuilder sb = new StringBuilder(s.length() + 5);
 		for (char c : s.toCharArray()) {
 			if (c <= (useLatin? 0xff: 0x7f)) {
@@ -65,9 +71,16 @@
 			}
 		}
 
-		return sb.toString();
+		String text = sb.toString();
+		if (forceUppercase)
+			text = text.toUpperCase(Locale.ENGLISH);
+		return text;
 	}
 
+	public void forceUppercase(boolean uc) {
+		forceUppercase = uc;
+	}
+
 	/**
 	 * Load one row of characters.  This means unicode characters that are of the
 	 * form U+RRXX where RR is the row.
@@ -127,9 +140,13 @@
 
 				// The first field must look like 'U+RRXX', we extract the XX part
 				int index = Integer.parseInt(upoint.substring(4), 16);
-				if (newRow[index].equals("?"))
+				if (newRow[index].equals("?")) {
+					if (forceUppercase)
+						newRow[index] = translation.toUpperCase(Locale.ENGLISH);
+					else
-					newRow[index] = translation;
-			}
+						newRow[index] = translation;
+				}
+			}
 		} catch (IOException e) {
 			log.error("Could not read character translation table");
 		}
Index: src/uk/me/parabola/imgfmt/app/lbl/LBLFile.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/lbl/LBLFile.java	(revision 1870)
+++ src/uk/me/parabola/imgfmt/app/lbl/LBLFile.java	(revision )
@@ -17,7 +17,6 @@
 package uk.me.parabola.imgfmt.app.lbl;
 
 import java.util.HashMap;
-import java.util.Locale;
 import java.util.Map;
 
 import uk.me.parabola.imgfmt.Utils;
@@ -29,7 +28,7 @@
 import uk.me.parabola.imgfmt.app.labelenc.BaseEncoder;
 import uk.me.parabola.imgfmt.app.labelenc.CharacterEncoder;
 import uk.me.parabola.imgfmt.app.labelenc.CodeFunctions;
-import uk.me.parabola.imgfmt.app.labelenc.Format6Encoder;
+import uk.me.parabola.imgfmt.app.labelenc.Transliterator;
 import uk.me.parabola.imgfmt.app.srt.Sort;
 import uk.me.parabola.imgfmt.app.trergn.Subdivision;
 import uk.me.parabola.imgfmt.fs.ImgChannel;
@@ -49,6 +48,7 @@
 	private static final Logger log = Logger.getLogger(LBLFile.class);
 
 	private CharacterEncoder textEncoder = CodeFunctions.getDefaultEncoder();
+	private Transliterator transliterator = CodeFunctions.getDefaultTransliterator();
 
 	private final Map<String, Label> labelCache = new HashMap<String, Label>();
 
@@ -101,27 +101,24 @@
 		
 		lblHeader.setEncodingType(cfuncs.getEncodingType());
 		textEncoder = cfuncs.getEncoder();
+		transliterator = cfuncs.getTransliterator();
 		if (forceUpper && textEncoder instanceof BaseEncoder) {
 			BaseEncoder baseEncoder = (BaseEncoder) textEncoder;
 			baseEncoder.setUpperCase(true);
 		}
+		if (forceUpper)
+			transliterator.forceUppercase(true);
 	}
 	
 	/**
 	 * Add a new label with the given text.  Labels are shared, so that identical
 	 * text is always represented by the same label.
 	 *
-	 * @param text The text of the label, it will be in uppercase.
+	 * @param inText The text of the label, it will be in uppercase.
 	 * @return A reference to the created label.
 	 */
-	public Label newLabel(String text) {
-		// if required, fold case now so that labelCache doesn't
-		// contain multiple labels that only differ in letter case
-		if(text != null &&
-		   (textEncoder instanceof Format6Encoder ||
-			textEncoder instanceof BaseEncoder &&
-			((BaseEncoder)textEncoder).isUpperCase()))
-			text = text.toUpperCase(Locale.ENGLISH);
+	public Label newLabel(String inText) {
+		String text = transliterator.transliterate(inText);
 		Label l = labelCache.get(text);
 		if (l == null) {
 			l = new Label(text);

_______________________________________________
mkgmap-dev mailing list
[email protected]
http://www.mkgmap.org.uk/mailman/listinfo/mkgmap-dev

Re: [mkgmap-dev] Index with Arabic names

Reply via email to