Re: [mkgmap-dev] Index with Arabic names

Steve Ratcliffe Sat, 18 Jun 2011 03:04:17 -0700

Hi Hosam

I have now made a lot of progress and have something that works as far as I can tell.


I can now hit space, select one of the displayed names and it is found.

There are two patches attached.

The first is the main problem I think. Previously there were two
equal and opposite errors when reading the labels out of the .img
files and then writing to the index. This was deliberate and works
as far as getting the correct characters. However it meant that the
labels were completely mis-sorted and sorting is essential for
searching.

The second patch includes a sort table for cp1256 and implements
the character expansion feature in the SRT compiler. This is where
a character like æ can be made to sort as if it were the two separate
characters "ae". Previously the feature was hardwired in, as I didn't
know how it worked, and it may therefore have been causing problems.

..Steve

Index: src/uk/me/parabola/imgfmt/app/lbl/LBLHeader.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/lbl/LBLHeader.java	(revision 1870)
+++ src/uk/me/parabola/imgfmt/app/lbl/LBLHeader.java	(revision )
@@ -16,6 +16,8 @@
  */
 package uk.me.parabola.imgfmt.app.lbl;
 
+import java.io.UnsupportedEncodingException;
+
 import uk.me.parabola.imgfmt.app.CommonHeader;
 import uk.me.parabola.imgfmt.app.ImgFileReader;
 import uk.me.parabola.imgfmt.app.ImgFileWriter;
@@ -74,6 +76,27 @@
 		// Read the places part of the header.
 		placeHeader.readFileHeader(reader);
 
+		int codepage = reader.getChar();
+		int id1 = reader.getChar();
+		int id2 = reader.getChar();
+		int descOff = reader.getInt();
+		int descLen = reader.getInt();
+
+		reader.position(descOff);
+		byte[] bytes = reader.get(descLen);
+		String description;
+		try {
+			description = new String(bytes, "ascii");
+		} catch (UnsupportedEncodingException e) {
+			description = "Unknown";
+		}
+
+		sort = new Sort();
+		sort.setCodepage(codepage);
+		sort.setId1(id1);
+		sort.setId2(id2);
+		sort.setDescription(description);
+		
 		// more to do but not needed yet...  Just set position
 		reader.position(labelStart);
 	}
@@ -97,9 +120,14 @@
 		writer.putChar((char) getCodePage());
 
 		// Identifying the sort
-		writer.putChar((char) sort.getId1());
-		writer.putChar((char) (sort.getId2() | 0x8000));
+		char id1 = (char) sort.getId1();
+		writer.putChar(id1);
-
+		
+		char id2 = (char) sort.getId2();
+		if (id1 != 0 && id2 != 0)
+			id2 |= 0x8000;
+		writer.putChar(id2);
+
 		writer.putInt(HEADER_LEN);
 		writer.putInt(sortDescriptionLength);
 
Index: src/uk/me/parabola/imgfmt/app/lbl/LBLFileReader.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/lbl/LBLFileReader.java	(revision 1873)
+++ src/uk/me/parabola/imgfmt/app/lbl/LBLFileReader.java	(revision )
@@ -61,7 +61,7 @@
 		header.readHeader(getReader());
 		int offsetMultiplier = header.getOffsetMultiplier();
 		CodeFunctions funcs = CodeFunctions.createEncoderForLBL(
-				header.getEncodingType());
+				header.getEncodingType(), header.getCodePage());
 		textDecoder = funcs.getDecoder();
 
 		readLables(offsetMultiplier);
Index: src/uk/me/parabola/imgfmt/app/mdr/MDRFile.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/mdr/MDRFile.java	(revision 1966)
+++ src/uk/me/parabola/imgfmt/app/mdr/MDRFile.java	(revision )
@@ -20,6 +20,7 @@
 import uk.me.parabola.imgfmt.app.lbl.Country;
 import uk.me.parabola.imgfmt.app.lbl.Region;
 import uk.me.parabola.imgfmt.app.lbl.Zip;
+import uk.me.parabola.imgfmt.app.mdr.MdrSection.PointerSizes;
 import uk.me.parabola.imgfmt.app.net.RoadDef;
 import uk.me.parabola.imgfmt.app.srt.Sort;
 import uk.me.parabola.imgfmt.app.trergn.Point;
@@ -62,7 +63,7 @@
 	private int currentMap;
 
 	private final MdrSection[] sections;
-	private MdrSection.PointerSizes sizes;
+	private PointerSizes sizes;
 
 	public MDRFile(ImgChannel chan, MdrConfig config) {
 		Sort sort = config.getSort();
Index: src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java	(revision 1968)
+++ src/uk/me/parabola/imgfmt/app/labelenc/CodeFunctions.java	(revision )
@@ -140,16 +140,23 @@
 	 * Sets encoding functions for a given format and code page.  This is used
 	 * when reading from an existing file.
 	 *
+	 *
 	 * @param format The format from the lbl header.
+	 * @param codePage The codepage found in the header.
 	 * @return The various character set parameters that will be needed.
 	 */
-	public static CodeFunctions createEncoderForLBL(int format) {
+	public static CodeFunctions createEncoderForLBL(int format, int codePage) {
 		CodeFunctions funcs = new CodeFunctions();
 
 		if (format == ENCODING_FORMAT6) {
 			funcs.setEncodingType(ENCODING_FORMAT6);
 			funcs.setEncoder(new Format6Encoder());
 			funcs.setDecoder(new Format6Decoder());
+		} else if (format == ENCODING_FORMAT9) {
+			funcs.setEncodingType(ENCODING_FORMAT9);
+			String cpName = "cp" + codePage;
+			funcs.setEncoder(new AnyCharsetEncoder(cpName));
+			funcs.setDecoder(new AnyCharsetDecoder(cpName));
 		} else {
 			// TODO TEMP...
 			funcs.setEncodingType(ENCODING_FORMAT9);
Index: src/uk/me/parabola/imgfmt/app/mdr/Mdr15.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/mdr/Mdr15.java	(revision 1870)
+++ src/uk/me/parabola/imgfmt/app/mdr/Mdr15.java	(revision )
@@ -14,6 +14,7 @@
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -31,10 +32,13 @@
 	private final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
 
 	private final Map<String, Integer> strings = new HashMap<String, Integer>();
+	private final Charset charset;
 
 	public Mdr15(MdrConfig config) {
 		setConfig(config);
 
+		charset = config.getSort().getCharset();
+
 		// reserve the string at offset 0 to be the empty string.
 		buffer.write(0);
 	}
@@ -55,7 +59,7 @@
 
 		int off = buffer.size();
 		try {
-			buffer.write(str.getBytes("latin1"));
+			buffer.write(str.getBytes(charset));
 			buffer.write(0);
 		} catch (IOException e) {
 			// Can't convert, return empty string instead.
Index: test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java
===================================================================
--- test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java	(revision 1968)
+++ test/uk/me/parabola/imgfmt/app/labelenc/CodeFunctionsTest.java	(revision )
@@ -24,7 +24,7 @@
 	 */
 	@Test
 	public void testFormat6() {
-		CodeFunctions functions = CodeFunctions.createEncoderForLBL(6);
+		CodeFunctions functions = CodeFunctions.createEncoderForLBL(6, 0);
 		assertEquals("code page", 0, functions.getCodepage());
 		assertEquals("encoding type", 6, functions.getEncodingType());
 		CharacterEncoder enc = functions.getEncoder();
@@ -55,7 +55,7 @@
 	 */
 	@Test
 	public void testTransliterate6() {
-		CodeFunctions functions = CodeFunctions.createEncoderForLBL(6);
+		CodeFunctions functions = CodeFunctions.createEncoderForLBL(6, 0);
 
 		CharacterEncoder encoder = functions.getEncoder();
 		Transliterator transliterator = functions.getTransliterator();

Index: test/uk/me/parabola/mkgmap/srt/SrtTextReaderTest.java
===================================================================
--- test/uk/me/parabola/mkgmap/srt/SrtTextReaderTest.java	(revision 1870)
+++ test/uk/me/parabola/mkgmap/srt/SrtTextReaderTest.java	(revision )
@@ -17,7 +17,6 @@
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
-import java.util.List;
 
 import uk.me.parabola.imgfmt.app.srt.Sort;
 
@@ -119,14 +118,6 @@
 		assertEquals(3, major(sortcodes['c']));
 	}
 
-	@Test
-	public void testTab2() throws Exception {
-		Sort sort = getSort("tab2 12ab");
-		List<Character> tab2 = sort.getTab2();
-		assertEquals(1, tab2.size());
-		assertEquals((char) 0x12ab, (char) tab2.get(0));
-	}
-
 	private char[] getSortcodes(String text) throws IOException {
 		Sort sort = getSort(text);
 		return sort.getSortPositions();
Index: src/uk/me/parabola/mkgmap/srt/SrtTextReader.java
===================================================================
--- src/uk/me/parabola/mkgmap/srt/SrtTextReader.java	(revision 1873)
+++ src/uk/me/parabola/mkgmap/srt/SrtTextReader.java	(revision )
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010.
+ * Copyright (C) 2010, 2011.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 3 or
@@ -24,6 +24,8 @@
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CodingErrorAction;
+import java.util.ArrayList;
+import java.util.List;
 
 import uk.me.parabola.imgfmt.app.srt.SRTFile;
 import uk.me.parabola.imgfmt.app.srt.Sort;
@@ -79,14 +81,10 @@
 	// States
 	private static final int IN_INITIAL = 0;
 	private static final int IN_CODE = 1;
-	private static final int IN_TAB2 = 2;
+	private static final int IN_EXPAND = 2;
 
 	private int codepage;
 
-	// Identification (or perhaps has more meaning).
-	private int id1;
-	private int id2;
-
 	// Data that is read in, the output of the reading operation
 	private final Sort sort = new Sort();
 
@@ -140,8 +138,8 @@
 			case IN_CODE:
 				codeState(scanner, tok);
 				break;
-			case IN_TAB2:
-				tab2State(scanner, tok);
+			case IN_EXPAND:
+				expandState(scanner, tok);
 				break;
 			}
 		}
@@ -163,6 +161,7 @@
 				Charset charset = Charset.forName("cp" + codepage);
 				encoder = charset.newEncoder();
 				decoder = charset.newDecoder();
+				decoder.onMalformedInput(CodingErrorAction.REPORT);
 			} else if (val.equals("description")) {
 				sort.setDescription(scanner.nextWord());
 			} else if (val.equals("id1")) {
@@ -174,10 +173,11 @@
 					throw new SyntaxException(scanner, "Missing codepage declaration before code");
 				state = IN_CODE;
 				scanner.skipSpace();
-			} else if (val.equals("tab2")) {
-				if (codepage == 0)
-					throw new SyntaxException(scanner, "Missing codepage declaration before code");
-				state = IN_TAB2;
+			} else if (val.equals("expand")) {
+				state = IN_EXPAND;
+				scanner.skipSpace();
+			} else {
+				throw new SyntaxException(scanner, "Unrecognised command " + val);
 			}
 		}
 	}
@@ -212,20 +212,8 @@
 			} else if (val.equals("pos3")) {
 				scanner.validateNext("=");
 				pos3 = Integer.decode(scanner.nextWord());
-			} else if (val.length() == 1) {
+			} else if (val.length() == 1 || val.length() == 2) {
 				addCharacter(scanner, val);
-			} else if (val.length() == 2) {
-				byte bval = (byte) Integer.parseInt(val, 16);
-				ByteBuffer bin = ByteBuffer.allocate(1);
-				bin.put(bval);
-				bin.flip();
-				try {
-					decoder.onMalformedInput(CodingErrorAction.REPORT);
-					CharBuffer out = decoder.decode(bin);
-					setSortcode(bval, out.get());
-				} catch (CharacterCodingException e) {
-					throw new SyntaxException(scanner, "Character not valid in codepage " + codepage);
-				}
 			} else {
 				throw new SyntaxException(scanner, "Unexpected word " + val);
 			}
@@ -242,47 +230,52 @@
 			}
 
 		} else if (type == TokType.EOL) {
-			state = 0;
+			state = IN_INITIAL;
 			advancePos();
 		}
 	}
 
 	/**
-	 * Unknown section. Two byte records.
-	 * You usually need to be able to say things like o-umlaut sorts as if it were o followed by e
-	 * so perhaps this section is used for that.
+	 * Within an 'expand' command. The whole command is read before
+	 * return, they can not span lines.
+	 * @param tok The first token after the keyword.
 	 */
-	private void tab2State(TokenScanner scanner, Token tok) {
-		TokType type = tok.getType();
-		if (type == TokType.TEXT) {
+	private void expandState(TokenScanner scanner, Token tok) {
-			String val = tok.getValue();
+		String val = tok.getValue();
-			char tab2 = (char) Integer.parseInt(val, 16);
-			sort.add(tab2);
-			scanner.skipLine();
+
+		Code code = new Code(scanner, val).invoke();
+
+		String s = scanner.nextValue();
+		if (!s.equals("to"))
+			throw new SyntaxException(scanner, "Expected the word 'to' in expand command");
+
+		List<Byte> expansionList = new ArrayList<Byte>();
+		while (!scanner.isEndOfFile()) {
+			Token t = scanner.nextRawToken();
+			if (t.isEol())
+				break;
+			if (t.isWhiteSpace())
+				continue;
+			
+			Code r = new Code(scanner, t.getValue()).invoke();
+			expansionList.add(r.getBval());
+		}
+
+		sort.addExpansion(code.getBval(), charFlags(code.getCval()), expansionList);
-			state = IN_INITIAL;
+		state = IN_INITIAL;
-		} else if (type == TokType.EOL) {
-			state = IN_INITIAL;
-		}
+	}
-	}
 
 	/**
 	 * Add a character to the sort table.
 	 * @param scanner Input scanner, for line number information.
-	 * @param val A single character string containing the character to be added.
+	 * @param val A single character string containing the character to be added. This will
+	 * be either a single character which is the unicode representation of the character, or
+	 * two characters which is the hex representation of the code point in the target codepage.
 	 */
 	private void addCharacter(TokenScanner scanner, String val) {
-		CharBuffer cbuf = CharBuffer.wrap(val.toCharArray());
-		try {
-			ByteBuffer out = encoder.encode(cbuf);
-			if (out.remaining() > 1)
-				throw new SyntaxException(scanner, "more than one character resulter from conversion of " + val);
-			byte b = out.get();
-			char cval = val.charAt(0);
-			setSortcode(b, cval);
-		} catch (CharacterCodingException e) {
-			throw new SyntaxException(scanner, "Invalid character in the target charset " + val);
+		Code code = new Code(scanner, val).invoke();
+		setSortcode(code.getBval(), code.getCval());
-		}
+	}
-	}
 
 	/**
 	 * Set the sort code for the given 8-bit character.
@@ -290,23 +283,30 @@
 	 * @param cval The same character in unicode.
 	 */
 	private void setSortcode(byte b, char cval) {
-		int flags = 0;
-		if (Character.isLetter(cval) && (Character.getType(cval) & Character.MODIFIER_LETTER) == 0)
-			flags = 1;
-		if (Character.isDigit(cval))
-			flags = 2;
+		int flags = charFlags(cval);
 		if (cflags.contains("0"))
 			flags = 0;
-		if (cflags.contains("g"))
-			flags |= 0x10;
-		if (cflags.contains("w"))
-			flags |= 0x20;
 
 		sort.add(b, pos1, pos2, pos3, flags);
 		this.cflags = "";
 	}
 
 	/**
+	 * The flags that describe the kind of character. Known ones
+	 * are letter and digit. There may be others.
+	 * @param cval The actual character (unicode).
+	 * @return The flags that apply to it.
+	 */
+	private int charFlags(char cval) {
+		int flags = 0;
+		if (Character.isLetter(cval) && (Character.getType(cval) & Character.MODIFIER_LETTER) == 0)
+			flags = 1;
+		if (Character.isDigit(cval))
+			flags = 2;
+		return flags;
+	}
+
+	/**
 	 * Reset the position fields to their initial values.
 	 */
 	private void resetPos() {
@@ -352,4 +352,61 @@
 		sf.close();
 		chan.close();
 	}
+
+	/**
+	 * Helper to represent a code read from the file.
+	 *
+	 * You can write it in unicode, or as a two digit hex number.
+	 * We work out what you wrote, and return both the code point in
+	 * the codepage and the unicode character form of the letter.
+	 */
+	private class Code {
+		private final TokenScanner scanner;
+		private final String val;
+		private byte bval;
+		private char cval;
+
+		public Code(TokenScanner scanner, String val) {
+			this.scanner = scanner;
+			this.val = val;
-}
+		}
+
+		public byte getBval() {
+			return bval;
+		}
+
+		public char getCval() {
+			return cval;
+		}
+
+		public Code invoke() {
+			try {
+				if (val.length() == 1) {
+					CharBuffer cbuf = CharBuffer.wrap(val.toCharArray());
+					ByteBuffer out = encoder.encode(cbuf);
+					if (out.remaining() > 1)
+						throw new SyntaxException(scanner, "more than one character resulted from conversion of " + val);
+
+					bval = out.get();
+					cval = val.charAt(0);
+				} else {
+					bval = (byte) Integer.parseInt(val, 16);
+					ByteBuffer bin = ByteBuffer.allocate(1);
+					bin.put(bval);
+					bin.flip();
+					CharBuffer out = decoder.decode(bin);
+					cval = out.get();
+				}
+			} catch (CharacterCodingException e) {
+				throw new SyntaxException(scanner, "Not a valid character (" + val + ") in codepage");
+			} catch (NumberFormatException e) {
+				throw new SyntaxException(scanner, "Not a valid hex number " + val);
+			}
+			return this;
+		}
+
+		public String toString() {
+			return String.format("%02x: %c (0x%x)", bval, cval, (int) cval);
+		}
+	}
+}
Index: test/uk/me/parabola/imgfmt/app/srt/SortTest.java
===================================================================
--- test/uk/me/parabola/imgfmt/app/srt/SortTest.java	(revision 1926)
+++ test/uk/me/parabola/imgfmt/app/srt/SortTest.java	(revision )
@@ -26,6 +26,7 @@
 
 public class SortTest {
 	private Sort sort;
+	private Collator collator;
 
 	@Before
 	public void setUp() throws Exception {
@@ -34,6 +35,8 @@
 				"code a, A; Ã¢, Ã < b, B;\n");
 		SrtTextReader srr = new SrtTextReader(r);
 		sort = srr.getSort();
+		collator = sort.getCollator();
+		collator.setStrength(Collator.TERTIARY);
 	}
 
 	@Test
@@ -180,5 +183,6 @@
 		SortKey<Object> k2 = sort.createSortKey(null, s1);
 
 		assertEquals(1, k2.compareTo(k1));
+		assertEquals(-1, collator.compare(s, s1));
 	}
 }
Index: src/uk/me/parabola/imgfmt/app/srt/Sort.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/srt/Sort.java	(revision 1926)
+++ src/uk/me/parabola/imgfmt/app/srt/Sort.java	(revision )
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010.
+ * Copyright (C) 2010, 2011.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 3 or
@@ -22,6 +22,7 @@
 import java.text.CollationKey;
 import java.text.Collator;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 
 import uk.me.parabola.imgfmt.ExitException;
@@ -45,20 +46,22 @@
 	private final byte[] secondary = new byte[256];
 	private final byte[] tertiary = new byte[256];
 	private final byte[] flags = new byte[256];
-	private final List<Character> tab2 = new ArrayList<Character>();
+
+	private final List<CodePosition> expansions = new ArrayList<CodePosition>();
+	private int maxExpSize = 1;
+
 	private CharsetEncoder encoder;
 
 	public void add(int ch, int primary, int secondary, int tertiary, int flags) {
+		if (this.primary[ch & 0xff] != 0)
+			throw new ExitException(String.format("Repeated primary index 0x%x", ch & 0xff));
 		this.primary[ch & 0xff] = (byte) primary;
-		this.secondary[ch & 0xff] = flags > 0xf? 0: (byte) secondary;
-		this.tertiary[ch & 0xff] = flags > 0xf? 0: (byte) tertiary;
+		this.secondary[ch & 0xff] = (byte) secondary;
+		this.tertiary[ch & 0xff] = (byte) tertiary;
+
 		this.flags[ch & 0xff] = (byte) flags;
 	}
 
-	public void add(char tab2) {
-		this.tab2.add(tab2);
-	}
-
 	/**
 	 * Return a table indexed by a character value in the target codepage, that gives the complete sort
 	 * position of the character.
@@ -92,11 +95,11 @@
 		try {
 			ByteBuffer out = encoder.encode(inb);
 			byte[] bval = out.array();
-			byte[] key = new byte[bval.length * 3 + 3];
+			byte[] key = new byte[bval.length * 3 * maxExpSize + 3];
 
-			int start = fillKey(primary, bval, key, 0);
-			start = fillKey(secondary, bval, key, start);
-			fillKey(tertiary, bval, key, start);
+			int start = fillKey(Collator.PRIMARY, primary, bval, key, 0);
+			start = fillKey(Collator.SECONDARY, secondary, bval, key, start);
+			fillKey(Collator.TERTIARY, tertiary, bval, key, start);
 
 			return new SrtSortKey<T>(object, key, second);
 		} catch (CharacterCodingException e) {
@@ -113,17 +116,31 @@
 	 * @param start The index into the output key to start at.
 	 * @return The next position in the output key.
 	 */
-	private int fillKey(byte[] sortPositions, byte[] input, byte[] outKey, int start) {
+	private int fillKey(int type, byte[] sortPositions, byte[] input, byte[] outKey, int start) {
 		int index = start;
 		for (byte inb : input) {
 			int b = inb & 0xff;
 
+			int exp = (flags[b] >> 4) & 0x3;
+			if (exp == 0) {
-			// I am guessing that a sort position of 0 means that the character is ignorable at this
-			// strength. In other words it is as if it is not present in the string.  This appears to
-			// be true for shield symbols, but perhaps not for other kinds of control characters.
+				// I am guessing that a sort position of 0 means that the character is ignorable at this
+				// strength. In other words it is as if it is not present in the string.  This appears to
+				// be true for shield symbols, but perhaps not for other kinds of control characters.
-			if (sortPositions[b] != 0)
-				outKey[index++] = sortPositions[b];
+				byte pos = sortPositions[b];
+				if (pos != 0)
+					outKey[index++] = pos;
+			} else {
+				// now have to redirect to a list of input chars, get the list via the primary value always.
+				byte idx = primary[b];
+				//List<CodePosition> list = expansions.get(idx-1);
+
+				for (int i = idx - 1; i < idx + exp; i++) {
+					byte pos = expansions.get(i).getPosition(type);
+					if (pos != 0)
+						outKey[index++] = pos;
-		}
+				}
+			}
+		}
 
 		outKey[index++] = '\0';
 		return index;
@@ -149,10 +166,6 @@
 		return flags[ch];
 	}
 
-	public List<Character> getTab2() {
-		return tab2;
-	}
-
 	public int getCodepage() {
 		return codepage;
 	}
@@ -192,6 +205,47 @@
 		this.description = description;
 	}
 
+	/**
+	 * Add an expansion to the sort.
+	 * An expansion is a letter that sorts as if it were two separate letters.
+	 *
+	 * The case were two letters sort as if the were just one (and more complex cases) are
+	 * not supported or are unknown to us.
+	 *
+	 * @param bval The code point of this letter in the code page.
+	 * @param inFlags The initial flags, eg if it is a letter or not.
+	 * @param expansionList The letters that this letter sorts as, as code points in the codepage.
+	 */
+	public void addExpansion(byte bval, int inFlags, List<Byte> expansionList) {
+		int idx = bval & 0xff;
+		flags[idx] = (byte) ((inFlags & 0xf) | (((expansionList.size()-1) << 4) & 0x30));
+
+		// Check for repeated definitions
+		if (primary[idx] != 0)
+			throw new ExitException(String.format("repeated code point %x", idx));
+
+		primary[idx] = (byte) (expansions.size() + 1);
+		secondary[idx] = 0;
+		tertiary[idx] = 0;
+		maxExpSize = Math.max(maxExpSize, expansionList.size());
+
+		for (Byte b : expansionList) {
+			CodePosition cp = new CodePosition();
+			cp.setPrimary(primary[b & 0xff]);
+			cp.setSecondary(secondary[b & 0xff]);
+			cp.setTertiary((byte) (tertiary[b & 0xff] + 2));
+			expansions.add(cp);
+		}
+	}
+
+	/**
+	 * Get the expansion with the given index, one based.
+	 * @param val The one-based index number of the extension.
+	 */
+	public CodePosition getExpansion(int val) {
+		return expansions.get(val - 1);
+	}
+
 	public Collator getCollator() {
 		return new SrtCollator(codepage);
 	}
@@ -219,6 +273,16 @@
 		return sort;
 	}
 
+	public int getExpansionSize() {
+		return expansions.size();
+	}
+
+	/**
+	 * A collator that works with this sort. This should be used if you just need to compare two
+	 * strings against each other once.
+	 *
+	 * The sort key is better when the comparison must be done several times as in a sort operation.
+	 */
 	private class SrtCollator extends Collator {
 		private final int codepage;
 
@@ -239,12 +303,12 @@
 			}
 
 			int strength = getStrength();
-			int res = compareOneStrength(bytes1, bytes2, primary);
+			int res = compareOneStrength(bytes1, bytes2, primary, Collator.PRIMARY);
 
 			if (res == 0 && strength != PRIMARY) {
-				res = compareOneStrength(bytes1, bytes2, secondary);
+				res = compareOneStrength(bytes1, bytes2, secondary, Collator.SECONDARY);
 				if (res == 0 && strength != SECONDARY) {
-					res = compareOneStrength(bytes1, bytes2, tertiary);
+					res = compareOneStrength(bytes1, bytes2, tertiary, Collator.TERTIARY);
 				}
 			}
 
@@ -261,20 +325,26 @@
 		 * Compare the bytes against primary, secondary or tertiary arrays.
 		 * @param bytes1 Bytes for the first string in the codepage encoding.
 		 * @param bytes2 Bytes for the second string in the codepage encoding.
-		 * @param type The strength array to use in the comparison.
+		 * @param typePositions The strength array to use in the comparison.
 		 * @return Comparison result -1, 0 or 1.
 		 */
-		private int compareOneStrength(byte[] bytes1, byte[] bytes2, byte[] type) {
+		@SuppressWarnings({"AssignmentToForLoopParameter"})
+		private int compareOneStrength(byte[] bytes1, byte[] bytes2, byte[] typePositions, int type) {
 			int res = 0;
-			int length = Math.min(bytes1.length, bytes2.length);
-			for (int i = 0; i < length; i++) {
 
-				byte p1 = type[bytes1[i] & 0xff];
-				byte p2 = type[bytes2[i] & 0xff];
+			PositionIterator it1 = new PositionIterator(bytes1, typePositions, type);
+			PositionIterator it2 = new PositionIterator(bytes2, typePositions, type);
+
+			while (it1.hasNext() && it2.hasNext()) {
+				int p1 = it1.next();
+				int p2 = it2.next();
+				
 				if (p1 < p2) {
 					res = -1;
+					break;
 				} else if (p1 > p2) {
 					res = 1;
+					break;
 				}
 			}
 			return res;
@@ -297,5 +367,62 @@
 		public int hashCode() {
 			return codepage;
 		}
+
+		class PositionIterator implements Iterator<Integer> {
+			private final byte[] bytes;
+			private final byte[] sortPositions;
+			private final int len;
+			private final int type;
+
+			private int pos;
+
+			private int expStart;
+			private int expEnd;
+			private int expPos;
+
+			PositionIterator(byte[] bytes, byte[] sortPositions, int type) {
+				this.bytes = bytes;
+				this.sortPositions = sortPositions;
+				this.len = bytes.length;
+				this.type = type;
-	}
+			}
+
+			public boolean hasNext() {
+				return pos < len || expPos != 0;
-}
+			}
+
+			public Integer next() {
+				int next;
+				if (expPos == 0) {
+					int in = pos++ & 0xff;
+					byte b = bytes[in];
+					int n = (flags[b & 0xff] >> 4) & 0x3;
+					if (n > 0) {
+						expStart = primary[b & 0xff] - 1;
+						expEnd = expStart + n;
+						expPos = expStart;
+						next = expansions.get(expPos).getPosition(type);
+
+						if (++expPos > expEnd)
+							expPos = 0;
+
+					} else {
+						for (next = sortPositions[bytes[in] & 0xff]; next == 0 && pos < len; ) {
+							next = sortPositions[bytes[pos++ & 0xff]];
+						}
+					}
+				} else {
+					next = expansions.get(expPos).getPosition(type);
+					if (++expPos > expEnd)
+						expPos = 0;
+
+				}
+				return next;
+			}
+
+			public void remove() {
+				throw new UnsupportedOperationException("remove not supported");
+			}
+		}
+	}
+}
Index: resources/sort/cp1252.txt
===================================================================
--- resources/sort/cp1252.txt	(revision 1870)
+++ resources/sort/cp1252.txt	(revision )
@@ -36,17 +36,12 @@
 description "Western European Sort"
 
 code pos2=0 pos3=8 01, 02, 03, 04, 05, 06, 07
-code flags=w 85
-code flags=g 8c
-code flags=g 9c
 code flags=w Â¼
 code 20,a0,1e,1f; _ ;b4;`;^;a8;98;b8;af
-code flags=w Â½; pos2=1 ad
+code pos2=1 ad
 code -
-code flags=w Â¾
 code 96
 code 97
-code flags=g Ã
 code 2c
 code 3b
 code :
@@ -117,7 +112,6 @@
 code 8
 code 9
 code a,A,,Âª; Ã¡,Ã; Ã ,Ã; Ã¢,Ã; Ã¥,Ã; Ã¤,Ã; Ã£,Ã
-code Ã¦, Ã
 code b,B
 code c,C; Ã§,Ã
 code d,D;;Ã°,Ã
@@ -147,21 +141,14 @@
 code Ã¾,Ã
 code flags=0 Âµ
 
-tab2 2126
-tab2 2126
-tab2 2126
-tab2 58b1
-tab2 519c
-tab2 38b1
-tab2 319c
-tab2 3186
-tab2 3166
-tab2 3189
-tab2 3186
-tab2 3166
-tab2 3187
-tab2 3188
-tab2 3166
-tab2 3189
-tab2 38bd
-tab2 31bd
+expand Ã¦ to a e
+expand Ã to A E
+
+expand Ã to s s
+expand 85 to . . .
+expand 9c to o e
+expand 8c to O E
+expand Â½ to 1 / 2
+expand Â¼ to 1 / 4
+expand Â¾ to 3 / 4
+
Index: src/uk/me/parabola/imgfmt/app/srt/SRTFile.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/srt/SRTFile.java	(revision 1870)
+++ src/uk/me/parabola/imgfmt/app/srt/SRTFile.java	(revision )
@@ -13,7 +13,6 @@
 package uk.me.parabola.imgfmt.app.srt;
 
 import java.nio.charset.Charset;
-import java.util.List;
 
 import uk.me.parabola.imgfmt.app.BufferedImgFileWriter;
 import uk.me.parabola.imgfmt.app.ImgFile;
@@ -59,7 +58,7 @@
 		SectionWriter subWriter = header.makeSectionWriter(writer);
 		subWriter.position(SRTHeader.HEADER3_LEN);
 		writeCharacterTable(subWriter);
-		writeTab2(subWriter);
+		writeExpansions(subWriter);
 		subWriter.close();
 
 		// Header 2 is just after the real header
@@ -83,17 +82,29 @@
 	private void writeCharacterTable(ImgFileWriter writer) {
 		for (int i = 1; i < 256; i++) {
 			writer.put(sort.getFlags(i));
-			writer.put(sort.getPrimary(i));
-			writer.put((byte) ((sort.getTertiary(i) << 4) | (sort.getSecondary(i) & 0xf)));
+			writeWeights(writer, i);
 		}
 		header.endCharTable(writer.position());
 	}
 
-	private void writeTab2(ImgFileWriter writer) {
-		List<Character> tab2 = sort.getTab2();
-		for (Character c : tab2) {
-			writer.putChar(c);
+	private void writeWeights(ImgFileWriter writer, int i) {
+		writer.put(sort.getPrimary(i));
+		writer.put((byte) ((sort.getTertiary(i) << 4) | (sort.getSecondary(i) & 0xf)));
-		}
+	}
+
+	/**
+	 * Write out the expansion table. This is referenced from the character table, when
+	 * the top nibble of the type is set via the primary position value.
+	 */
+	private void writeExpansions(ImgFileWriter writer) {
+
+		int size = sort.getExpansionSize();
+		for (int j = 1; j <= size; j++) {
+			CodePosition b = sort.getExpansion(j);
+			writer.put(b.getPrimary());
+			writer.put((byte) ((b.getTertiary() << 4) | (b.getSecondary() & 0xf)));
+		}
+
 		header.endTab2(writer.position());
 	}
 
Index: src/uk/me/parabola/mkgmap/scan/TokenScanner.java
===================================================================
--- src/uk/me/parabola/mkgmap/scan/TokenScanner.java	(revision 1870)
+++ src/uk/me/parabola/mkgmap/scan/TokenScanner.java	(revision )
@@ -78,8 +78,9 @@
 	}
 
 	/**
-	 * Get the value of the next token and consume the token.  You'd
+	 * Get the value of the next non-space token and consume the token.  You'd
 	 * probably only call this after having peeked the type earlier.
+	 * Any initial space is skipped.
 	 */
 	public String nextValue() {
 		skipSpace();
Index: resources/sort/cp1256.txt
===================================================================
--- resources/sort/cp1256.txt	(revision )
+++ resources/sort/cp1256.txt	(revision )
@@ -0,0 +1,115 @@
+
+codepage 1256
+id1 7
+id2 2
+description "Arabic"
+
+code pos2=0 pos3=8 01, 02, 03, 04, 05, 06, 07
+
+
+code 85
+code 20 < a0 < 09 < 0a < 0b < 0c < 0d
+code !
+code "
+code 23
+code $
+code %
+code &
+code (
+code )
+code *
+code ,
+code .
+code /
+code :
+code ;
+code ?
+code @
+code [
+code \
+code ]
+code ^ 88
+code `
+code {
+code |
+code }
+code ~
+code a6
+code a8
+code af
+code b4
+code b8
+code a1
+code ba
+code bf
+code 91
+code 92
+code 82
+code 93
+code 94
+code 84
+code 8b
+code 9b
+code _
+code <
+code =
+code >
+code Â±
+code ab
+code bb
+code d7
+code f7
+code a2
+code a3
+code a4
+code a5
+code a7
+code a9 < ac < ae < b0 < b5 < b6 < b7 < 86 < 87 < 95 < 85 < 89
+code 80
+code 0 < bc < bd < be 
+code 1,Â¹
+code 2,Â²
+code 3,Â³
+code 4
+code 5
+code 6
+code 7
+code 8
+code 9
+code a,A; e0; e2
+code b,B
+code c,C; e7
+code d,D
+code e,E; e9; e8; ea; eb
+code f,F; 83
+code g,G
+code h,H
+code i,I; ee; ef
+code j,J
+code k,K
+code l,L
+code m,M
+code n,N
+code o,O; f4
+code 9c,8c
+code p,P
+code q,Q
+code r,R
+code s,S
+code t,T
+code 99
+code u,U; f9; fb; fc
+code v,V
+code w,W
+code x,X
+code y,Y
+code z,Z
+code c1; c4; c6
+code c7; c2; c3; c5
+code c8 < 81
+code c9, ca
+code cb < cc < 8d < cd < ce < cf < d0 < d1 < d2 < 8e < d3 < d4
+code d5 < d6 < d8 < d9 < da < db < dd < de < df < 90 < e1 < e3
+code e4 < e5 < e6
+code ec; ed
+code 8a < 8f < 9a < 98 < 9f < aa < c0 < ff < f8
Index: src/uk/me/parabola/mkgmap/scan/Token.java
===================================================================
--- src/uk/me/parabola/mkgmap/scan/Token.java	(revision 738)
+++ src/uk/me/parabola/mkgmap/scan/Token.java	(revision )
@@ -46,5 +46,12 @@
 	public boolean isValue(String val) {
 		return val.equals(value);
 	}
+
+	/**
+	 * True if we are at the end of a line. End of file counts as the end of a line.
+	 */
+	public boolean isEol() {
+		return type == TokType.EOL || type == TokType.EOF;
-}
+	}
+}
 
Index: src/uk/me/parabola/imgfmt/app/srt/CodePosition.java
===================================================================
--- src/uk/me/parabola/imgfmt/app/srt/CodePosition.java	(revision )
+++ src/uk/me/parabola/imgfmt/app/srt/CodePosition.java	(revision )
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 or
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+package uk.me.parabola.imgfmt.app.srt;
+
+import java.text.Collator;
+
+/**
+ * Represents the collation positions of a given code point.
+ *
+ * @author Steve Ratcliffe
+ */
+class CodePosition {
+	private byte primary;
+	private byte secondary;
+	private byte tertiary;
+
+	public byte getPrimary() {
+		return primary;
+	}
+
+	public byte getSecondary() {
+		return secondary;
+	}
+
+	public byte getTertiary() {
+		return tertiary;
+	}
+
+	/**
+	 * Get the position with the given strength.
+	 *
+	 * @param type The strength, Collator.PRIMARY, SECONDARY etc.
+	 * @return The collation position at the given strength.
+	 */
+	public byte getPosition(int type) {
+		switch (type) {
+		case Collator.PRIMARY:
+			return primary;
+		case Collator.SECONDARY:
+			return secondary;
+		case Collator.TERTIARY:
+			return tertiary;
+		default:
+			return 0;
+		}
+	}
+
+	public void setPrimary(byte primary) {
+		this.primary = primary;
+	}
+
+	public void setSecondary(byte secondary) {
+		this.secondary = secondary;
+	}
+
+	public void setTertiary(byte tertiary) {
+		this.tertiary = tertiary;
+	}
+}
Index: test/uk/me/parabola/imgfmt/app/srt/SortExpandTest.java
===================================================================
--- test/uk/me/parabola/imgfmt/app/srt/SortExpandTest.java	(revision )
+++ test/uk/me/parabola/imgfmt/app/srt/SortExpandTest.java	(revision )
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2011.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 or
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+package uk.me.parabola.imgfmt.app.srt;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.text.Collator;
+
+import uk.me.parabola.mkgmap.srt.SrtTextReader;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+/**
+ * Tests for characters that are expanded into two or more sort
+ * positions.
+ */
+public class SortExpandTest {
+	private Sort sort;
+	private Collator collator;
+
+	@Before
+	public void setUp() throws Exception {
+		Reader r = new StringReader("codepage 1252\n" +
+				"code 01\n" +
+				"code a, A; â, Â < b, B\n" +
+				"code c < d < e <f < g < h < i < j < k < l < m < n < o\n" +
+				"code p < q < r,R < s,S < t,T < u < v < w < x < y < z\n" +
+				"expand ß to s s\n");
+		SrtTextReader srr = new SrtTextReader(r);
+		sort = srr.getSort();
+		collator = sort.getCollator();
+	}
+
+	@Test
+	public void testNormal() {
+		checkOrder("asßst", "astst");
+		checkOrder("asrst", "asßst");
+	}
+
+	/**
+	 * Expanded letters should sort just after what they expand to.
+	 */
+	@Test
+	public void testAgainstExpansion() {
+		checkOrder("asssst", "asßst");
+	}
+
+	@Test
+	public void testGreaterThanInExpansion() {
+		checkOrder("aßzaa", "astb");
+	}
+
+	@Test
+	public void testLessThanInExpansion() {
+		checkOrder("asrb", "aßaaa");
+	}
+
+	/**
+	 * Check and assert that the second string is greater than the first.
+	 * @param s First string.
+	 * @param s1 Second string.
+	 */
+	private void checkOrder(String s, String s1) {
+		SortKey<Object> k1 = sort.createSortKey(null, s);
+		SortKey<Object> k2 = sort.createSortKey(null, s1);
+
+		assertEquals(1, k2.compareTo(k1));
+		assertEquals(-1, k1.compareTo(k2));
+
+		assertEquals(-1, collator.compare(s, s1));
+		assertEquals(1, collator.compare(s1, s));
+	}
+}

_______________________________________________
mkgmap-dev mailing list
[email protected]
http://www.mkgmap.org.uk/mailman/listinfo/mkgmap-dev

Re: [mkgmap-dev] Index with Arabic names

Reply via email to