This character lies in the CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A block. I have added detection of the extension blocks as well; I am assuming (without knowing for certain) that none of the characters in those blocks are phonetic either.
import java.lang.Character.UnicodeBlock; import java.util.Arrays; import java.util.HashSet; import java.util.Set; import junit.framework.Assert; import org.junit.Test; public class DetectCJK { Set<UnicodeBlock> cjkUnicodeBlocks = new HashSet<UnicodeBlock>( Arrays.asList(new Character.UnicodeBlock[] { Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS, Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A, Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B // C and D were added in Java7 - I'm using Java6 //,Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C, //Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D, })); @Test public void test1() { Assert.assertEquals(Character.UnicodeBlock.BASIC_LATIN, Character.UnicodeBlock.of('a')); Assert.assertEquals(Character.UnicodeBlock.HEBREW, Character.UnicodeBlock.of('א')); assertCJK('電', "Traditional Chinese: Electricity"); assertCJK('电', "Simplified Chinese: Electricity"); assertCJK('電', "Simplified Chinese: Japanese"); assertCJK('㒨', "in CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A"); } private void assertCJK(Character character, String message) { UnicodeBlock unicodeBlock = Character.UnicodeBlock.of(character); Assert.assertTrue(message, cjkUnicodeBlocks.contains(unicodeBlock)); } } On Mon, Mar 11, 2013 at 12:10 AM, Trejkaz <trej...@trypticon.org> wrote: > On Sun, Mar 10, 2013 at 8:19 PM, Gili Nachum <gilinac...@gmail.com> wrote: > > Answering myself for next generations' sake. > > Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS does the job. > > How about 㒨? > > TX > > --------------------------------------------------------------------- > To unsubscribe, e-mail: java-user-unsubscr...@lucene.apache.org > For additional commands, e-mail: java-user-h...@lucene.apache.org > >