Is there a reason that Jena does not support the full range of XML name start characters?
See <https://www.w3.org/TR/xml/#NT-NameStartChar>.

I wrote a quick test and found that there were a number of characters that Jena does not support. Miscategorization appears to start at 0x132. There are 936990 miscategorized characters.

The issue is actually in the Xerces util class XMLChar. Is this because of the version of Xerces we are stuck with? Is there a way around this issue?

Claude

P.S. Since I can't attach a file, here is the test code I wrote.

import static org.junit.Assert.assertTrue;

import org.apache.xerces.util.XMLChar;
import org.junit.Test;

public class NameTest {
    /*
     * NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] |
     * [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] |
     * [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] |
     * [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
     */
    int[][] ranges = { { ':', ':' }, { 'A', 'Z' }, { '_', '_' }, { 0xC0, 0xD6 }, { 0xD8, 0xF6 }, { 0xF8, 0x2FF }, { 0x370, 0x37D }, { 0x37F, 0x1FFF }, { 0x200C, 0x200D }, { 0x2070, 0x218F }, { 0x2C00, 0x2FEF }, { 0x3001, 0xD7FF }, { 0xF900, 0xFDCF }, { 0xFDF0, 0xFFFD }, { 0x10000, 0xEFFFF } };

    @Test
    public void testNameStart() {
        for (int[] range : ranges) {
            for (int c = range[0]; c <= range[1]; c++) {
                assertTrue( String.format( "character %s 0x%s",c,Integer.toHexString( c )) , XMLChar.isNameStart( c ) );
            }
        }
    }

    @Test
    public void listNameStartErr() {
        int cnt = 0;
        for (int[] range : ranges) {
            for (int c = range[0]; c <= range[1]; c++) {
                if (!XMLChar.isNameStart( c )) {
                    System.out.print( String.format( "0x%s ",Integer.toHexString( c )) );
                    cnt++;
                    if (cnt % 25 == 0) {
                        System.out.println();
                    }
                }
            }
        }
        System.out.println();
        System.out.println( cnt+" characters miscategorized" );
    }
}

--
I like: Like Like - The likeliest place on the web <http://like-like.xenei.com>
LinkedIn: http://www.linkedin.com/in/claudewarren
