This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 388343ed5714d933aeecda62f41e548882445d55 Author: Andy Seaborne <[email protected]> AuthorDate: Wed Mar 26 10:26:21 2025 +0000 GH-3086: Use jena-langtag as implementation. LangTagX for jena policy. --- .../riot/process/normalize/NormalizeRDFTerms.java | 4 +- .../process/normalize/StreamCanonicalLangTag.java | 7 +- .../java/org/apache/jena/riot/system/Checker.java | 9 +- .../apache/jena/riot/system/ParserProfileStd.java | 2 +- .../java/org/apache/jena/riot/web/LangTag.java | 108 ++----- .../test/java/org/apache/jena/riot/TC_Riot.java | 1 + .../java/org/apache/jena/riot/lang/TS_Lang.java | 2 +- .../jena/riot/lang/TestLangJsonLD_DocLoader.java | 3 - .../lang/{TestLang.java => TestRDFLanguages.java} | 2 +- .../org/apache/jena/riot/system/TS_RiotSystem.java | 14 +- .../java/org/apache/jena/riot/web/TS_RiotWeb.java | 9 +- .../java/org/apache/jena/riot/web/TestLangTag.java | 120 -------- .../org/apache/jena/riot/web/TestWebContent.java | 21 +- .../java/org/apache/jena/graph/NodeFactory.java | 59 ++-- .../org/apache/jena/graph/langtag/LangTags.java | 144 +-------- .../java/org/apache/jena/langtagx/LangTagX.java | 119 ++++++++ .../apache/jena/graph/test/TestTypedLiterals.java | 4 +- .../java/org/apache/jena/langtagx/TS_LangTagX.java | 28 +- .../org/apache/jena/langtagx/TestLangTagX.java | 90 ++++++ .../org/apache/jena/test/TestPackage_core.java | 1 + .../ttl_test/test/turtle/TurtleInternalTests.java | 339 +++++++++++---------- .../main/java/org/apache/jena/langtag/LangTag.java | 15 + .../org/apache/jena/langtag/LangTagRFC5646.java | 43 ++- .../java/org/apache/jena/langtag/LangTags.java | 105 +++---- .../java/org/apache/jena/langtag/SysLangTag.java | 1 - .../jena/langtag/TestBasicSyntaxLangTags.java | 8 +- .../java/org/apache/jena/langtag/TestLangTag.java | 21 +- .../org/apache/jena/langtag/TestLangTagFormat.java | 1 - .../org/apache/jena/langtag/TestLangTagsOps.java | 12 +- .../apache/jena/tdb1/store/nodetable/NodecSSE.java | 5 +- 30 files changed, 595 insertions(+), 
702 deletions(-) diff --git a/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/NormalizeRDFTerms.java b/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/NormalizeRDFTerms.java index 12556cafb5..af004d1b00 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/NormalizeRDFTerms.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/NormalizeRDFTerms.java @@ -25,7 +25,7 @@ import org.apache.jena.datatypes.RDFDatatype ; import org.apache.jena.datatypes.xsd.XSDDatatype ; import org.apache.jena.graph.Node ; import org.apache.jena.graph.NodeFactory ; -import org.apache.jena.riot.web.LangTag ; +import org.apache.jena.langtagx.LangTagX; import org.apache.jena.sparql.util.NodeUtils ; import org.apache.jena.vocabulary.RDF ; @@ -189,7 +189,7 @@ public class NormalizeRDFTerms implements NormalizeTerm { */ private static Node canonicalLangtag(Node node) { String langTag = node.getLiteralLanguage(); - String langTag2 = LangTag.canonical(langTag); + String langTag2 = LangTagX.formatLanguageTag(langTag); if ( langTag2.equals(langTag) ) return node; //String textDir = n.getLiteralTextDirection(); diff --git a/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/StreamCanonicalLangTag.java b/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/StreamCanonicalLangTag.java index 820e603b4f..5edda18e2f 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/StreamCanonicalLangTag.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/process/normalize/StreamCanonicalLangTag.java @@ -25,7 +25,7 @@ import java.util.function.BiFunction; import org.apache.jena.graph.Node; import org.apache.jena.graph.NodeFactory; -import org.apache.jena.graph.langtag.LangTags; +import org.apache.jena.langtagx.LangTagX; import org.apache.jena.riot.process.StreamRDFApplyObject; import org.apache.jena.riot.system.StreamRDF; import org.apache.jena.sparql.util.NodeUtils; @@ -41,7 +41,8 @@ public class 
StreamCanonicalLangTag extends StreamRDFApplyObject { /** Return a {@link StreamRDF} that converts language tags to canonical form (RFC 4646, 5646). */ public static StreamRDF toCanonical(StreamRDF other) { - Locale.Builder locBuild = new Locale.Builder(); + // Only for LangTagJDK + Locale.Builder locBuild = null; //new Locale.Builder(); return new StreamCanonicalLangTag(other, locBuild, (b,n) -> canonical(locBuild, n, StreamCanonicalLangTag::langTagCanonical)); } @@ -65,7 +66,7 @@ public class StreamCanonicalLangTag extends StreamRDFApplyObject { } static String langTagCanonical(Locale.Builder locBuild, String str) { - return LangTags.basicFormat(str); + return LangTagX.formatLanguageTag(str); } static String langTagLC(Locale.Builder locBuild, String str) { diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/Checker.java b/jena-arq/src/main/java/org/apache/jena/riot/system/Checker.java index 125adda74a..85bd530fe2 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/system/Checker.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/system/Checker.java @@ -18,8 +18,6 @@ package org.apache.jena.riot.system; -import java.util.regex.Pattern; - import org.apache.jena.datatypes.RDFDatatype; import org.apache.jena.datatypes.xsd.XSDDatatype; import org.apache.jena.datatypes.xsd.impl.RDFLangString; @@ -27,6 +25,7 @@ import org.apache.jena.graph.Node; import org.apache.jena.graph.Triple; import org.apache.jena.iri.IRI; import org.apache.jena.irix.*; +import org.apache.jena.langtagx.LangTagX; import org.apache.jena.sparql.core.Quad; import org.apache.jena.util.SplitIRI; @@ -130,8 +129,6 @@ public class Checker { } catch (org.apache.jena.iri.IRIException0 | org.apache.jena.irix.IRIException ex) {} } - final static private Pattern langPattern = Pattern.compile("[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*"); - public static boolean checkLiteral(Node node) { return checkLiteral(node, nullErrorHandler, -1L, -1L); } @@ -165,8 +162,8 @@ public class Checker { // If the 
Literal has a language... if ( hasLang ) { - // Test language tag format -- not a perfect test... - if ( !langPattern.matcher(lang).matches() ) { + // Test language tag format + if ( !LangTagX.checkLanguageTag(lang) ) { errorHandler(errorHandler).warning("Language not valid: " + lang, line, col); return false; } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java index 87187663b7..5f93eb309f 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/system/ParserProfileStd.java @@ -212,7 +212,7 @@ public class ParserProfileStd implements ParserProfile { @Override public Node createLangLiteral(String lexical, String langTag, long line, long col) { if ( checking ) - Checker.checkLiteral(lexical, langTag, errorHandler, line, col); + Checker.checkLiteral(lexical, langTag, null, errorHandler, line, col); return factory.createLangLiteral(lexical, langTag); } diff --git a/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java b/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java index 25677a0bb1..f068faceea 100644 --- a/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java +++ b/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java @@ -19,11 +19,10 @@ package org.apache.jena.riot.web ; import java.util.Locale ; -import java.util.regex.Matcher ; -import java.util.regex.Pattern ; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -import org.apache.jena.atlas.lib.Chars ; -import org.apache.jena.riot.system.RiotChars ; +import org.apache.jena.langtagx.LangTagX; /** * Language tags: support for parsing and canonicalization of case. 
@@ -34,8 +33,9 @@ import org.apache.jena.riot.system.RiotChars ; * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC 4647</a></li> * <li>Language tags syntax (BCP 47): <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a></li> * </ul> + * @deprecated Use {@link LangTagX} */ - +@Deprecated(forRemoval = true) public class LangTag { // Valid language tag, not irregular, not grand-fathered. /** Index of the language part */ @@ -135,45 +135,12 @@ public class LangTag { /** * Validate - basic syntax check for a language tags: [a-zA-Z]+ ('-'[a-zA-Z0-9]+)* + * + * @deprecated Use {@link LangTagX#checkLanguageTag(String)} */ + @Deprecated(forRemoval = true) public static boolean check(String languageTag) { - int len = languageTag.length() ; - int idx = 0 ; - boolean first = true ; - while (idx < languageTag.length()) { - int idx2 = checkPart(languageTag, idx, first) ; - first = false ; - if ( idx2 == idx ) - // zero length part. - return false ; - idx = idx2 ; - if ( idx == len ) - return true ; - if ( languageTag.charAt(idx) != Chars.CH_DASH ) - return false ; - idx++ ; - if ( idx == len ) - // trailing DASH - return false ; - } - return true ; - } - - private static int checkPart(String languageTag, int idx, boolean leader) { - for (; idx < languageTag.length(); idx++) { - int ch = languageTag.charAt(idx) ; - if ( leader ) { - if ( RiotChars.isA2Z(ch) ) - continue ; - } else { - if ( RiotChars.isA2ZN(ch) ) - continue ; - } - // Not acceptable. - return idx ; - } - // Off end. - return idx ; + return LangTagX.checkLanguageTag(languageTag); } /** @@ -181,8 +148,10 @@ public class LangTag { * constants for the array contents. Parts not present cause a null in * the return array. * - * @return Langtag parts, or null if the input string does not parse as a lang tag. + * @return The language tag parts, or null if the input string does not parse as a lang tag. 
+ * @deprecated Use {@link org.apache.jena.langtag.LangTag#of(String)} to create a language tag object. */ + @Deprecated(forRemoval = true) public static String[] parse(String languageTag) { String[] parts = new String[partsLength] ; @@ -224,50 +193,6 @@ public class LangTag { return parts ; } - /** Canonicalize with the rules of RFC 4646, or RFC5646 without replacement of preferred form. */ - public static String canonical(String str) { - if ( str == null ) - return null ; - String[] parts = parse(str) ; - String x = canonical(parts) ; - if ( x == null ) { - // Could try to apply the rule case-setting rules - // even through it's not a conforming langtag. - return str ; - } - return x ; - } - - /** - * Canonicalize with the rules of RFC 4646 "In this format, all non-initial - * two-letter subtags are uppercase, all non-initial four-letter subtags are - * titlecase, and all other subtags are lowercase." In addition, leave - * extensions unchanged. - * <p> - * This is the same as RFC5646 without replacement of preferred form - * or consulting the registry. - */ - public static String canonical(String[] parts) { - // We canonicalised parts on parsing. - if ( parts == null ) - return null ; - - if ( parts[0] == null ) { - // Grandfathered - return parts[idxExtension] ; - } - - StringBuilder sb = new StringBuilder() ; - sb.append(parts[0]) ; - for (int i = 1; i < parts.length; i++) { - if ( parts[i] != null ) { - sb.append("-") ; - sb.append(parts[i]) ; - } - } - return sb.toString() ; - } - private static String strcase(String string) { if ( string == null ) return null ; @@ -278,6 +203,7 @@ public class LangTag { return lowercase(string) ; } + private static String lowercase(String string) { if ( string == null ) return null ; @@ -298,4 +224,12 @@ public class LangTag { string = lowercase(string.substring(1)) ; return ch1 + string ; } + /** + * Canonicalize with the rules ofRFC 5646 without replacement of preferred form. 
+ * @deprecated Use {@link LangTagX#formatLanguageTag(String)} + */ + @Deprecated(forRemoval = true) + public static String canonical(String langTagStr) { + return LangTagX.formatLanguageTag(langTagStr); + } } diff --git a/jena-arq/src/test/java/org/apache/jena/riot/TC_Riot.java b/jena-arq/src/test/java/org/apache/jena/riot/TC_Riot.java index ed95a8c9c1..1075552297 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/TC_Riot.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/TC_Riot.java @@ -42,6 +42,7 @@ import org.apache.jena.riot.writer.TS_RiotWriter ; , TS_Out.class , TS_Lang.class , TS_RiotGeneral.class + , TS_RiotSystem.class , TS_IO2.class , TS_RIOTAdapters.class , TS_Process.class diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TS_Lang.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TS_Lang.java index b7535a68be..ecd5f0f067 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TS_Lang.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TS_Lang.java @@ -28,7 +28,7 @@ import org.junit.runners.Suite ; @RunWith(Suite.class) @Suite.SuiteClasses( { TestIRI.class - , TestLang.class + , TestRDFLanguages.class , TestBlankNodeAllocator.class , TestNodeAllocator.class // Older tests , TestLabelToNode.class // Newer tests diff --git a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangJsonLD_DocLoader.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangJsonLD_DocLoader.java index 47052e3b48..47ab95fca8 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangJsonLD_DocLoader.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLangJsonLD_DocLoader.java @@ -78,9 +78,6 @@ public class TestLangJsonLD_DocLoader { @Override public Document loadDocument(URI url, DocumentLoaderOptions options) throws JsonLdError { - - System.out.println(url); - DocumentLoader loader = HttpLoader.defaultInstance(); JsonObject obj = Json.createObjectBuilder() diff --git 
a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLang.java b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestRDFLanguages.java similarity index 99% rename from jena-arq/src/test/java/org/apache/jena/riot/lang/TestLang.java rename to jena-arq/src/test/java/org/apache/jena/riot/lang/TestRDFLanguages.java index 93e6312d13..b4570e8654 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/lang/TestLang.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/lang/TestRDFLanguages.java @@ -33,7 +33,7 @@ import org.apache.jena.util.FileUtils ; import org.junit.Assert ; import org.junit.Test ; -public class TestLang +public class TestRDFLanguages { static { JenaSystem.init(); } @Test public void registration_01() { testregistration(RDFLanguages.RDFXML) ; } diff --git a/jena-arq/src/test/java/org/apache/jena/riot/system/TS_RiotSystem.java b/jena-arq/src/test/java/org/apache/jena/riot/system/TS_RiotSystem.java index 097202a815..c44b2dc6da 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/system/TS_RiotSystem.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/system/TS_RiotSystem.java @@ -18,17 +18,11 @@ package org.apache.jena.riot.system; -// Test classes get missed -import org.junit.platform.suite.api.SelectClasses; -import org.junit.platform.suite.api.Suite; -@Suite -@SelectClasses({ +import org.junit.runner.RunWith ; +import org.junit.runners.Suite ; -//import org.junit.runner.RunWith; -//import org.junit.runners.Suite; -//import org.junit.runners.Suite.SuiteClasses; -//@RunWith(Suite.class) -//@SuiteClasses({ +@RunWith(Suite.class) +@Suite.SuiteClasses( { TestChecker.class , TestStreamRDF.class , TestFactoryRDF.class diff --git a/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java b/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java index 9022e7ee51..ca580195fe 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java @@ -23,12 +23,9 @@ 
import org.junit.runners.Suite ; import org.junit.runners.Suite.SuiteClasses ; @RunWith(Suite.class) -@SuiteClasses({ - TestLangTag.class - , TestWebContent.class +@SuiteClasses({ + TestWebContent.class }) -public class TS_RiotWeb -{ -} +public class TS_RiotWeb {} diff --git a/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java b/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java deleted file mode 100644 index bf25e10ba1..0000000000 --- a/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.jena.riot.web; - -import static org.junit.Assert.*; - -import org.junit.Test ; - -public class TestLangTag -{ - @Test public void parse_01() - { parseGood("en", "en", "en", null, null, null, null) ; } - - @Test public void parse_02() - { parseGood("en-uk", "en-UK", "en", null, "UK", null, null) ; } - - @Test public void parse_03() - { parseGood("es-419", "es-419", "es", null, "419", null, null) ; } - - @Test public void parse_04() - { parseGood("zh-Hant", "zh-Hant", "zh", "Hant", null, null, null) ; } - - @Test public void parse_05() - { parseGood("sr-Latn-CS", "sr-Latn-CS", "sr", "Latn", "CS", null, null) ; } - - @Test public void parse_06() - { parseGood("sl-nedis", "sl-nedis", "sl", null, null, "nedis", null) ; } - - @Test public void parse_07() - { parseGood("sl-IT-nedis", "sl-IT-nedis", "sl", null, "IT", "nedis", null) ; } - - @Test public void parse_08() - { parseGood("sl-Latn-IT-nedis", "sl-Latn-IT-nedis", "sl", "Latn", "IT", "nedis", null) ; } - - @Test public void parse_09() - { parseGood("de-CH-x-Phonebk", "de-CH-x-Phonebk", "de", null, "CH", null, "x-Phonebk") ; } - - @Test public void parse_10() - { parseGood("zh-cn-a-myExt-x-private", "zh-CN-a-myExt-x-private", - "zh", null, "CN", null, "a-myExt-x-private") ; } - - @Test public void parse_bad_01() { parseBad("i18n") ; } - @Test public void parse_bad_02() { parseBad("i@n") ; } - @Test public void parse_bad_03() { parseBad("123-abc") ; } - @Test public void parse_bad_04() { parseBad("en-") ; } - - private static void parseGood(String input, String ex_output, String... 
ex_parts ) - { - String[] parts = LangTag.parse(input) ; - assertArrayEquals(ex_parts, parts) ; - - String output = LangTag.canonical(input) ; - assertEquals(ex_output, output) ; - - assertTrue(LangTag.check(input)) ; - } - - - private static void parseBad(String input) - { - String[] parts = LangTag.parse(input) ; - assertNull(parts) ; - String output = LangTag.canonical(input) ; - assertEquals(input, output) ; - assertFalse(LangTag.check(input)) ; - } - - private void testCanonical(String input, String ex_output) { - String output = LangTag.canonical(input) ; - assertEquals(ex_output, output) ; - } - - // "x" extensions and irregular forms are left alone, including "sgn-be-fr" - - // Mentioned in BCP 47 tests -// @Test public void parseCanonical_01() { testCanonical("en-ca-x-ca","en-CA-x-ca"); } // "x" -// @Test public void parseCanonical_02() { testCanonical("EN-ca-X-Ca","en-CA-x-ca"); } -// @Test public void parseCanonical_03() { testCanonical("En-Ca-X-Ca","en-CA-x-ca"); } -// @Test public void parseCanonical_04() { testCanonical("SGN-BE-FR","sgn-BE-FR"); } // Irregular -// @Test public void parseCanonical_05() { testCanonical("sgn-be-fr","sgn-BE-FR"); } // Irregular -// @Test public void parseCanonical_06() { testCanonical("AZ-latn-x-LATN","az-Latn-x-latn"); } -// @Test public void parseCanonical_07() { testCanonical("Az-latn-X-Latn","az-Latn-x-latn"); } - - @Test public void parseCanonical_10() { testCanonical("zh-hant", "zh-Hant"); } - @Test public void parseCanonical_11() { testCanonical("zh-latn-wadegile", "zh-Latn-wadegile"); } - @Test public void parseCanonical_12() { testCanonical("zh-latn-pinyin", "zh-Latn-pinyin"); } - @Test public void parseCanonical_13() { testCanonical("en-us", "en-US"); } - @Test public void parseCanonical_14() { testCanonical("EN-Gb", "en-GB"); } - @Test public void parseCanonical_15() { testCanonical("qqq-002", "qqq-002"); } - @Test public void parseCanonical_16() { testCanonical("ja-latn", "ja-Latn"); } - @Test public void 
parseCanonical_17() { testCanonical("x-local", "x-local"); } - @Test public void parseCanonical_18() { testCanonical("he-latn", "he-Latn"); } - @Test public void parseCanonical_19() { testCanonical("und", "und"); } - @Test public void parseCanonical_20() { testCanonical("nn", "nn"); } - @Test public void parseCanonical_21() { testCanonical("ko-latn", "ko-Latn"); } - @Test public void parseCanonical_22() { testCanonical("ar-latn", "ar-Latn"); } - @Test public void parseCanonical_23() { testCanonical("la-x-liturgic", "la-x-liturgic"); } - @Test public void parseCanonical_24() { testCanonical("fa-x-middle", "fa-x-middle"); } - @Test public void parseCanonical_25() { testCanonical("qqq-142", "qqq-142"); } - @Test public void parseCanonical_26() { testCanonical("bnt", "bnt"); } - @Test public void parseCanonical_27() { testCanonical("grc-x-liturgic", "grc-x-liturgic"); } - @Test public void parseCanonical_28() { testCanonical("egy-Latn", "egy-Latn"); } - @Test public void parseCanonical_29() { testCanonical("la-x-medieval", "la-x-medieval"); } -} diff --git a/jena-arq/src/test/java/org/apache/jena/riot/web/TestWebContent.java b/jena-arq/src/test/java/org/apache/jena/riot/web/TestWebContent.java index 9296047e9d..0da87e170a 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/web/TestWebContent.java +++ b/jena-arq/src/test/java/org/apache/jena/riot/web/TestWebContent.java @@ -20,51 +20,44 @@ package org.apache.jena.riot.web; import org.junit.Assert; -import org.apache.jena.riot.WebContent ; +import org.apache.jena.riot.WebContent; import org.junit.Test; public class TestWebContent { @Test - public void testCanonicaliseMimeTypes1() - { + public void testCanonicaliseMimeTypes1() { testCanonicalise(WebContent.contentTypeTurtle, WebContent.contentTypeTurtle); testCanonicalise(WebContent.contentTypeTurtleAlt1, WebContent.contentTypeTurtle); } @Test - public void testCanonicaliseMimeTypes2() - { + public void testCanonicaliseMimeTypes2() { 
testCanonicalise(WebContent.contentTypeN3, WebContent.contentTypeN3); testCanonicalise(WebContent.contentTypeN3Alt1, WebContent.contentTypeN3); testCanonicalise(WebContent.contentTypeN3Alt2, WebContent.contentTypeN3); } @Test - public void testCanonicaliseMimeTypes3() - { + public void testCanonicaliseMimeTypes3() { testCanonicalise(WebContent.contentTypeNTriples, WebContent.contentTypeNTriples); testCanonicalise(WebContent.contentTypeNTriplesAlt, WebContent.contentTypeNTriples); } @Test - public void testCanonicaliseMimeTypes4() - { + public void testCanonicaliseMimeTypes4() { testCanonicalise(WebContent.contentTypeNQuads, WebContent.contentTypeNQuads); testCanonicalise(WebContent.contentTypeNQuadsAlt1, WebContent.contentTypeNQuads); } @Test - public void testCanonicaliseMimeTypes5() - { + public void testCanonicaliseMimeTypes5() { testCanonicalise(WebContent.contentTypeTriG, WebContent.contentTypeTriG); testCanonicalise(WebContent.contentTypeTriGAlt1, WebContent.contentTypeTriG); } - private void testCanonicalise(String input, String expected) - { + private void testCanonicalise(String input, String expected) { String canonical = WebContent.contentTypeCanonical(input); Assert.assertEquals(expected, canonical); } - } diff --git a/jena-core/src/main/java/org/apache/jena/graph/NodeFactory.java b/jena-core/src/main/java/org/apache/jena/graph/NodeFactory.java index 29727c46ee..bd94b203db 100644 --- a/jena-core/src/main/java/org/apache/jena/graph/NodeFactory.java +++ b/jena-core/src/main/java/org/apache/jena/graph/NodeFactory.java @@ -20,6 +20,7 @@ package org.apache.jena.graph; import static org.apache.jena.atlas.lib.Lib.isEmpty; +import static org.apache.jena.langtagx.LangTagX.formatLanguageTag; import java.util.Objects; @@ -30,7 +31,6 @@ import org.apache.jena.datatypes.xsd.impl.RDFDirLangString; import org.apache.jena.datatypes.xsd.impl.RDFLangString; import org.apache.jena.graph.impl.LiteralLabel; import org.apache.jena.graph.impl.LiteralLabelFactory; -import 
org.apache.jena.graph.langtag.LangTags; import org.apache.jena.shared.JenaException; import org.apache.jena.sys.JenaSystem; @@ -137,8 +137,7 @@ public class NodeFactory { */ public static Node createLiteralDirLang(String string, String lang, String textDir) { TextDirection textDirEnum = initialTextDirection(textDir); - String langFmt = formatLanguageTag(lang); - return createLiteralDirLang(string, langFmt, textDirEnum); + return createLiteralDirLang(string, lang, textDirEnum); } private static boolean noTextDir(TextDirection textDir) { @@ -149,7 +148,7 @@ public class NodeFactory { Objects.requireNonNull(string, "null lexical form for literal"); if ( isEmpty(lang) ) { if ( textDir != null ) - throw new JenaException("The language must be gived for a language direction literal"); + throw new JenaException("The language must be given for a language direction literal"); return new Node_Literal(string); } if ( noTextDir(textDir) ) @@ -205,6 +204,7 @@ public class NodeFactory { * needing the caller to differentiate between the xsd:string, rdf:langString, and other * datatype cases. * It calls {@link #createLiteralString(String)}, + * {@link #createLiteralLang(String, String)} or * {@link #createLiteralDirLang(String, String, String)} or * {@link #createLiteralDT(String, RDFDatatype)} * as appropriate. @@ -217,37 +217,29 @@ public class NodeFactory { public static Node createLiteral(String lex, String lang, TextDirection textDir, RDFDatatype dtype) { Objects.requireNonNull(lex, "null lexical form for literal"); boolean hasLang = ! isEmpty(lang); + boolean hasTextDirLang = ! noTextDir(textDir); + + if ( hasTextDirLang && ! hasLang ) + throw new JenaException("The language must be given for a language direction literal"); + + // Datatype check when lang present. if ( hasLang ) { - String langFmt = formatLanguageTag(lang); - if ( dtype != null ) { - if ( noTextDir(textDir) ) { - if ( ! 
dtype.equals(RDFLangString.rdfLangString) ) - throw new JenaException("Datatype is not rdf:langString but a language was given"); - } else { - if ( ! dtype.equals(RDFDirLangString.rdfDirLangString) ) - throw new JenaException("Datatype is not rdf:dirLangString but a language and initial text direction was given"); - } + if ( ! hasTextDirLang ) { + if ( dtype != null && ! dtype.equals(RDFLangString.rdfLangString) ) + throw new JenaException("Datatype is not rdf:langString but a language was given"); + return createLiteralLang(lex, lang); } - - return createLiteralDirLang(lex, langFmt, textDir); + // hasLang && hasTextDirLang + if ( dtype != null && ! dtype.equals(RDFDirLangString.rdfDirLangString) ) + throw new JenaException("Datatype is not rdf:dirLangString but a language and initial text direction was given"); + return createLiteralDirLang(lex, lang, textDir); } + // no language tag, no text direction, no datatype. if ( dtype == null ) // No datatype, no lang (it is null or "") => xsd:string. return createLiteralString(lex); - // No language. Has a datatype. - boolean hasTextDirLang = ( textDir != null ); - if ( hasTextDirLang ) { - if ( dtype.equals(RDFDirLangString.rdfDirLangString) ) { - // No language. Datatype is rdf:dirLangString, Does have an initial text direction - throw new JenaException("Datatype is rdf:dirLangString and has an initial text direction but no language given"); - } else if ( dtype.equals(RDFLangString.rdfLangString) ) { - // No language. Datatype is rdf:langString, Does have an initial text direction. - throw new JenaException("Datatype is rdf:langString and has an initial text direction but no language given"); - } - } - // Datatype. No language, no initial text direction. 
// Allow "abc"^^rdf:langString // Allow "abc"^^rdf:dirLangString @@ -262,19 +254,6 @@ public class NodeFactory { return n; } - /*package*/ static final boolean legacyLangTag = false; - /** Prepare the language tag - apply formatting normalization */ - private static String formatLanguageTag(String langTagStr) { - // LangTags.formatLangtag(input) except with the legacy option. - if ( langTagStr == null ) - return Node.noLangTag; - if ( legacyLangTag ) - return langTagStr; - if ( langTagStr.isEmpty() ) - return langTagStr; - return LangTags.basicFormat(langTagStr); - } - /** Prepare the initial text direction - apply formatting normalization */ private static TextDirection initialTextDirection(String input) { if ( isEmpty(input) ) diff --git a/jena-core/src/main/java/org/apache/jena/graph/langtag/LangTags.java b/jena-core/src/main/java/org/apache/jena/graph/langtag/LangTags.java index bf28bdd9ee..a52239516a 100644 --- a/jena-core/src/main/java/org/apache/jena/graph/langtag/LangTags.java +++ b/jena-core/src/main/java/org/apache/jena/graph/langtag/LangTags.java @@ -18,153 +18,31 @@ package org.apache.jena.graph.langtag; -import static org.apache.jena.atlas.lib.Lib.lowercase; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.jena.atlas.lib.Lib; -import org.apache.jena.atlas.logging.Log; -import org.apache.jena.graph.Node; - -/** Functions on language tag strings */ +/** + * Functions on language tag strings + * @deprecated use org.apache.jena.langtag.LangTags + */ +@Deprecated(forRemoval = true) public class LangTags { /** * Format language tag. * This is the system-wide policy for formatting language tags. 
+ * @deprecated use org.apache.jena.langtag.LangTags#formatLangtag(String) */ + @Deprecated(forRemoval=true) public static String formatLangtag(String input) { - if ( input == null ) - return Node.noLangTag; - if ( input.isEmpty() ) - return input; - return basicFormat(input); + return org.apache.jena.langtag.LangTags.format(input); } -// /** -// * Language tag formatter. -// * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1">RFC 5646 section 2.1.1</a> -// */ -// public static String formatRFC5646(String string) { -// return basicFormat(string); -// } - /** * Format an language tag assumed to be valid. * This code only deals with langtags by the string length of the subtags. * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1">RFC 5646 section 2.1.1</a> + * @deprecated use org.apache.jena.langtag.LangTags#basicFormat(String) */ + @Deprecated(forRemoval=true) public static String basicFormat(String string) { - // with the interpretation that "after singleton" means anywhere after the singleton. - if ( string == null ) - return null; - if ( string.isEmpty() ) - return string; - List<String> strings = splitOnDash(string); - if ( strings == null ) { - return lowercase(string); - //error("Bad language string: %s", string); - } - StringBuilder sb = new StringBuilder(string.length()); - boolean singleton = false; - boolean first = true; - - for ( String s : strings ) { - if ( first ) { - // language - sb.append(lowercase(s)); - first = false; - continue; - } - first = false; - // All subtags after language - sb.append('-'); - if ( singleton ) - // Always lowercase - sb.append(lowercase(s)); - else { - // case depends on ;length - sb.append(strcase(s)); - // is it the start of an extension or privateuse - // XXX s.length()==1? - if ( s.length() == 1 ) - singleton = true; - } - } - return sb.toString(); - } - - private static List<String> splitOnDash(String x) { - List<String> strings = new ArrayList<>(6); - // Split efficiently(?) 
based on [a-z][A-Z][0-9] units separated by "-"s - StringBuilder sb = new StringBuilder(); - - boolean start = true; - for ( int idx = 0; idx < x.length(); idx++ ) { - char ch = x.charAt(idx); - if ( isA2ZN(ch) ) { - sb.append(ch); - continue; - } - if ( ch == '-' ) { - String str = sb.toString(); - strings.add(str); - sb.setLength(0); - continue; - } - error("Bad character: (0x%02X) '%s' index %d", (int)ch, str(ch), idx); - } - String strLast = sb.toString(); - if ( strLast.isEmpty() ) { - // Ends in "-" - return null; - //error("Empty part: %s", x); - } - strings.add(strLast); - return strings; - } - - private static void error(String msg, Object ... args) { - String x = String.format(msg, args); - //throw new LangTagException(x); - Log.warn("LangTag", x); - } - - private static String strcase(String string) { - if ( string == null ) - return null; - if ( string.length() == 2 ) - return Lib.uppercase(string); - if ( string.length() == 4 ) - return titlecase(string); - return lowercase(string); - } - - private static String titlecase(String string) { - if ( string == null ) - return null; - char ch1 = string.charAt(0); - ch1 = Character.toUpperCase(ch1); - string = lowercase(string.substring(1)); - return ch1 + string; - } - - private static String str(char ch) { - return String.format("'%s' U+%04X", Character.valueOf(ch), (int)ch); - } - - /** ASCII A-Z */ - /*package*/ static boolean isA2Z(int ch) { - return range(ch, 'a', 'z') || range(ch, 'A', 'Z'); - } - - /** ASCII A-Z or 0-9 */ - /*package*/ static boolean isA2ZN(int ch) { - return range(ch, 'a', 'z') || range(ch, 'A', 'Z') || range(ch, '0', '9'); - } - - private static boolean range(int ch, char a, char b) { - return (ch >= a && ch <= b); + return org.apache.jena.langtag.LangTags.basicFormat(string); } } diff --git a/jena-core/src/main/java/org/apache/jena/langtagx/LangTagX.java b/jena-core/src/main/java/org/apache/jena/langtagx/LangTagX.java new file mode 100644 index 0000000000..d385f865e6 --- /dev/null 
+++ b/jena-core/src/main/java/org/apache/jena/langtagx/LangTagX.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.langtagx; + +import org.apache.jena.langtag.LangTag; +import org.apache.jena.langtag.LangTagException; +import org.apache.jena.langtag.LangTags; +import org.apache.jena.shared.JenaException; + +/** + * This class defines the Jena side policies for language tags and maps operations to + * {@link org.apache.jena.langtag.LangTags}. + */ +public class LangTagX { + + /** + * Create a {@link LangTag} object, using the Jena system default + * implementation of the {@code LangTag} interface. + * The string must conform to the rules and syntax defined in + * <a href="https://datatracker.ietf.org/doc/html/rfc5646">RFC 5646</a> + */ + public static LangTag createLanguageTag(String langTagStr) { + try { + return org.apache.jena.langtag.LangTags.create(langTagStr); + } catch (LangTagException ex) { + throw convertException(ex); + } + } + + /*package*/ static final boolean legacyLangTag = false; + /** + * Prepare the language tag - apply formatting normalization, and always return a string.
+ * If the input is invalid as a language tag, apply basic subtag formatting, or return the input as-is if that also fails. + * @throws JenaException on an all blank string. + */ + public static String formatLanguageTag(String langTagStr) { + if ( langTagStr == null ) + return langTagStr; + if ( legacyLangTag ) + return langTagStr; + if ( langTagStr.isEmpty() ) + return langTagStr; + try { + return LangTags.format(langTagStr); + } catch (LangTagException ex) { + if ( langTagStr.isBlank() ) + throw new JenaException("Language tag string is all white space"); + // Bad language tag. e.g. over long primary language or subtags. + // Apply a more basic formatting - split into segments and + // apply the subtag length rules. + try { + return LangTags.basicFormat(langTagStr); + } catch (LangTagException ex2) { + // Very bad + return langTagStr; + } + } + } + + /** + * Check a string is valid as a language tag. + * This function returns true or false and does not throw an exception. + */ + public static boolean checkLanguageTag(String langTagStr) { + return org.apache.jena.langtag.LangTags.check(langTagStr); + } + + /** + * Check a language tag string meets the Turtle(etc) and SPARQL grammar rule + * for a language tag without initial text direction. + * <p> + * Passing this test does not guarantee the string is a valid language tag. Use + * {@link LangTagX#checkLanguageTag(String)} for validity checking. + * + * @return true or false + */ + public static boolean checkLanguageTagBasicSyntax(String langTagStr) { + return org.apache.jena.langtag.LangTags.basicCheck(langTagStr); + } + + /** + * Check a string is valid as a language tag. + * Throw a {@link JenaException} if it is not valid. + */ + public static void requireValidLanguageTag(String langTagStr) { + try { + org.apache.jena.langtag.LangTags.requireValid(langTagStr); + } catch (LangTagException ex) { + throw convertException(ex); + } + } + + /** Is {@code langTagStr1} the same language tag as {@code langTagStr2}?
*/ + public static boolean sameLanguageTagAs(String langTagStr1, String langTagStr2) { + requireValidLanguageTag(langTagStr1); + requireValidLanguageTag(langTagStr2); + return langTagStr1.equalsIgnoreCase(langTagStr2); + } + + private static JenaException convertException(LangTagException ex) { + return new JenaException(ex.getMessage()); + } +} diff --git a/jena-core/src/test/java/org/apache/jena/graph/test/TestTypedLiterals.java b/jena-core/src/test/java/org/apache/jena/graph/test/TestTypedLiterals.java index 8ddacbeea6..6d0c1f5714 100644 --- a/jena-core/src/test/java/org/apache/jena/graph/test/TestTypedLiterals.java +++ b/jena-core/src/test/java/org/apache/jena/graph/test/TestTypedLiterals.java @@ -73,7 +73,7 @@ public class TestTypedLiterals extends TestCase { /** * Test the base functioning of unknown datatypes */ - public void testUnknown() { + public void testUnknownDatatype() { String typeURI = "urn:jena-dt:unknown"; String typeURI2 = "urn:jena-dt:unknown2"; @@ -82,7 +82,7 @@ public class TestTypedLiterals extends TestCase { Literal l1 = m.createTypedLiteral("foo", typeURI); Literal l3 = m.createTypedLiteral("15", typeURI); Literal l5 = m.createTypedLiteral("foo", typeURI2); - Literal l6 = m.createLiteral("foo", "lang1"); + Literal l6 = m.createLiteral("foo", "lang"); JenaParameters.enableSilentAcceptanceOfUnknownDatatypes = originalFlag; // Check for successful creation diff --git a/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java b/jena-core/src/test/java/org/apache/jena/langtagx/TS_LangTagX.java similarity index 60% copy from jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java copy to jena-core/src/test/java/org/apache/jena/langtagx/TS_LangTagX.java index 9022e7ee51..96633d00fe 100644 --- a/jena-arq/src/test/java/org/apache/jena/riot/web/TS_RiotWeb.java +++ b/jena-core/src/test/java/org/apache/jena/langtagx/TS_LangTagX.java @@ -16,19 +16,27 @@ * limitations under the License. 
*/ -package org.apache.jena.riot.web; +package org.apache.jena.langtagx; -import org.junit.runner.RunWith ; -import org.junit.runners.Suite ; -import org.junit.runners.Suite.SuiteClasses ; +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import junit.framework.JUnit4TestAdapter; +import junit.framework.TestSuite; + +//JUnit5. Does not mix with JUnit3. So until jena-core updates to JUnit 4 or 5 ... +//@Suite +//@SelectClasses({ @RunWith(Suite.class) -@SuiteClasses({ - TestLangTag.class - , TestWebContent.class [email protected]( { + TestLangTagX.class }) -public class TS_RiotWeb -{ +public class TS_LangTagX { + public static TestSuite suite() { + TestSuite ts = new TestSuite(); + ts.setName("LangTagX"); + ts.addTest(new JUnit4TestAdapter(TS_LangTagX.class)); + return ts; + } } - diff --git a/jena-core/src/test/java/org/apache/jena/langtagx/TestLangTagX.java b/jena-core/src/test/java/org/apache/jena/langtagx/TestLangTagX.java new file mode 100644 index 0000000000..052067b880 --- /dev/null +++ b/jena-core/src/test/java/org/apache/jena/langtagx/TestLangTagX.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.langtagx; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.Test ; + +import org.apache.jena.langtag.LangTag; +import org.apache.jena.shared.JenaException; + +/** + * Test of the LangTagX adapter to language tag implementation. + * This is not a comprehensive set of tests for language tags. + */ +public class TestLangTagX { + + @Test public void check_langtag_01() { + assertTrue(LangTagX.checkLanguageTag("en-gb")); + } + + @Test public void check_langtag_02() { + assertTrue(LangTagX.checkLanguageTag("en-gb-oed")); + } + + @Test public void check_langtag_03() { + assertFalse(LangTagX.checkLanguageTag("en-ab-xy")); + } + + @Test public void require_langtag_01() { + LangTagX.requireValidLanguageTag("en-gb"); + } + + @Test(expected = JenaException.class) + public void require_langtag_02() { + LangTagX.requireValidLanguageTag("en-ab-xy"); + } + + @Test public void langtag_01() { + LangTag langTag = LangTagX.createLanguageTag("en-gb"); + assertNotNull(langTag); + } + + @Test(expected = JenaException.class) + public void langtag_02() { + LangTag langTag = LangTagX.createLanguageTag("en-ab-xy"); + assertNotNull(langTag); + } + + @Test public void langtag_format_01() { + String fmt = LangTagX.formatLanguageTag("en-gb"); + assertEquals("en-GB", fmt); + } + + @Test public void langtag_format_02() { + String fmt = LangTagX.formatLanguageTag("en-latn-illegalSubTag"); + // Falls back to the "by part" formatting + assertEquals("en-Latn-illegalsubtag", fmt); + } + + @Test public void langtag_format_03() { + String fmt = LangTagX.formatLanguageTag(""); + assertEquals("", fmt); + } + + @Test(expected = JenaException.class) + public void langtag_format_04() { + String fmt = LangTagX.formatLanguageTag(" "); + assertEquals("", fmt); + } +}
diff --git a/jena-core/src/test/java/org/apache/jena/test/TestPackage_core.java b/jena-core/src/test/java/org/apache/jena/test/TestPackage_core.java index 513d341242..1749cc66ea 100644 --- a/jena-core/src/test/java/org/apache/jena/test/TestPackage_core.java +++ b/jena-core/src/test/java/org/apache/jena/test/TestPackage_core.java @@ -38,6 +38,7 @@ public class TestPackage_core extends TestCase { addTest(ts, "System setup", TestSystemSetup.suite()); addTest(ts, "IRIx", org.apache.jena.irix.TS_IRIx.suite()); + addTest(ts, "LangTagX", org.apache.jena.langtagx.TS_LangTagX.suite()); addTest(ts, "Enhanced", org.apache.jena.enhanced.test.TestPackage_enh.suite()); addTest(ts, "Datatypes", org.apache.jena.datatypes.TestPackage_dt.suite()) ; addTest(ts, "Graph", org.apache.jena.graph.test.TestPackage_graph.suite()); diff --git a/jena-core/src/test/java/org/apache/jena/ttl_test/test/turtle/TurtleInternalTests.java b/jena-core/src/test/java/org/apache/jena/ttl_test/test/turtle/TurtleInternalTests.java index 8a1fd989d8..7ba92db599 100644 --- a/jena-core/src/test/java/org/apache/jena/ttl_test/test/turtle/TurtleInternalTests.java +++ b/jena-core/src/test/java/org/apache/jena/ttl_test/test/turtle/TurtleInternalTests.java @@ -31,230 +31,231 @@ public class TurtleInternalTests extends TestSuite static public TestSuite suite() { return new TurtleInternalTests() ; } - static public final String QUOTE3 = "\"\"\"" ; static public boolean VERBOSE = false ; - + public TurtleInternalTests() { super("Turtle Parser Syntactic tests") ; - + // ---- Debug testing - //addTest(new Test("<thing> b:px.b:py [] . ")) ; + //addTest("<thing> b:px.b:py [] . "); // if ( true ) return ; // ---- Debug testing // Make sure basic things, at least, parse. - + // URIs, qnames, statements, prefixes - + // End of statement (and whitespace) - addTest(new Test("a:subj a:prop a:d .")) ; - addTest(new Test("a:subj a:prop a:d . ")) ; - addTest(new Test("a:subj a:prop a:d.")) ; - addTest(new Test("a:subj a:prop a:d. 
")) ; - - addTest(new Test("rdf: rdf:type :_.")) ; - addTest(new Test("@prefix start: <somewhere>.")) ; - addTest(new Test("<http://here/subj> <http://here/prep> <http://here/obj>.")) ; - + addTest("a:subj a:prop a:d ."); + addTest("a:subj a:prop a:d . "); + addTest("a:subj a:prop a:d."); + addTest("a:subj a:prop a:d. "); + + addTest("rdf: rdf:type :_."); + addTest("@prefix start: <somewhere>."); + addTest("<http://here/subj> <http://here/prep> <http://here/obj>."); + // Whitespace, comments - addTest(new Test("a:subj\ta:prop\ta:d.\t")) ; - addTest(new Test(" a:subj\ta:prop\ta:d. ")) ; - addTest(new Test("a:subj a:prop a:d. ")) ; - addTest(new Test("")) ; - addTest(new Test(" #Comment")) ; - addTest(new Test("a:subj a:prop a:d. # Comment")) ; - addTest(new Test("a:subj a:prop a:d.# Comment")) ; + addTest("a:subj\ta:prop\ta:d.\t"); + addTest(" a:subj\ta:prop\ta:d. "); + addTest("a:subj a:prop a:d. "); + addTest(""); + addTest(" #Comment"); + addTest("a:subj a:prop a:d. # Comment"); + addTest("a:subj a:prop a:d.# Comment"); // Literal: strings - addTest(new Test("a:subj a:prop 'string1'.")) ; - addTest(new Test("a:subj a:prop \"string2\".")) ; - addTest(new Test("a:subj a:prop '''string3'''.")) ; - addTest(new Test("a:subj a:prop "+QUOTE3+"string3"+QUOTE3+".")) ; - + addTest("a:subj a:prop 'string1'."); + addTest("a:subj a:prop \"string2\"."); + addTest("a:subj a:prop '''string3'''."); + addTest("a:subj a:prop "+QUOTE3+"string3"+QUOTE3+"."); + // Literals: datatypes - addTest(new Test("a:subj a:prop 'string1'^^x:dt.")) ; - addTest(new Test("a:subj a:prop 'string1'^^<uriref>.")) ; - + addTest("a:subj a:prop 'string1'^^x:dt."); + addTest("a:subj a:prop 'string1'^^<uriref>."); + // Literals: numbers. - addTest(new Test("a: :p 2. .")) ; - addTest(new Test("a: :p +2. 
.")) ; - addTest(new Test("a: :p -2 .")) ; - addTest(new Test("a: :p 2e6.")) ; - addTest(new Test("a: :p 2e-6.")) ; - addTest(new Test("a: :p -2e-6.")) ; - addTest(new Test("a: :p 2.0e-6.")) ; - addTest(new Test("a: :p 2.0 .")) ; - + addTest("a: :p 2. ."); + addTest("a: :p +2. ."); + addTest("a: :p -2 ."); + addTest("a: :p 2e6."); + addTest("a: :p 2e-6."); + addTest("a: :p -2e-6."); + addTest("a: :p 2.0e-6."); + addTest("a: :p 2.0 ."); + // // The "unusual" cases -// addTest(new Test("a:subj 'prop'^^<uriref> 'string'.")) ; -// addTest(new Test("a:subj a:prop 'string1'^^'stringDT'.")) ; +// addTest("a:subj 'prop'^^<uriref> 'string'."); +// addTest("a:subj a:prop 'string1'^^'stringDT'."); // -// addTest(new Test("a:subj a:prop1 ?x ^^ x:dt.")) ; -// addTest(new Test("a:subj a:prop2 ?x ^^ ?x.")) ; +// addTest("a:subj a:prop1 ?x ^^ x:dt."); +// addTest("a:subj a:prop2 ?x ^^ ?x."); // Quotes in string - addTest(new Test("a:subj a:prop \"\\'string2\\'\".")) ; - addTest(new Test("a:subj a:prop \"\\\"string2\\\"\".")) ; - addTest(new Test("a:subj a:prop '\\'string1\\'\'.")) ; - addTest(new Test("a:subj a:prop '\\\"string1\\\"\'.")) ; - - addTest(new Test("a:q21 a:prop "+QUOTE3+"start\"finish"+QUOTE3+".")) ; - addTest(new Test("a:q22 a:prop "+QUOTE3+"start\"\"finish"+QUOTE3+".")) ; - addTest(new Test("a:q2e3 a:prop "+QUOTE3+"start\\\"\\\"\\\"finish"+QUOTE3+".")) ; - addTest(new Test("a:q13 a:prop "+QUOTE3+"start'''finish"+QUOTE3+".")) ; - - addTest(new Test("a:q11 a:prop '''start'finish'''.")) ; - addTest(new Test("a:q12 a:prop '''start''finish'''.")) ; - addTest(new Test("a:q12 a:prop '''start\\'\\'\\'finish'''.")) ; - addTest(new Test("a:q23 a:prop '''start\"\"\"finish'''.")) ; - + addTest("a:subj a:prop \"\\'string2\\'\"."); + addTest("a:subj a:prop \"\\\"string2\\\"\"."); + addTest("a:subj a:prop '\\'string1\\'\'."); + addTest("a:subj a:prop '\\\"string1\\\"\'."); + + addTest("a:q21 a:prop "+QUOTE3+"start\"finish"+QUOTE3+"."); + addTest("a:q22 a:prop 
"+QUOTE3+"start\"\"finish"+QUOTE3+"."); + addTest("a:q2e3 a:prop "+QUOTE3+"start\\\"\\\"\\\"finish"+QUOTE3+"."); + addTest("a:q13 a:prop "+QUOTE3+"start'''finish"+QUOTE3+"."); + + addTest("a:q11 a:prop '''start'finish'''."); + addTest("a:q12 a:prop '''start''finish'''."); + addTest("a:q12 a:prop '''start\\'\\'\\'finish'''."); + addTest("a:q23 a:prop '''start\"\"\"finish'''."); + // Keywords and syntactic sugar -// addTest(new Test("this a:prop x:y .")) ; -// addTest(new Test("a:subj a x:y .")) ; -// addTest(new Test("a:subj = x:y .")) ; -// addTest(new Test("a:subj => x:y .")) ; -// addTest(new Test("a:subj <= x:y .")) ; -// // <=> is not legal : it would mean "implies and is implied by" -// // addTest(new Test("a:subj <=> x:y .")) ; -// addTest(new Test("a:subj >- x:y -> 'value' .")) ; -// addTest(new Test("a:subj >- x:y -> 'value1', 'value2' .")) ; - +// addTest("this a:prop x:y ."); +// addTest("a:subj a x:y ."); +// addTest("a:subj = x:y ."); +// addTest("a:subj => x:y ."); +// addTest("a:subj <= x:y ."); +// // <=> is not legal : it would mean "implies and is implied by" +// // addTest("a:subj <=> x:y ."); +// addTest("a:subj >- x:y -> 'value' ."); +// addTest("a:subj >- x:y -> 'value1', 'value2' ."); + // Not keywords - addTest(new Test("a:subj <a> x:y .")) ; - addTest(new Test("<this> a x:y .")) ; - addTest(new Test("@prefix has: <uri>.")) ; - - addTest(new Test("<> a:prop x:y .")) ; - addTest(new Test("<#> a:prop x:y .")) ; - + addTest("a:subj <a> x:y ."); + addTest("<this> a x:y ."); + addTest("@prefix has: <uri>."); + + addTest("<> a:prop x:y ."); + addTest("<#> a:prop x:y ."); + // Object lists - addTest(new Test("a:subj a:prop a:d, a:e.")) ; - addTest(new Test("a:subj a:prop a:d, '123'.")) ; - addTest(new Test("a:subj a:prop '123', a:e.")) ; - //addTest(new Test("a:subj a:prop '123', .")) ; // Null object list - //addTest(new Test("a:subj a:prop '123', '456', .")) ; // Null object list - + addTest("a:subj a:prop a:d, a:e."); + addTest("a:subj a:prop 
a:d, '123'."); + addTest("a:subj a:prop '123', a:e."); + //addTest("a:subj a:prop '123', ."); // Null object list + //addTest("a:subj a:prop '123', '456', ."); // Null object list + // Property lists - addTest(new Test("a:subj a:p1 a:v1 ; a:p2 a:v2 .")) ; - addTest(new Test("a:subj a:p1 a:v1, a:v2 ; a:p2 a:v2 ; a:p3 'v4' ,'v5' .")) ; - addTest(new Test("a:subj a:p1 a:v1; .")) ; // Null property list - addTest(new Test("a:subj a:p1 a:v1; a:p2 a:v2; .")) ; // Null property list - - + addTest("a:subj a:p1 a:v1 ; a:p2 a:v2 ."); + addTest("a:subj a:p1 a:v1, a:v2 ; a:p2 a:v2 ; a:p3 'v4' ,'v5' ."); + addTest("a:subj a:p1 a:v1; ."); // Null property list + addTest("a:subj a:p1 a:v1; a:p2 a:v2; ."); // Null property list + + // anon nodes - addTest(new Test("[a:prop a:val].")) ; - addTest(new Test("[] a:prop a:val.")) ; - addTest(new Test("[] a:prop [].")) ; - + addTest("[a:prop a:val]."); + addTest("[] a:prop a:val."); + addTest("[] a:prop []."); + // formulae // The final dot (statement terminator of outer statement) is necessary // Inside formulae, it is not. -// addTest(new Test("{:x :y :z} => {:x :y :z}.")) ; -// addTest(new Test("{:x :y :z} => {:x :y :z . }.")) ; -// addTest(new Test("{:x :y :z. } => {:x :y :z}.")) ; - +// addTest("{:x :y :z} => {:x :y :z}."); +// addTest("{:x :y :z} => {:x :y :z . }."); +// addTest("{:x :y :z. } => {:x :y :z}."); + // Variables -// addTest(new Test("?who ?knows ?what .")) ; -// addTest(new Test("{?who ?knows ?what} => {'somesort' 'of' 'logic'}." )) ; - +// addTest("?who ?knows ?what ."); +// addTest("{?who ?knows ?what} => {'somesort' 'of' 'logic'}." ); + // Formulae do not need the trailing '.' -// addTest(new Test("{ this a \"string2\". } => { this a 'string1'} .")) ; - +// addTest("{ this a \"string2\". } => { this a 'string1'} ."); + // And they can have directives in. -// addTest(new Test("{ @prefix : <a> } => { this a 'string1'} .")) ; -// addTest(new Test("{ @prefix : <a> . 
a:x <b> 'c'} => { this a 'string1'} .")) ; - +// addTest("{ @prefix : <a> } => { this a 'string1'} ."); +// addTest("{ @prefix : <a> . a:x <b> 'c'} => { this a 'string1'} ."); + // RDF collections - //addTest(new Test("() .")) ; - addTest(new Test("<here> <list> ().")) ; - addTest(new Test(" ( a:i1 a:i2 a:i3 ) a rdf:List.")) ; - + //addTest("() ."); + addTest("<here> <list> ()."); + addTest(" ( a:i1 a:i2 a:i3 ) a rdf:List."); + // Paths -// addTest(new Test(":x!:y <prop> [].")) ; -// addTest(new Test(":x!:y!:z <prop> [].")) ; -// addTest(new Test(":x^:y <prop> [].")) ; -// addTest(new Test(":x^:y^:z <prop> [].")) ; -// addTest(new Test("[] <prop> :x!:y^:z.")) ; -// addTest(new Test("[] :x^:y!:z [].")) ; - +// addTest(":x!:y <prop> []."); +// addTest(":x!:y!:z <prop> []."); +// addTest(":x^:y <prop> []."); +// addTest(":x^:y^:z <prop> []."); +// addTest("[] <prop> :x!:y^:z."); +// addTest("[] :x^:y!:z []."); + // Paths - using . (dot) -// addTest(new Test(":x.:y <prop> [].")) ; -// addTest(new Test(":x.:y.:z <prop> [].")) ; -// addTest(new Test("[] <prop> :a.:c.")) ; -// addTest(new Test("<thing>.:y <prop> [].")) ; -// addTest(new Test("x:x.<thing>.:y <prop> [].")) ; -// addTest(new Test("<thing>.:y^:z <prop> [].")) ; -// addTest(new Test(":y.<thing>.:z <prop> [].")) ; -// addTest(new Test("<thing> :px.:py.:pz [] . ")) ; -// addTest(new Test("<thing> :px!:py!:pz [] . ")) ; - +// addTest(":x.:y <prop> []."); +// addTest(":x.:y.:z <prop> []."); +// addTest("[] <prop> :a.:c."); +// addTest("<thing>.:y <prop> []."); +// addTest("x:x.<thing>.:y <prop> []."); +// addTest("<thing>.:y^:z <prop> []."); +// addTest(":y.<thing>.:z <prop> []."); +// addTest("<thing> :px.:py.:pz [] . "); +// addTest("<thing> :px!:py!:pz [] . "); + // Paths and formulae -// addTest(new Test("{ :a.:b.:c . }.")) ; -// addTest(new Test("{ :a.:b.<c>.}.")) ; - +// addTest("{ :a.:b.:c . 
}."); +// addTest("{ :a.:b.<c>.}."); + // Named things -// addTest(new Test("_:anon :- [a:p a:v] .")) ; -// addTest(new Test("<uri> :- [a:p [ a:p a:v] ] .")) ; -// // Named list: Not supported by cwm (as of 2001, 2002, 2003/09) but needed for printing shared -// addTest(new Test("_:anon :- (\"1\") .")) ; +// addTest("_:anon :- [a:p a:v] ."); +// addTest("<uri> :- [a:p [ a:p a:v] ] ."); +// // Named list: Not supported by cwm (as of 2001, 2002, 2003/09) but needed for printing shared +// addTest("_:anon :- (\"1\") ."); // // Named formulae: Not supported by cwm (as of 2001, 2002, 2003/09) -// addTest(new Test("_:anon :- { ?a ?b ?c } .")) ; - +// addTest("_:anon :- { ?a ?b ?c } ."); + // Datatypes - addTest(new Test("a:subj a:prop '123'^^xsd:integer .")) ; - addTest(new Test("a:subj a:prop '123'^^<uri> .")) ; - addTest(new Test("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral .")) ; - + addTest("a:subj a:prop '123'^^xsd:integer ."); + addTest("a:subj a:prop '123'^^<uri> ."); + addTest("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral ."); + // Numbers - addTest(new Test("a:subj a:prop 123 .")) ; - // addTest(new Test("a:subj a:prop 123.")) ; Illegal N3 - addTest(new Test("a:subj a:prop 123.1 .")) ; - addTest(new Test("a:subj a:prop -123.1 .")) ; - addTest(new Test("a:subj a:prop 123.1e3 .")) ; - addTest(new Test("a:subj a:prop 123.1e-3 .")) ; - addTest(new Test("a:subj a:prop 123.1E3 .")) ; - addTest(new Test("a:subj a:prop 123.1E-3 .")) ; + addTest("a:subj a:prop 123 ."); + // addTest("a:subj a:prop 123."); Illegal N3 + addTest("a:subj a:prop 123.1 ."); + addTest("a:subj a:prop -123.1 ."); + addTest("a:subj a:prop 123.1e3 ."); + addTest("a:subj a:prop 123.1e-3 ."); + addTest("a:subj a:prop 123.1E3 ."); + addTest("a:subj a:prop 123.1E-3 ."); // Language tags - addTest(new Test("a:subj a:prop 'text'@en .")) ; + addTest("a:subj a:prop 'text'@en ."); // Illegal in N-Triples - //addTest(new Test("a:subj a:prop 'text'^^a:lang@en .")) ; - //addTest(new Test("a:subj a:prop 
'text'@en^^a:lang .")) ; // Can't have both - + //addTest("a:subj a:prop 'text'^^a:lang@en ."); + //addTest("a:subj a:prop 'text'@en^^a:lang ."); // Can't have both + // XML Literal - addTest(new Test("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral .")) ; // Can't have both -// addTest(new Test("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral@fr .")) ; + addTest("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral ."); // Can't have both +// addTest("a:subj a:prop '<tag>text</tag>'^^rdf:XMLLiteral@fr ."); + + //addTest("a:subj a:prop ?x^^xsd:integer ."); // No varibales + //addTest("a:subj a:prop '123'^^?x ."); + //addTest("a:subj a:prop ?x^^?y ."); - //addTest(new Test("a:subj a:prop ?x^^xsd:integer .")) ; // No varibales - //addTest(new Test("a:subj a:prop '123'^^?x .")) ; - //addTest(new Test("a:subj a:prop ?x^^?y .")) ; - // Unicode 00E9 is e-acute // Unicode 03B1 is alpha - addTest(new Test("a:subj a:prop '\u00E9'.")) ; - addTest(new Test("a:subj a:prop '\u003B1'.")) ; - - addTest(new Test("\u00E9:subj a:prop '\u00E9'.")) ; - addTest(new Test("a:subj-\u00E9 a:prop '\u00E9'.")) ; - - addTest(new Test("\u03B1:subj a:prop '\u03B1'.")) ; - addTest(new Test("a:subj-\u03B1 a:prop '\u03B1'.")) ; + addTest("a:subj a:prop '\u00E9'."); + addTest("a:subj a:prop '\u003B1'."); + + addTest("\u00E9:subj a:prop '\u00E9'."); + addTest("a:subj-\u00E9 a:prop '\u00E9'."); + + addTest("\u03B1:subj a:prop '\u03B1'."); + addTest("a:subj-\u03B1 a:prop '\u03B1'."); } - + + void addTest(String string) { addTest(new Test(string)); } + static class Test extends TestCase { String testString ; - - Test(String s) { super(TestUtils.safeName(s)) ; testString = s ; } - + + Test(String s) { super(TestUtils.safeName(s)); testString = s ; } + @Override protected void runTest() throws Throwable { - TurtleParser parser = new TurtleParser(new StringReader(testString)) ; - parser.setEventHandler(new TurtleEventNull()) ; + TurtleParser parser = new TurtleParser(new StringReader(testString)); + 
parser.setEventHandler(new TurtleEventNull()); parser.getPrefixMapping().setNsPrefix("a", "http://host/a#") ; parser.getPrefixMapping().setNsPrefix("x", "http://host/a#") ; // Unicode 00E9 is e-acute diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java index 541e2c504b..5c1428b67f 100644 --- a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java @@ -47,6 +47,21 @@ import java.util.Locale; */ public sealed interface LangTag permits LangTagJDK, LangTagRFC5646, LangTagRE { + /** + * Create a {@link LangTag} from a string + * that meets the + * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1">syntax of RFC 5646</a>. + * + * @throws LangTagException if the string is syntactically invalid. + */ + public static LangTag of(String string) { + LangTag langTag = SysLangTag.create(string); + // Implementations should not return null but just in case ... + if ( langTag == null ) + throw new LangTagException("Bad syntax"); + return langTag; + } + /** * Formatted according to the RFC 5646 rules. * <p> diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java index f75659831c..783df7e5ea 100644 --- a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java @@ -87,7 +87,17 @@ public final class LangTagRFC5646 implements LangTag{ @Override public String getLanguage() { - return getSubTag("Language", langTagString, language0, language1, CaseRule.LOWER); + String x = getSubTag("Language", langTagString, language0, language1, CaseRule.LOWER); + if ( ! isGrandfathered ) + return x; + // The general getSubTag code will get these wrong.
+ // "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE" + return switch(x) { + case "sgn-be-fr"->"sgn-BE-FR"; + case "sgn-be-nl"->"sgn-BE-NL"; + case "sgn-ch-de"->"sgn-CH-DE"; + default -> x; + }; } @Override @@ -163,7 +173,23 @@ public final class LangTagRFC5646 implements LangTag{ public String str() { if ( isPrivateUseLanguage ) return InternalLangTag.lowercase(langTagString); + String x = irregularFormat(langTagString); + if ( x != null ) + return x; + // Format by parts + // Works for en-GB-oed - the variant is not syntax compatible but the variant formatting rules applies. + StringBuffer sb = new StringBuffer(); + add(sb, getLanguage()); + add(sb, getScript()); + add(sb, getRegion()); + add(sb, getVariant()); + add(sb, getExtension()); + add(sb, getPrivateUse()); + return sb.toString(); + } + /** Return a string if there is special formatting for this language tag, else return null */ + private static String irregularFormat(String langTagString) { // Some irregular special cases. if ( InternalLangTag.caseInsensitivePrefix(langTagString, "sgn-") ) { // "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE" @@ -174,21 +200,12 @@ public final class LangTagRFC5646 implements LangTag{ if ( langTagString.equalsIgnoreCase("sgn-CH-DE") ) return "sgn-CH-DE"; } - if ( langTagString.startsWith("i-") || langTagString.startsWith("I-") ) { String lcLangTagStr = InternalLangTag.lowercase(langTagString); if ( irregular_i.contains(lcLangTagStr) ) return lcLangTagStr; } - - StringBuffer sb = new StringBuffer(); - add(sb, getLanguage()); - add(sb, getScript()); - add(sb, getRegion()); - add(sb, getVariant()); - add(sb, getExtension()); - add(sb, getPrivateUse()); - return sb.toString(); + return null; } private void add(StringBuffer sb, String subtag) { @@ -556,7 +573,7 @@ public final class LangTagRFC5646 implements LangTag{ char ch = string.charAt(x); if ( ch != '-' ) break; - int x1 = maybeSubtag1(string, N, x+1, min, max); + int x1 = maybeOneSubtag(string, N, x+1, min, max); if ( x1 <= 0 ) break; if ( 
x1 == N ) { @@ -572,7 +589,7 @@ public final class LangTagRFC5646 implements LangTag{ * Peek for a segment between min and max in length. * The initial "-" has been read. */ - private static int maybeSubtag1(String string, int N, int idxStart, int min, int max) { + private static int maybeOneSubtag(String string, int N, int idxStart, int min, int max) { int idx = idxStart; if ( idx >= N ) return -1; diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java index 2738298949..dd2222453c 100644 --- a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java @@ -27,80 +27,50 @@ import java.util.Objects; public class LangTags { - /** Index of the language part */ - public static final int idxLanguage = 0 ; - /** Index of the script part */ - public static final int idxScript = 1 ; - /** Index of the region part */ - public static final int idxRegion = 2 ; - /** Index of the variant part */ - public static final int idxVariant = 3 ; - /** Index of all extensions */ - public static final int idxExtension = 4 ; - - private static final int partsLength = 5 ; - - /** @deprecated Compatibility operation (the behaviour of Jena 5.3.0 and earlier). To be removed. */ - @Deprecated(forRemoval = true) - public static String[] parse(String languageTag) { - try { - LangTag langTag = SysLangTag.create(languageTag); - if (langTag == null ) - return null; - String result[] = new String[partsLength]; - - result[idxLanguage] = langTag.getLanguage(); - result[idxScript] = langTag.getScript(); - result[idxRegion] = langTag.getRegion(); - result[idxVariant] = langTag.getVariant(); - // Legacy compatible. 
- if ( langTag.getPrivateUse() == null ) - result[idxExtension] = langTag.getExtension(); - else if ( langTag.getExtension() == null ) - result[idxExtension] = langTag.getPrivateUse(); - else - result[idxExtension] = langTag.getExtension()+"-"+langTag.getPrivateUse(); - return result; - } catch (LangTagException ex) { - return null; - } - } - /** * Create a {@link LangTag} from a string * that meets the * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1">syntax of RFC 5646</a>. - * <p> - * Throws {@link LangTagException} on bad syntax. + * @throws LangTagException if the string is syntacticly invalid. */ - public static LangTag of(String string) { - LangTag langTag = SysLangTag.create(string); - // Implements should not return null but just in case ... - if ( langTag == null ) - throw new LangTagException("Bad syntax"); - return langTag; - } - - /** Same as {@link #of(String)} */ public static LangTag create(String string) { - return of(string); + return LangTag.of(string); } - public static String canonical(String string) { - LangTag langTag = of(string); + /** + * Return the language tag in canonical form (RFC 5646 case rules). + * + * @throws LangTagException if the string is syntacticly invalid. + */ + public static String format(String string) { + LangTag langTag = LangTag.of(string); return langTag.str(); } - /** Check a string is valid as a language tag. */ + /** + * Check a string is valid as a language tag. + * This function returns true or false and does not throw an exception. + */ public static boolean check(String languageTag) { try { - LangTag langTag = SysLangTag.create(languageTag); - return (langTag != null ); + requireValid(languageTag); + return true; } catch (LangTagException ex) { return false; } } + /** + * Check a string is valid as a language tag. + * Throw a {@link LangTagException} if it is not valid. 
+ */ + public static void requireValid(String languageTag) { + // Be robust/general + LangTag langTag = SysLangTag.create(languageTag); + if ( langTag == null ) + throw new LangTagException("Invalid lang tag"); + } + /** * Basic formatter following * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1">RFC 5646 section 2.1.1</a> @@ -143,12 +113,13 @@ public class LangTags { return sb.toString(); } - /** Is @code{langTag1} the same as @code{langTag2}? */ + /** Is @code{langTag1} the same language tag as @code{langTag2}? */ public static boolean sameLangTagAs(LangTag langTag1, LangTag langTag2) { Objects.requireNonNull(langTag1); Objects.requireNonNull(langTag2); if ( langTag1 == langTag2 ) return true; + // get* case normalizes. if ( ! Objects.equals(langTag1.getLanguage(),langTag2.getLanguage()) ) return false; if ( ! Objects.equals(langTag1.getScript(),langTag2.getScript()) ) @@ -188,21 +159,26 @@ public class LangTags { * Passing this test does not guarantee the string is valid language tag. Use * {@link LangTags#check(String)} for validity checking. * - * @throws LangTagException + * @throws LangTagException on invalid string. */ public static boolean basicCheckEx(String string) { boolean start = true; int lastSegmentStart = 0; - + boolean firstSegment = true; for ( int idx = 0; idx < string.length(); idx++ ) { char ch = string.charAt(idx); - if ( InternalLangTag.isA2ZN(ch) ) + if ( InternalLangTag.isA2ZN(ch) ) { + if ( firstSegment && InternalLangTag.isNum(ch) ) { + error("'%s': Number in first subtag", string); + } continue; + } if ( ch == '-' ) { if ( idx == 0 ) { error("'%s': starts with a '-' character", string); return false; } + firstSegment = false; if ( idx == lastSegmentStart ) { error("'%s': two dashes", string); return false; @@ -226,6 +202,7 @@ public class LangTags { * Split a language tag based on dash separators * <p> * The string should be a legal language tag, at least by the general SPARQL/Turtle(etc) grammar rule. 
+ * {@code [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*} * @returns null on bad input syntax * * @see LangTags#check @@ -243,6 +220,7 @@ public class LangTags { * Split a language tag into subtags. * <p> * The string should be a legal language tag, at least by the general SPARQL/Turtle(etc) grammar rule. + * {@code [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*} * @throw {@link LangTagException} * * @see LangTags#check @@ -253,10 +231,16 @@ public class LangTags { // Split efficiently based on [a-z][A-Z][0-9] units separated by "-", with meaning error messages. StringBuilder sb = new StringBuilder(); + boolean firstSegment = true; + boolean start = true; for ( int idx = 0; idx < string.length(); idx++ ) { char ch = string.charAt(idx); if ( InternalLangTag.isA2ZN(ch) ) { + if ( firstSegment && InternalLangTag.isNum(ch) ) { + error("'%s': Number in first subtag", string); + return null; + } sb.append(ch); continue; } @@ -265,6 +249,7 @@ public class LangTags { error("'%s': starts with a '-' character", string); return null; } + firstSegment = false; String str = sb.toString(); if ( str.isEmpty() ) { error("'%s': two dashes", string); diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java index 8e8835ac90..91fa9584d4 100644 --- a/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java @@ -26,7 +26,6 @@ public class SysLangTag { /** * Create a {@link LangTag} using the system-wide default language tag parser, * which is {@link LangTagRFC5646}. 
- * */ public static LangTag create(String languageTag) { return LangTagRFC5646.create(languageTag); diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java index e746aab548..cce8185206 100644 --- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java +++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java @@ -21,8 +21,6 @@ package org.apache.jena.langtag; import static org.apache.jena.langtag.LangTags.*; import static org.junit.jupiter.api.Assertions.*; - - import java.util.Arrays; import java.util.List; @@ -36,9 +34,11 @@ public class TestBasicSyntaxLangTags { @Test public void basic_02() { basicSplitCheck("en-GB", "en", "GB"); } @Test public void basic_03() { basicSplitCheck("en-gb", "en", "gb"); } @Test public void basic_04() { basicSplitCheck("en", "en"); } + @Test public void basic_05() { basicSplitCheck("en-123", "en","123"); } + @Test public void basic_06() { basicSplitCheck("e", "e"); } // Showing the split does not allocate subtags to their category. e.g. "x-private" is split. 
- @Test public void basic_05() { basicSplitCheck("en-Latn-GB-boont-r-extended-sequence-x-private", + @Test public void basic_10() { basicSplitCheck("en-Latn-GB-boont-r-extended-sequence-x-private", "en","Latn", "GB", "boont", "r", "extended", "sequence", "x", "private"); } @Test public void basic_bad_01() { basicSplitCheckBad(""); } @@ -46,6 +46,8 @@ public class TestBasicSyntaxLangTags { @Test public void basic_bad_03() { basicSplitCheckBad("--"); } @Test public void basic_bad_04() { basicSplitCheckBad("abc-xy%20"); } @Test public void basic_bad_05() { basicSplitCheckBad("abc def"); } + @Test public void basic_bad_06() { basicSplitCheckBad("a12-def"); } + @Test public void basic_bad_07() { basicSplitCheckBad("9-def"); } static void basicSplitCheck(String input, String...parts) { basicSplitTest(input, parts); diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java index 289406bcc3..c71ae4d605 100644 --- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java +++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java @@ -49,7 +49,6 @@ public class TestLangTag { @Test public void test_lang_basic_22() { testPrivateUse("az-Latn-x-latn", "az-Latn-x-latn", "az", "Latn", null, null, null, "x-latn"); } @Test public void test_lang_basic_23() { testPrivateUse("sss-x-y", "sss-x-y", "sss", null, null, null, null, "x-y"); } - @Test public void test_lang_bad_01() { testBad("123"); } @Test public void test_lang_bad_02() { testBad("abcdefghijklmn"); } @Test public void test_lang_bad_03() { testBad("abcdefghijklmn-123"); } @@ -94,6 +93,14 @@ public class TestLangTag { @Test public void test_langtag_special_03() { testFormatting("EN-gb-OED", "en-GB-oed"); } @Test public void test_langtag_special_04() { testNotJDK("EN-gb-OED", "en-GB-oed", "en", null, "GB", "oed", null, null); } + // Only LangTagRFC5646 (the JDK replaces the language name) + @Test public void 
test_langtag_special_11() { test1_RFC5646("sgn-BE-FR", "sgn-BE-FR", "sgn-BE-FR", null, null, null, null, null); } + @Test public void test_langtag_special_12() { test1_RFC5646("sgn-BE-NL", "sgn-BE-NL", "sgn-BE-NL", null, null, null, null, null); } + @Test public void test_langtag_special_13() { test1_RFC5646("sgn-CH-DE", "sgn-CH-DE", "sgn-CH-DE", null, null, null, null, null); } + + // Does not exist + @Test public void test_langtag_special_14() { testBad("sgn-GB-SW"); } + // The examples from RFC 5646 @Test public void test_lang_10() { testRFC5646("de", "de", "de", null, null, null, null); } @Test public void test_lang_11() { testRFC5646("fr", "fr", "fr", null, null, null, null); } @@ -148,7 +155,6 @@ public class TestLangTag { @Test public void test_lang_61() { testPrivateUse("en-Latn-GB-boont-r-extended-sequence-s-another-x-private", "en-Latn-GB-boont-r-extended-sequence-s-another-x-private", "en","Latn", "GB", "boont", "r-extended-sequence-s-another", "x-private"); } - /** General test - include JDK */ private static void testRFC5646(String langString, String formatted, String lang, String script, String region, String variant, String extension) { runTest(langString, formatted, lang, script, region, variant, extension, null, true); @@ -156,11 +162,10 @@ public class TestLangTag { /** Has a private use part */ private static void testPrivateUse(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateUse) { - // Private use is supported by LanTagJDK by extracting the "x" extension + // Private use is supported by LangTagJDK by extracting the "x" extension runTest(langString, formatted, lang, script, region, variant, extension, privateUse, true); } - /** Run a test which is not properly supported by the JDK-Locale based implementation. 
*/ private static void testNotJDK(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateUse) { runTest(langString, formatted, lang, script, region, variant, extension, privateUse, false); @@ -182,9 +187,9 @@ public class TestLangTag { String lang, String script, String region, String variant, String extension, String privateuse, boolean jdkSupported) { // Run the test with varied case of the input string. - test1(langString, formatted, lang, script, region, variant, extension, privateuse); - test1(langString.toLowerCase(), formatted, lang, script, region, variant, extension, privateuse); - test1(langString.toUpperCase(), formatted, lang, script, region, variant, extension, privateuse); + test1_RFC5646(langString, formatted, lang, script, region, variant, extension, privateuse); + test1_RFC5646(langString.toLowerCase(), formatted, lang, script, region, variant, extension, privateuse); + test1_RFC5646(langString.toUpperCase(), formatted, lang, script, region, variant, extension, privateuse); // Formatting. testFormatting(langString, formatted); @@ -213,7 +218,7 @@ public class TestLangTag { } // Test execution for LangTagRFC5646 on one exact input string. 
- private static void test1(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateuse) { + private static void test1_RFC5646(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateuse) { LangTag langTag = LangTagRFC5646.create(langString); assertNotNull(langTag); assertEquals(lang, langTag.getLanguage(), "Lang"); diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java index db7c4b0377..c4c33c96bf 100644 --- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java +++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java @@ -18,7 +18,6 @@ package org.apache.jena.langtag; - import java.util.ArrayList; import java.util.List; import java.util.function.Function; diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java index 9db126f90f..cc0ef95c9a 100644 --- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java +++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java @@ -27,22 +27,22 @@ import org.junit.jupiter.api.Test; public class TestLangTagsOps { @Test public void sameLangTag_01() { - LangTag langTag1 = LangTags.of("en-GB"); - LangTag langTag2 = LangTags.of("en-GB"); + LangTag langTag1 = LangTag.of("en-GB"); + LangTag langTag2 = LangTag.of("en-GB"); sameLangTag(langTag1, langTag2, true, true, true); } @Test public void sameLangTag_02() { - LangTag langTag1 = LangTags.of("en-GB"); - LangTag langTag2 = LangTags.of("en-gb"); + LangTag langTag1 = LangTag.of("en-GB"); + LangTag langTag2 = LangTag.of("en-gb"); sameLangTag(langTag1, langTag2, true, false, false); } @Test public void sameLangTag_03() { - LangTag langTag1 = 
LangTags.of("en-GB-Latn"); - LangTag langTag2 = LangTags.of("en-gb"); + LangTag langTag1 = LangTag.of("en-GB-Latn"); + LangTag langTag2 = LangTag.of("en-gb"); sameLangTag(langTag1, langTag2, false, false, false); } diff --git a/jena-tdb1/src/main/java/org/apache/jena/tdb1/store/nodetable/NodecSSE.java b/jena-tdb1/src/main/java/org/apache/jena/tdb1/store/nodetable/NodecSSE.java index 3b8c2480cb..c8fb7ae2db 100644 --- a/jena-tdb1/src/main/java/org/apache/jena/tdb1/store/nodetable/NodecSSE.java +++ b/jena-tdb1/src/main/java/org/apache/jena/tdb1/store/nodetable/NodecSSE.java @@ -26,6 +26,7 @@ import org.apache.jena.atlas.logging.FmtLog; import org.apache.jena.graph.Node; import org.apache.jena.graph.NodeFactory; import org.apache.jena.graph.Triple; +import org.apache.jena.langtagx.LangTagX; import org.apache.jena.riot.RiotException; import org.apache.jena.riot.out.NodeFmtLib; import org.apache.jena.riot.system.ErrorHandler; @@ -35,7 +36,6 @@ import org.apache.jena.riot.system.PrefixMapZero; import org.apache.jena.riot.tokens.Token; import org.apache.jena.riot.tokens.Tokenizer; import org.apache.jena.riot.tokens.TokenizerText; -import org.apache.jena.riot.web.LangTag; import org.apache.jena.shared.PrefixMapping; import org.apache.jena.sparql.util.NodeUtils; import org.apache.jena.tdb1.TDB1; @@ -74,7 +74,8 @@ public class NodecSSE implements Nodec } else if ( node.isLiteral() && NodeUtils.isLangString(node) ) { // Check syntactically valid. String lang = node.getLiteralLanguage(); - if ( ! LangTag.check(lang) ) + // Weak, defensive check. + if ( ! LangTagX.checkLanguageTagBasicSyntax(lang) ) throw new TDB1Exception("bad language tag: "+node); } else if ( node.isBlank() && ! onlySafeBNodeLabels ) { // Special case.
