This is an automated email from the ASF dual-hosted git repository. andy pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/jena.git
commit 73bfa864cd2973dd2c280df44892c0d68baab43e Author: Andy Seaborne <[email protected]> AuthorDate: Tue Mar 25 18:39:09 2025 +0000 GH-3086: Module jena-langtag --- jena-langtag/pom.xml | 95 +++ jena-langtag/src/main/java/module-info.java | 22 + .../org/apache/jena/langtag/InternalLangTag.java | 179 +++++ .../java/org/apache/jena/langtag/LangExamples.java | 79 +++ .../main/java/org/apache/jena/langtag/LangTag.java | 67 ++ .../org/apache/jena/langtag/LangTagException.java | 23 + .../java/org/apache/jena/langtag/LangTagJDK.java | 154 ++++ .../java/org/apache/jena/langtag/LangTagRE.java | 393 +++++++++++ .../org/apache/jena/langtag/LangTagRFC5646.java | 781 +++++++++++++++++++++ .../java/org/apache/jena/langtag/LangTags.java | 289 ++++++++ .../java/org/apache/jena/langtag/SysLangTag.java | 46 ++ .../org/apache/jena/langtag/cmd/CmdLangTag.java | 76 ++ .../java/org/apache/jena/langtag/TS_LangTag.java | 32 + .../jena/langtag/TestBasicSyntaxLangTags.java | 76 ++ .../java/org/apache/jena/langtag/TestLangTag.java | 240 +++++++ .../org/apache/jena/langtag/TestLangTagFormat.java | 143 ++++ .../org/apache/jena/langtag/TestLangTagsOps.java | 62 ++ 17 files changed, 2757 insertions(+) diff --git a/jena-langtag/pom.xml b/jena-langtag/pom.xml new file mode 100644 index 0000000000..4016701806 --- /dev/null +++ b/jena-langtag/pom.xml @@ -0,0 +1,95 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ + See the NOTICE file distributed with this work for additional + information regarding copyright ownership. +--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <artifactId>jena-langtag</artifactId> + <name>Apache Jena - Language tags</name> + + <parent> + <groupId>org.apache.jena</groupId> + <artifactId>jena</artifactId> + <version>5.4.0-SNAPSHOT</version> + </parent> + + <description>Implementation of RFC 5646 (BCP-47) Language tags</description> + + <properties> + <automatic.module.name>org.apache.jena.langtag</automatic.module.name> + </properties> + + <dependencies> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-api</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.junit.platform</groupId> + <artifactId>junit-platform-suite-engine</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-params</artifactId> + </dependency> + + <!-- + Needed for @Parameterized test suite + JUnit5 will eventually have @ParameterizedClass. 
+ --> + <dependency> + <groupId>org.junit.vintage</groupId> + <artifactId>junit-vintage-engine</artifactId> + <scope>test</scope> + </dependency> + + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-resources-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-source-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-javadoc-plugin</artifactId> + </plugin> + </plugins> + + </build> + +</project> diff --git a/jena-langtag/src/main/java/module-info.java b/jena-langtag/src/main/java/module-info.java new file mode 100644 index 0000000000..326731548b --- /dev/null +++ b/jena-langtag/src/main/java/module-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +module org.apache.jena.langtag { + exports org.apache.jena.langtag; + exports org.apache.jena.langtag.cmd; +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java new file mode 100644 index 0000000000..93281be82a --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.langtag; + +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; + +/** + * LangTag processing support. + */ +class InternalLangTag { + /** Split a tag into its subtags on '-'. Returns null when the last subtag is empty (this includes the empty string); empty leading or interior subtags are kept as "". Throws LangTagException on any character other than a-z, A-Z, 0-9 or '-'. */ + static List<String> splitOnDash(String x) { + List<String> strings = new ArrayList<>(6); + // Split efficiently(?) 
based on [a-z][A-Z][0-9] units separated by "-"s + StringBuilder sb = new StringBuilder(); + + // A '-' ends the current subtag, even when that subtag is empty. + for ( int idx = 0; idx < x.length(); idx++ ) { + char ch = x.charAt(idx); + if ( isA2ZN(ch) ) { + sb.append(ch); + continue; + } + if ( ch == '-' ) { + String str = sb.toString(); + strings.add(str); + sb.setLength(0); + continue; + } + error("Bad character: (0x%02X) '%c' index %d", (int)ch, ch, idx); // '%c' needs the char itself, not the String from str(ch) + } + String strLast = sb.toString(); + if ( strLast.isEmpty() ) { + return null; + //throw new LangTagException("Empty part: "+x); + } + strings.add(strLast); + return strings; + } + /** Case by subtag length: 2 characters gives upper case, 4 gives title case, any other length gives lower case; null gives null. */ + /*package*/ static String strcase(String string) { + if ( string == null ) + return null; + if ( string.length() == 2 ) + return uppercase(string); + if ( string.length() == 4 ) + return titlecase(string); + return lowercase(string); + } + /** Locale-independent lower case; null gives null. */ + /*package*/static String lowercase(String string) { + if ( string == null ) + return null; + return string.toLowerCase(Locale.ROOT); + } + /** Locale-independent upper case; null gives null. */ + /*package*/static String uppercase(String string) { + if ( string == null ) + return null; + return string.toUpperCase(Locale.ROOT); + } + /** First character upper case, the rest lower case; null and "" are returned unchanged. */ + /*package*/static String titlecase(String string) { + if ( string == null || string.isEmpty() ) + return string; + char ch1 = string.charAt(0); + ch1 = Character.toUpperCase(ch1); + string = lowercase(string.substring(1)); + return ch1 + string; + } + + /** ASCII letter: a-z or A-Z */ + /*package*/ static boolean isA2Z(int ch) { + return range(ch, 'a', 'z') || range(ch, 'A', 'Z'); + } + + /** ASCII letter or digit: a-z, A-Z or 0-9 */ + /*package*/ static boolean isA2ZN(int ch) { + return range(ch, 'a', 'z') || range(ch, 'A', 'Z') || range(ch, '0', '9'); + } + /** The check* methods throw LangTagException at the first character of string in [start,end) outside the named class; the parameter N is not used. */ + static void checkDigits(String string, int N, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! 
isNum(ch) ) + error("Not a DIGIT (%s, posn = %s) in '%s'", str(ch), (i+1), string); + } + } + + static void checkAlpha(String string, int N, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! isAlpha(ch) ) + // 1-based error message + error("Not an ALPHA (%s, posn = %s) in '%s'", str(ch), (i+1), string); + } + } + + static boolean isAlpha(String string, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! isAlpha(ch) ) + return false; + } + return true; + } + + static void checkAlphaMinus(String string, int N, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! isAlpha(ch) && ! isMinus(ch) ) + error("Not an ALPHA or MINUS (%s, posn = %s) in '%s'", str(ch), (i+1), string); + } + } + + static void checkAlphaNum(String string, int N, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! isAlpha(ch) && ! isNum(ch) ) + error("Not an ALPHA or DIGITS (%s, posn = %s) in '%s'", str(ch), (i+1), string); + } + } + + static void checkAlphaNumMinus(String string, int N, int start, int end) { + for ( int i = start ; i < end ; i++ ) { + char ch = string.charAt(i); + if ( ! isAlpha(ch) && ! isNum(ch) && ! 
isMinus(ch) ) + error("Not an ALPHA, DIGITS or MINUS (%s, posn = %s) in '%s'", str(ch), (i+1), string); + } + } + /** A character for error messages: the character in single quotes followed by U+ and its four-digit hex value. */ + /*package*/ static String str(char ch) { + return String.format("'%s' U+%04X", Character.valueOf(ch), (int)ch); + } + + static boolean isAlpha(char ch) { + return ( ch >= 'a' && ch <= 'z' ) || ( ch >= 'A' && ch <= 'Z' ); + } + + static boolean isNum(char ch) { + return ( ch >= '0' && ch <= '9' ); + } + + static boolean isMinus(char ch) { + return ( ch == '-' ); + } + /** Format the message with String.format and throw it as a LangTagException; never returns normally. */ + /*package*/ static void error(String msg, Object...args) { + String x = String.format(msg, args); + throw new LangTagException(x); + } + + private static boolean range(int ch, char a, char b) { + return (ch >= a && ch <= b); + } + + /** Case insensitive test of whether a string has a prefix. */ + static boolean caseInsensitivePrefix(String string, String prefix) { + return string.regionMatches(true, 0, prefix, 0, prefix.length()); + } +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java new file mode 100644 index 0000000000..f1fb4d12a5 --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.langtag; +/** Example language tags from RFC 5646: a list of well-formed tags and a list of tags the RFC gives as invalid. */ +public class LangExamples { + + // Examples from RFC 5646 + static String[] examples5646 = { + "de", + "fr", + "ja", + "i-enochian", // (example of a grandfathered tag) + "zh-Hant", // (Chinese written using the Traditional Chinese script) + "zh-Hans", // (Chinese written using the Simplified Chinese script) + "sr-Cyrl", // (Serbian written using the Cyrillic script) + "sr-Latn", // (Serbian written using the Latin script) + + //Extended language subtags and their primary language subtag counterparts: + "zh-cmn-Hans-CN", // (Chinese, Mandarin, Simplified script, as used in China) + "cmn-Hans-CN", // (Mandarin Chinese, Simplified script, as used in China) + "zh-yue-HK", // (Chinese, Cantonese, as used in Hong Kong SAR) + "yue-HK", // (Cantonese, Chinese, as used in Hong Kong SAR) + //Language-Script-Region: + "zh-Hans-CN", // (Chinese written using the Simplified script as used in mainland China) + "sr-Latn-RS", // (Serbian written using the Latin script as used in Serbia) + //Language-Variant: + "sl-rozaj", // (Resian dialect of Slovenian) + "sl-rozaj-biske", // (San Giorgio dialect of Resian dialect of Slovenian) + "sl-nedis", // (Nadiza dialect of Slovenian) + //Language-Region-Variant: + "de-CH-1901", // (German as used in Switzerland using the 1901 variant [orthography]) + "sl-IT-nedis", // (Slovenian as used in Italy, Nadiza dialect) + //Language-Script-Region-Variant: + "hy-Latn-IT-arevela", // (Eastern Armenian written in Latin script, as used in Italy) + //Language-Region: + "de-DE", // (German for Germany) + "en-US", // (English as used in the United States) + "es-419", // (Spanish appropriate for the Latin America and Caribbean region using the UN region code) + //Private use subtags: + "de-CH-x-phonebk", + "az-Arab-x-AZE-derbend", + //Private use registry values: + "x-whatever", // (private use using the 
singleton 'x') + "qaa-Qaaa-QM-x-southern", // (all private tags) + "de-Qaaa", // (German, with a private script) + "sr-Latn-QM", // (Serbian, Latin script, private region) + "sr-Qaaa-RS", // (Serbian, private script, for Serbia) + //Tags that use extensions + // (examples ONLY -- extensions MUST be defined by revision or update to this document, or by RFC): + "en-US-u-islamcal", + "zh-CN-a-myext-x-private", + "en-a-myext-b-another" + }; + + static String[] examples5646_bad = { + //Some Invalid Tags: + "de-419-DE", // (two region tags) + "a-DE" // (use of a single-character subtag in primary position; note + // that there are a few grandfathered tags that start with "i-" that + // are valid) + //"ar-a-aaa-b-bbb-a-ccc" // (two extensions with same single-letter prefix) + }; +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java new file mode 100644 index 0000000000..541e2c504b --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.langtag; + +import java.util.Locale; + +/** + * A language tag as a tuple of 6 strings (language, script, region, + * variant, extension, private use). + * <p> + * See {@link LangTagRFC5646} for generating {@code LangTag}s. Note this returns the old ISO code. + * See the javadoc of {@link Locale#getLanguage()}. + * <p> + * {@link LangTagJDK} is an alternative version which uses the Java locale + * built-in functionality and does not canonicalize language names (replace one name by another). + * The JDK Locale is not fully RFC 5646 compliant. + * <p> + * Language tags are BCP 47. + * <p> + * RFCs: + * <ul> + * <li><a href="https://www.rfc-editor.org/rfc/rfc5646">RFC 5646</a> "Tags for Identifying Languages" + * <li><a href="https://www.rfc-editor.org/rfc/rfc4646">RFC 4646</a> "Tags for Identifying Languages" + * <li><a href="https://www.rfc-editor.org/rfc/rfc3066">RFC 3066</a> "Tags for the Identification of Languages" + * </ul> + * Related: + * <ul> + * <li><a href="https://www.rfc-editor.org/rfc/rfc4647">RFC 4647</a> "Matching of Language Tags" + * <li><a href="https://www.rfc-editor.org/rfc/rfc4234">RFC 4234</a> "Augmented BNF for Syntax Specifications: ABNF" + * </ul> + */ +public sealed interface LangTag permits LangTagJDK, LangTagRFC5646, LangTagRE { + + /** + * Formatted according to the RFC 5646 rules. + * <p> + * {@code toString()} should return the language tag with the same case as it was originally. 
+ */ + public String str(); + + public String getLanguage(); + public String getScript(); + public String getRegion(); + public String getVariant(); + public String getExtension(); + public String getPrivateUse(); + + @Override public int hashCode(); + @Override public boolean equals(Object other); + @Override public String toString(); +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java new file mode 100644 index 0000000000..c68073b934 --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.langtag; +/** Unchecked exception for language tag errors; InternalLangTag.error throws it with a formatted message. */ +public class LangTagException extends RuntimeException { + public LangTagException(String msg) { super(msg); } +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java new file mode 100644 index 0000000000..60ce5e910f --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java @@ -0,0 +1,154 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * See the NOTICE file distributed with this work for additional + * information regarding copyright ownership. + */ + +package org.apache.jena.langtag; + +import java.util.IllformedLocaleException; +import java.util.Locale; +import java.util.Locale.Builder; +import java.util.Objects; +import java.util.Set; + +/** + * LangTag parsing. + * <p> + * A layer over the JDK {@link Locale} and {@link Builder} to introduce a class without legacy langtag conversion. + * {@link LangTag}. + * <p> + * This is not RFC 5646 compliant. + * <ul> + * <li>Does not handle extended language subtags (e.g. "zh-cmn-Hans-CN")</li> + * <li>Does not handle grandfathered language tags e.g. 
"i-enochian"</li> + * <li>Multiple variant subtags</li> + * <li>Legacy "en-GB-oed" - "oed" is a 3 letter script (script is 4 by the grammar)</li> + * </ul> + */ +public final class LangTagJDK implements LangTag { + private final String langTagAsGiven; + private final String fmtString; + private final String lang; + private final String script; + private final String region; + private final String variant; + private final String extension; + // Not supported by the JDK (part of extensions). + private final String privateUse; + + private static Locale.Builder locBuild = new Locale.Builder(); /* NOTE(review): one mutable builder shared by every create()/canonical() call; looks unsafe under concurrent use - confirm how callers use it. */ + /** Parse with the JDK {@link Locale.Builder}; returns null when the builder rejects the string as ill-formed. */ + public static LangTag create(String string) { + try { + locBuild.clear(); + locBuild.setLanguageTag(string); + return asLangTag(string, locBuild); + } catch (IllformedLocaleException ex) { + return null; + } + } + + private LangTagJDK(String langTagAsGiven, String fmtString, String language, String script, String region, String variant, String extension, String privateUse) { + this.langTagAsGiven = langTagAsGiven; + this.fmtString = Objects.requireNonNull(fmtString); + this.lang = maybe(language); + this.script = maybe(script); + this.region = maybe(region); + this.variant = maybe(variant); + this.extension = maybe(extension); + this.privateUse = maybe(privateUse); + } + /** Normalise "absent": both null and "" become null. */ + private static String maybe(String x) { + // Choice. + if ( x == null ) + return null; + if ( x.isEmpty() ) + return null; + return x; + } + + @Override public String str() { return fmtString; } + + @Override public String getLanguage() { return lang; } + @Override public String getScript() { return script; } + @Override public String getRegion() { return region; } + @Override public String getVariant() { return variant; } + @Override public String getExtension() { return extension; } + @Override public String getPrivateUse() { return privateUse; } + /** The tag as formatted by {@link Locale#toLanguageTag()}, or the input unchanged when the JDK rejects it as ill-formed. */ + public static String canonical(String str) { + try { + // Does not do conversion of language for ISO 639 codes that have changed. 
+ return locBuild.setLanguageTag(str).build().toLanguageTag(); + } catch (IllformedLocaleException ex) { + return str; + } + } + + @Override + public int hashCode() { + return Objects.hash(langTagAsGiven, fmtString, + lang, script, region, variant, + extension, privateUse); + } + + @Override + public boolean equals(Object obj) { + if ( this == obj ) + return true; + if ( !(obj instanceof LangTagJDK) ) + return false; + LangTagJDK other = (LangTagJDK)obj; + return Objects.equals(lang, other.lang) + && Objects.equals(script, other.script) + && Objects.equals(region, other.region) + && Objects.equals(variant, other.variant) + && Objects.equals(extension, other.extension) + && Objects.equals(privateUse, other.privateUse) + && Objects.equals(langTagAsGiven, other.langTagAsGiven) + && Objects.equals(fmtString, other.fmtString); + } + + private static Character privateUseSingleton = Character.valueOf('x'); + /** Build the LangTag from the builder's Locale. The text of extension key 'x' goes to privateUse and every other extension key to extension, each written as key, '-', value and joined by '-'. */ + private static LangTag asLangTag(String string, Locale.Builder locBuild) { + Locale locale = locBuild.build(); + Set<Character> extkeys = locale.getExtensionKeys(); + StringBuilder sb1 = new StringBuilder(); + StringBuilder sb2 = new StringBuilder(); + for ( Character k : extkeys ) { + String ext = locale.getExtension(k); + StringBuilder sb = sb1; + if ( privateUseSingleton.equals(k) ) + sb = sb2; + if ( sb.length() != 0 ) + sb.append('-'); + sb.append(k); + sb.append('-'); + sb.append(ext); + } + String extension = sb1.toString(); + String privateUse = sb2.toString(); + return new LangTagJDK(string, + locale.toLanguageTag(), + locale.getLanguage(), + locale.getScript(), + locale.getCountry(), + locale.getVariant(), + extension, + privateUse); + } +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java new file mode 100644 index 0000000000..760fd75c97 --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java @@ -0,0 +1,393 @@ +/* + * Licensed to 
the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.langtag; + +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +/** {@link LangTag} implemented by matching the RFC 5646 grammar with regular expressions (see {@link LangTagByRE}). */ +public final class LangTagRE implements LangTag { + /** Parse a language tag; returns null when the string does not parse as a language tag (the same convention as {@link LangTagJDK#create}). */ + public static LangTag create(String string) { + String[] parts = LangTagByRE.parse(string); + return (parts == null) ? null : new LangTagRE(string, parts); + } + + private final String string; + private final String[] parts; + + private LangTagRE(String string, String[] parts) { + this.string = string; + this.parts = parts; + } + /** The tag rebuilt from its parsed, case-normalised parts. */ + @Override + public String str() { + return LangTagByRE.canonical(parts); + } + + @Override + public String getLanguage() { + return parts[idxLanguage]; + } + + @Override + public String getScript() { + return parts[idxScript]; + } + + @Override + public String getRegion() { + return parts[idxRegion]; + } + + @Override + public String getVariant() { + return parts[idxVariant]; + } + + @Override + public String getExtension() { + return parts[idxExtension]; + } + + @Override + public String getPrivateUse() { + return parts[idxPrivateUse]; + } + + /*package*/ static final int idxLanguage = 0; + /*package*/ static final int idxScript = 1; + /*package*/ static final int idxRegion = 2; + /*package*/ static final int idxVariant = 3; + 
/*package*/ static final int idxExtension = 4; + /*package*/ static final int idxPrivateUse = 5; + + /** Language tag handled with regular expressions. */ + static class LangTagByRE { + /** + * Language tags: support for parsing and canonicalization of case. + * Grandfathered forms ("i-") are left untouched. Unsupported or syntactically + * illegal forms are handled in canonicalization by doing nothing. + * <ul> + * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC 4646</a></li> + * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC 4647</a></li> + * <li>Language tags syntax (BCP 47): <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a></li> + * </ul> + */ + + // Valid language tag, not irregular, not grand-fathered. + + private static final int partsLength = 6; + + private LangTagByRE() {} + + // Defined by BCP 47 which is currently RFC 5646 and which obsoletes RFC 4646. + + // Canonical forms: + /* + * RFC 4646 In this format, all non-initial two-letter subtags are + * uppercase, all non-initial four-letter subtags are titlecase, and all + * other subtags are lowercase. + */ + /* + * RFC 5646 An implementation can reproduce this format without accessing + * the registry as follows. All subtags, including extension and private use + * subtags, use lowercase letters with two exceptions: two-letter and + * four-letter subtags that neither appear at the start of the tag nor occur + * after singletons. Such two-letter subtags are all uppercase (as in the + * tags "en-CA-x-ca" or "sgn-BE-FR") and four- letter subtags are titlecase + * (as in the tag "az-Latn-x-latn"). 
+ */ + + /* + * ABNF definition: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a> + * +Language-Tag = langtag ; normal language tags + / privateuse ; private use tag + / grandfathered ; grandfathered tags + + langtag = language + ["-" script] + ["-" region] + *("-" variant) + *("-" extension) + ["-" privateuse] + + language = 2*3ALPHA ; shortest ISO 639 code + ["-" extlang] ; sometimes followed by + ; extended language subtags + / 4ALPHA ; or reserved for future use + / 5*8ALPHA ; or registered language subtag + + extlang = 3ALPHA ; selected ISO 639 codes + *2("-" 3ALPHA) ; permanently reserved + + script = 4ALPHA ; ISO 15924 code + + region = 2ALPHA ; ISO 3166-1 code + / 3DIGIT ; UN M.49 code + + variant = 5*8alphanum ; registered variants + / (DIGIT 3alphanum) + + extension = singleton 1*("-" (2*8alphanum)) + + ; Single alphanumerics + ; "x" reserved for private use + singleton = DIGIT ; 0 - 9 + / %x41-57 ; A - W + / %x59-5A ; Y - Z + / %x61-77 ; a - w + / %x79-7A ; y - z + + privateuse = "x" 1*("-" (1*8alphanum)) + + grandfathered = irregular ; non-redundant tags registered + / regular ; during the RFC 3066 era + + irregular = "en-GB-oed" ; irregular tags do not match + / "i-ami" ; the 'langtag' production and + / "i-bnn" ; would not otherwise be + / "i-default" ; considered 'well-formed' + / "i-enochian" ; These tags are all valid, + / "i-hak" ; but most are deprecated + / "i-klingon" ; in favor of more modern + / "i-lux" ; subtags or subtag + / "i-mingo" ; combination + / "i-navajo" + / "i-pwn" + / "i-tao" + / "i-tay" + / "i-tsu" + / "sgn-BE-FR" + / "sgn-BE-NL" + / "sgn-CH-DE" + + regular = "art-lojban" ; these tags match the 'langtag' + / "cel-gaulish" ; production, but their subtags + / "no-bok" ; are not extended language + / "no-nyn" ; or variant subtags: their meaning + / "zh-guoyu" ; is defined by their registration + / "zh-hakka" ; and all of these are deprecated + / "zh-min" ; in favor of a more modern + / "zh-min-nan" ; subtag or sequence of 
subtags + / "zh-xiang" + + alphanum = (ALPHA / DIGIT) ; letters and numbers + */ + + private static final String languageRE_1 = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})"; + private static final String languageRE_2 = "[a-zA-Z]{4}"; + private static final String languageRE_3 = "[a-zA-Z]{5,8}"; + private static final String language = languageRE_1 + "|" + languageRE_2 + "|" + languageRE_3; + + private static final String script = "[a-zA-Z]{4}"; + private static final String region = "[a-zA-Z]{2}|[0-9]{3}"; + + private static final String variant1 = "(?:[a-zA-Z0-9]{5,8}|[0-9][a-zA-Z0-9]{3})"; + private static final String variant = variant1 + "(?:-" + variant1 + ")*"; + + private static final String extension1 = "(?:[a-wyzA-WYZ0-9](?:-[a-zA-Z0-9]{2,8})+)"; // Not 'x' + private static final String extension = extension1 + "(?:-" + extension1 + ")*"; + + private static final String privateuse = "[xX](?:-[a-zA-Z0-9]{1,8})+"; + + private static final String langtag = String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))*(?:-(%s))?(?:-(%s))?$", + language, script, region, variant, extension, privateuse); + + // This is for the "i-" forms only. + private static final String grandfatheredRE = "^i(?:-[a-zA-Z0-9]{2,8}){1,2}$"; + private static final String privateUseLangRE = "^"+privateuse+"$"; + + private static final Pattern pattern = Pattern.compile(langtag); + private static final Pattern patternGrandfathered = Pattern.compile(grandfatheredRE); + private static final Pattern privateUseLang = Pattern.compile(privateUseLangRE); + private static final Pattern enOED = Pattern.compile("en-GB-oed", Pattern.CASE_INSENSITIVE); + + /** + * Validate - basic syntax check for a language tag: [a-zA-Z]+ ('-'[a-zA-Z0-9]+)* + */ + /*package*/ static boolean check(String languageTag) { + int len = languageTag.length(); + int idx = 0; + boolean first = true; + while (idx < languageTag.length()) { + int idx2 = checkPart(languageTag, idx, first); + first = false; + if ( idx2 == idx ) + // zero length part. 
+ return false; + idx = idx2; + if ( idx == len ) + return true; + if ( languageTag.charAt(idx) != '-' ) + return false; + idx++; + if ( idx == len ) + // trailing DASH + return false; + } + return true; + } + + private static int checkPart(String languageTag, int idx, boolean leader) { + for (; idx < languageTag.length(); idx++) { + int ch = languageTag.charAt(idx); + if ( leader ) { + if ( InternalLangTag.isA2Z(ch) ) + continue; + } else { + if ( InternalLangTag.isA2ZN(ch) ) + continue; + } + // Not acceptable. + return idx; + } + // Off end. + return idx; + } + + /** + * Parse a langtag string and return its parts in canonical case. See + * constants for the array contents. Parts not present cause a null in + * the return array. + * + * @return Langtag parts, or null if the input string does not parse as a lang tag. + */ + /*package*/ static String[] parse(String languageTag) { + String[] parts = new String[partsLength]; + // matches(), not find(): the whole input must be the tag. With find(), '$' also accepts a trailing line terminator and the unanchored en-GB-oed pattern matches anywhere in the input. + Matcher m = pattern.matcher(languageTag); + if ( !m.matches() ) { + m = patternGrandfathered.matcher(languageTag); + if ( m.matches() ) { + parts[idxLanguage] = m.group(0); + return parts; + } + // Private use language, not extension. + m = privateUseLang.matcher(languageTag); + if ( m.matches() ) { + parts[idxPrivateUse] = m.group(0); + return parts; + } + + // Irregular + m = enOED.matcher(languageTag); + if ( m.matches() ) { + parts[idxLanguage] = "en"; + parts[idxRegion] = "GB"; + parts[idxVariant] = "oed"; + return parts; + } + + // Give up. 
+ return null; + } + + int gc = m.groupCount(); + for (int i = 0; i < gc; i++) + parts[i] = m.group(i + 1); + + parts[idxLanguage] = lowercase(parts[idxLanguage]); + parts[idxScript] = titlecase(parts[idxScript]); + parts[idxRegion] = uppercase(parts[idxRegion]); + parts[idxVariant] = lowercase(parts[idxVariant]); + parts[idxExtension] = lowercase(parts[idxExtension]); + parts[idxPrivateUse] = lowercase(parts[idxPrivateUse]); + return parts; + } + + /** Canonicalize with the rules of RFC 4646, or RFC 5646 without replacement of preferred form. */ + /*package*/ static String canonical(String str) { + if ( str == null ) + return null; + String[] parts = parse(str); + String x = canonical(parts); + if ( x == null ) { + // Could try to apply the case-setting rules + // even though it's not a conforming langtag. + return str; + } + return x; + } + + /** + * Canonicalize with the rules of RFC 4646 "In this format, all non-initial + * two-letter subtags are uppercase, all non-initial four-letter subtags are + * titlecase, and all other subtags are lowercase." In addition, leave + * extensions unchanged. + * <p> + * This is the same as RFC5646 without replacement of preferred form + * or consulting the registry. + */ + /*package*/ static String canonical(String[] parts) { + // We canonicalised parts on parsing. + if ( parts == null ) + return null; + + if ( parts[0] == null ) { + // Private use only (e.g. "x-whatever"): parse() sets just idxPrivateUse. + return parts[idxPrivateUse]; + } + + StringBuilder sb = new StringBuilder(); + sb.append(parts[0]); + for (int i = 1; i < parts.length; i++) { + if ( parts[i] != null ) { + sb.append("-"); + sb.append(parts[i]); + } + } + return sb.toString(); + } + + // The basic formatting rule. 
+ private static String strcase_unused(String string) { + if ( string == null ) + return null; + if ( string.length() == 2 ) + return uppercase(string); + if ( string.length() == 4 ) + return titlecase(string); + return lowercase(string); + } + + private static String lowercase(String string) { + if ( string == null ) + return null; + return string.toLowerCase(Locale.ROOT); + } + + private static String uppercase(String string) { + if ( string == null ) + return null; + return string.toUpperCase(Locale.ROOT); + } + + private static String titlecase(String string) { + if ( string == null ) + return null; + char ch1 = string.charAt(0); + ch1 = Character.toUpperCase(ch1); + string = lowercase(string.substring(1)); + return ch1 + string; + } + } +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java new file mode 100644 index 0000000000..f75659831c --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java @@ -0,0 +1,781 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.langtag; + +import java.util.HashSet; +import java.util.Locale; +import java.util.Objects; +import java.util.Set; + +/** + * An implementation of parsing and formatting. + * <a href="https://datatracker.ietf.org/doc/html/rfc5646">RFC 5646</a> + * <p> + * This implementation does not replace languages by their preferred form (e.g. + * "i-klingon" has preferred form of "tlh", "zh-xiang" has a preferred form of "hsn"). + * </p> + * <p> + * <a href="https://www.rfc-editor.org/info/rfc5646">RFC 5646: Tags for Identifying Languages</a> + * </p> + */ +public final class LangTagRFC5646 implements LangTag{ + // The language tag as given. + private final String langTagString; + + // Grandfathered + private boolean isGrandfathered = false; + // Private use of the whole Language-Tag + private boolean isPrivateUseLanguage = false; + + /* Formatting: https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1 + * + * All subtags, including extension and private use subtags, + * use lowercase letters with two exceptions: two-letter + * and four-letter subtags that neither appear at the start of the tag + * nor occur after singletons. Such two-letter subtags are all + * uppercase (as in the tags "en-CA-x-ca" or "sgn-BE-FR") and four- + * letter subtags are titlecase (as in the tag "az-Latn-x-latn"). + * + * See str() + */ + + // Helpers + private enum CaseRule { TITLE, LOWER, UPPER } + private enum CharSet { ALPHA, ALPHANUM } + + public static LangTag create(String string) { + LangTagRFC5646 langtag = parser(string); + return langtag; + } + + // Start/Finish indexes, excluding the initial '-' + private int language0 = -1 ; + private int language1 = -1 ; + + private int script0 = -1 ; + private int script1 = -1 ; + + private int region0 = -1 ; + private int region1 = -1 ; + + private int variant0 = -1 ; + private int variant1 = -1 ; + + // All extensions. 
+ private int extension0 = -1 ; + private int extension1 = -1 ; + + // Private use sub tag (not private use of the whole language tag, which starts "x-"). + private int privateuse0 = -1 ; + private int privateuse1 = -1 ; + + @Override + public String getLanguage() { + return getSubTag("Language", langTagString, language0, language1, CaseRule.LOWER); + } + + @Override + public String getScript() { + return getSubTag("Script", langTagString, script0, script1, CaseRule.TITLE); + } + + @Override + public String getRegion() { + return getSubTag("Region", langTagString, region0, region1, CaseRule.UPPER); + } + + @Override + public String getVariant() { + return getSubTag("Variant", langTagString, variant0, variant1, CaseRule.LOWER); + } + + @Override + public String getExtension() { + return getSubTag("Extension", langTagString, extension0, extension1, CaseRule.LOWER); + } + + @Override + public String getPrivateUse() { + return getSubTag("Private", langTagString, privateuse0, privateuse1, CaseRule.LOWER); + } + + @Override + public int hashCode() { + return Objects.hash(langTagString, + language0, language1, script0, script1, variant0, variant1, + extension0, extension1, privateuse0, privateuse1, isGrandfathered, isPrivateUseLanguage); + } + + /** + * {@code .equals} and {@code .hashCode} + * provide "same immutable object" semantics. + * The language tags are treated case-sensitively. + * + * @See LangTagOps.sameLangTagAs for equivalent language tags. + */ + @Override + public boolean equals(Object obj) { + if ( this == obj ) + return true; + if ( !(obj instanceof LangTagRFC5646 other) ) + return false; + // All but the string. 
+ boolean sameParsePoints = + extension0 == other.extension0 && extension1 == other.extension1 + && isGrandfathered == other.isGrandfathered + && isPrivateUseLanguage == other.isPrivateUseLanguage + && language0 == other.language0 && language1 == other.language1 + && privateuse0 == other.privateuse0 && privateuse1 == other.privateuse1 + && region0 == other.region0 && region1 == other.region1 + && script0 == other.script0 && script1 == other.script1 + && variant0 == other.variant0 && variant1 == other.variant1; + if ( ! sameParsePoints ) + return false; + return Objects.equals(langTagString, other.langTagString); + } + + /** + * Return the lang tag exactly as given. + * Use {@link #str()} for the language tag formatted by the rules of RFC 5646. + */ + @Override + public String toString() { + return langTagString; + } + + @Override + public String str() { + if ( isPrivateUseLanguage ) + return InternalLangTag.lowercase(langTagString); + + // Some irregular special cases. + if ( InternalLangTag.caseInsensitivePrefix(langTagString, "sgn-") ) { + // "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE" + if ( langTagString.equalsIgnoreCase("sgn-BE-FR") ) + return "sgn-BE-FR"; + if ( langTagString.equalsIgnoreCase("sgn-BE-NL") ) + return "sgn-BE-NL"; + if ( langTagString.equalsIgnoreCase("sgn-CH-DE") ) + return "sgn-CH-DE"; + } + + if ( langTagString.startsWith("i-") || langTagString.startsWith("I-") ) { + String lcLangTagStr = InternalLangTag.lowercase(langTagString); + if ( irregular_i.contains(lcLangTagStr) ) + return lcLangTagStr; + } + + StringBuffer sb = new StringBuffer(); + add(sb, getLanguage()); + add(sb, getScript()); + add(sb, getRegion()); + add(sb, getVariant()); + add(sb, getExtension()); + add(sb, getPrivateUse()); + return sb.toString(); + } + + private void add(StringBuffer sb, String subtag) { + if ( subtag == null ) + return; + if ( ! 
sb.isEmpty() ) + sb.append('-'); + sb.append(subtag); + } + + private static String getSubTag(String label, String string, int start, int finish, CaseRule format) { + if ( start == -1 ) + return null; + if ( finish == -1 ) + throw new InternalError(InternalLangTag.titlecase(label)+" start is set but not subtag end: "+string); + if ( start >= finish ) + throw new InternalError(InternalLangTag.titlecase(label)+" start index is after "+InternalLangTag.lowercase(label)+" end index: "+string); + String x = string.substring(start, finish); + return switch(format) { + case TITLE -> InternalLangTag.titlecase(x); + case LOWER -> InternalLangTag.lowercase(x); + case UPPER -> InternalLangTag.uppercase(x); + }; + } + + private static LangTagRFC5646 parser(String string) { + + // A segment is a sequence of A2ZN characters separated by '-'. + + LangTagRFC5646 langtag = new LangTagRFC5646(string); + final int N = string.length(); + // Language-Tag = langtag ; normal language tags + // / privateuse ; private use tag + // / grandfathered ; grandfathered tags + + // langtag = language + // ["-" script] + // ["-" region] + // *("-" variant) + // *("-" extension) + // ["-" privateuse] + + // script = 4ALPHA ; ISO 15924 code + // region = 2ALPHA ; ISO 3166-1 code + // / 3DIGIT ; UN M.49 code + // variant = 5*8alphanum ; registered variants + // / (DIGIT 3alphanum) + // extension = singleton 1*("-" (2*8alphanum)) + + if ( N == 0 ) + InternalLangTag.error("Empty string"); + + // ------------------- + // language = (2*3ALPHA [ extlang ]); shortest ISO 639 code + // / 4ALPHA ; reserved for future use + // / 5*8ALPHA ; registered language subtag + // extlang = 3ALPHA ; selected ISO 639 codes + // *2("-" 3ALPHA) ; permanently reserved + + // Grandfathered + // Must check first because the whole string (except "en-GB-oed") is the "language" + + if ( grandfathered(string) ) { + // Regular: + // "each tag, in its entirety, represents a language or collection of languages." 
+ // + // Irregular: + // With the exception of "en-GB-oed", which is a + // variant of "en-GB", each of them, in its entirety, + // represents a language. + // + langtag.language0 = 0; + langtag.language1 = N; + langtag.isGrandfathered = true; + // Exception. + if ( string.equalsIgnoreCase("en-GB-oed") ) { + // "oed" is "Oxford English Dictionary spelling" + // Better is the replacement "en-GB-oxendict" + langtag.language0 = 0; + langtag.language1 = 2; + langtag.region0 = 3; + langtag.region1 = 5; + // Non-standard variant. + langtag.variant0 = 6; + langtag.variant1 = N; + } + return langtag; + } + + // -- language + + int idx = 0; + int idx2 = segmentNextFinish(string, N, idx); + int segLen = segmentLength(N, idx, idx2); + + // Private use in the language position. + if ( segLen == 1 ) { + if ( string.startsWith("x-") || string.startsWith("X-") ) { + /* + The primary language subtag is the first subtag in a language tag and + cannot be omitted, with two exceptions: + + o The single-character subtag 'x' as the primary subtag indicates + that the language tag consists solely of subtags whose meaning is + defined by private agreement. For example, in the tag "x-fr-CH", + the subtags 'fr' and 'CH' do not represent the French language or + the country of Switzerland (or any other value in the IANA + registry) unless there is a private agreement in place to do so. + See Section 4.6. 
+ */ + langtag.isPrivateUseLanguage = true; + int idxPrivateUseStart = 0; + int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart+segLen, 1, 8); + langtag.privateuse0 = idxPrivateUseStart; + langtag.privateuse1 = idxPrivateUseEnd; + if ( langtag.privateuse1 < N ) + InternalLangTag.error("Trailing characters in private langtag: '%s'", string.substring(langtag.privateuse1)); + return langtag; + } + InternalLangTag.error("Language part is 1 character: it must be 2-3 characters (4-8 reserved for future use), \"x-\", or a recognized grandfathered tag"); + } + + if ( idx2 < 0 ) { + // language only. + if ( segLen > 8 ) + InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)"); + langtag.language0 = 0; + langtag.language1 = N; + InternalLangTag.checkAlpha(string, N, langtag.language0, langtag.language1); + return langtag; + } + + if ( idx == idx2 ) + InternalLangTag.error("Can not find the language subtag: '%s'", string); + + if ( segLen < 2 || segLen > 4 ) + InternalLangTag.error("Language: '%s'", string); + + langtag.language0 = idx; + + if ( segLen == 2 || segLen == 3 ) { + // -- Language extension subtags/ +// language = 2*3ALPHA ; shortest ISO 639 code +// ["-" extlang] +// extlang = 3ALPHA ; selected ISO 639 codes +// *2("-" 3ALPHA) ; permanently reserved + int extStart = idx+segLen; + InternalLangTag.checkAlpha(string, N, langtag.language0, extStart); + // Extensions are 1 to 3 3ALPHA subtags + int extEnd = maybeSubtags(string, N, extStart, 3, 3); + if ( extEnd > extStart ) { + idx2 = extEnd; + InternalLangTag.checkAlphaMinus(string, N, extStart, langtag.language1); + } + } else if ( segLen > 8 ) { + InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)"); + } + // -- extlang + langtag.language1 = idx2; + // Info + noteSegment("language", string, langtag.language0, langtag.language1); + + // Move on - next subtag + idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, 
N, idx); + segLen = segmentLength(N, idx, idx2); + // -- End langtag + + // ---- script + // script = 4ALPHA ; ISO 15924 code + if ( segLen == 4 && InternalLangTag.isAlpha(string.charAt(idx)) ) { + // Script + // Not a digit - which is a variant. + // variant = ... / (DIGIT 3alphanum) + int start = idx; + int finish = idx+segLen; + + langtag.script0 = idx; + langtag.script1 = idx+segLen; + InternalLangTag.checkAlpha(string, N, langtag.script0, langtag.script1); + noteSegment("script", string, langtag.script0, langtag.script1); + + // Move on. + idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + } + // -- End script + + // ---- region + // region = 2ALPHA ; ISO 3166-1 code + // / 3DIGIT ; UN M.49 code + if ( segLen == 2 || segLen == 3 ) { + // Region + langtag.region0 = idx; + langtag.region1 = idx+segLen; + if ( segLen == 2 ) + InternalLangTag.checkAlpha(string, N, langtag.region0, langtag.region1); + else + InternalLangTag.checkDigits(string, N, langtag.region0, langtag.region1); + noteSegment("region", string, langtag.region0, langtag.region1); + + // Move on. + idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + } + // -- End region + + // ---- variant + // variant = 5*8alphanum ; registered variants + // / (DIGIT 3alphanum) + for ( ;; ) { + if ( segLen >= 5 && segLen <= 8) { + // variant 5*8alphanum + if ( langtag.variant0 == -1 ) + langtag.variant0 = idx; + langtag.variant1 = idx+segLen; + InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1); + noteSegment("variant", string, langtag.variant0, langtag.variant1); + // Move on. 
+ idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + continue; + } + + if ( segLen == 4 ) { + // variant + // DIGIT 3alphanum + char ch = string.charAt(idx); + if ( ch >= '0' || ch <= '9' ) { + if ( langtag.variant0 == -1 ) + langtag.variant0 = idx; + langtag.variant1 = idx+segLen; + InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1); + noteSegment("variant", string, langtag.variant0, langtag.variant1); + } + // Move on. + idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + continue; + } + break; + } + // -- End variant + + // ---- extension and private use + // extension = singleton 1*("-" (2*8alphanum)) + // privateuse = "x" 1*("-" (1*8alphanum)) + boolean inPrivateUseSubtag = false; + Set<Character> extSingletons = null; new HashSet<>(); + while ( segLen == 1 ) { + char singleton = string.charAt(idx); + if ( singleton == 'x' || singleton == 'X' ) { + inPrivateUseSubtag = true; + break; + } + if ( extSingletons == null ) { + extSingletons = new HashSet<>(); + extSingletons.add(singleton); + } else { + boolean newEntry = extSingletons.add(singleton); + if ( ! newEntry ) + InternalLangTag.error("Duplicate extension singleton: '"+singleton+"'"); + } + + if ( langtag.extension0 == -1 ) + langtag.extension0 = idx; + // Extension. + // 2*8 alphanum + int idxExtStart = idx+segLen; + int idxEndExtra = maybeSubtags(string, N, idxExtStart, 2, 8); + + // Expecting at least one subtag. + if ( idxExtStart == idxEndExtra ) + InternalLangTag.error("Ill-formed extension"); + + if ( idxEndExtra > idxExtStart ) + idx2 = idxEndExtra; + langtag.extension1 = idx2; + InternalLangTag.checkAlphaNumMinus(string, N, langtag.extension0, langtag.extension1); + + noteSegment("extension", string, langtag.extension0, langtag.extension1); + // Move on. 
+ idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + if ( segLen == 0 ) + InternalLangTag.error("Ill-formed extension. Trailing dash."); + } + + // ---- private use + if ( inPrivateUseSubtag ) { + langtag.privateuse0 = idx; + // privateuse = "x" 1*("-" (1*8alphanum)) + int idxPrivateUseStart = idx+segLen; + int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart, 1, 8); + + // Expecting at least one subtag. + if ( idxPrivateUseStart == idxPrivateUseEnd ) + InternalLangTag.error("Ill-formed private use component"); + + if ( idxPrivateUseEnd > idxPrivateUseStart ) + idx2 = idxPrivateUseEnd; + langtag.privateuse1 = idx2; + InternalLangTag.checkAlphaNumMinus(string, N, langtag.privateuse0, langtag.privateuse1); + + noteSegment("private use", string, langtag.privateuse0, langtag.privateuse1); + // Private use runs to end of string. But do checking. + // Move on. + idx = segmentNextStart(N, idx, idx2); + idx2 = segmentNextFinish(string, N, idx); + segLen = segmentLength(N, idx, idx2); + if ( segLen == 0 ) + InternalLangTag.error("Ill-formed private use subtag. Trailing dash."); + } + + // -- End extension and privateuse + + // Did we process everything? 
No segment: idx == -1 idx2 == -1 seglen == -1 + + if ( idx != -1 && idx < N ) + InternalLangTag.error("Trailing characters: '%s'", string.substring(idx)); + if ( idx2 >= 0 ) + InternalLangTag.error("Bad string: '%s'", string); + return langtag; + } + + private LangTagRFC5646(String string) { + this.langTagString = string; + } + + private LangTagRFC5646(String string, + int language0, int language1, + int script0, int script1, + int region0, int region1, + int variant0, int variant1, + int extension0, int extension1, + int privateuse0, int privateuse1, + boolean isGrandfathered) { + this.langTagString = string; + this.isGrandfathered = isGrandfathered; + this.language0 = language0; + this.language1 = language1; + this.script0 = script0; + this.script1 = script1; + this.variant0 = variant0; + this.variant1 = variant1; + this.extension0 = extension0; + this.extension1 = extension1; + this.privateuse0 = privateuse0; + this.privateuse1 = privateuse1; + } + + /** Zero or more subtags, each between min and max length. */ + private static int maybeSubtags(String string, int N, int idxStart, int min, int max) { + // Looking at the '-' or end of string. + int numExt = 0; + int count = 0; + int x = idxStart; + // Outer loop - each subtag segment, having read at the "-" + while ( x >= 0 && x < N ) { + char ch = string.charAt(x); + if ( ch != '-' ) + break; + int x1 = maybeSubtag1(string, N, x+1, min, max); + if ( x1 <= 0 ) + break; + if ( x1 == N ) { + x = N; + break; + } + x = x1; + } + return x; + } + + /** + * Peek for a segment between min and max in length. + * The initial "-" has been read. + */ + private static int maybeSubtag1(String string, int N, int idxStart, int min, int max) { + int idx = idxStart; + if ( idx >= N ) + return -1; + int idx2 = segmentNextFinish(string, N, idx); + int segLen = segmentLength(N, idx, idx2); + if ( segLen == 0 ) + InternalLangTag.error("Bad langtag. Found '--'"); + + if ( segLen < min || segLen > max ) + return -1; + if ( ! 
InternalLangTag.isAlpha(string, idxStart, idxStart+segLen) ) + return -1; + return idxStart+segLen; + } + + // Start/Finish indexes, excluding the initial '-' + private static String getSegment(String string, int x0, int x1) { + if ( x0 < 0 && x1 < 0 ) + return null; + if ( x0 < 0 || x1 < 0 ) { + InternalLangTag.error("Segment one undef index"); + return null; + } + return string.substring(x0, x1); + } + + /** Length of a segment, excluding any "-" */ + private static int segmentLength(int N, int idx, int idx2) { + if ( idx < 0 ) + return -1; + if ( idx2 < 0 ) + return N-idx; + return idx2-idx; + } + + /** Index of the start of the next segment. */ + private static int segmentNextStart(int N, int idx, int idx2) { + if ( idx2 == -1 ) + return -1; + idx = idx2; + // Skip '-' + idx++; + return idx; + } + + /** Note segment - development aid. */ + private static void noteSegment(String label, String string, int idx, int idx2) { +// if ( idx2 < 0 ) { +// System.out.printf("%-10s [%d,%d) '%s'\n", label, idx, idx2, string.substring(idx)); +// return; +// } +// System.out.printf("%-10s [%d,%d) '%s'\n",label, idx, idx2, string.substring(idx, idx2)); + } + + /** Return the index of the next '-' or -1 */ + private static int segmentNextFinish(String x, int N, int idx) { + if ( idx == -1 ) + return -1; + if ( idx == N ) + return -1; + for ( ; idx < N ; idx++ ) { + char ch = x.charAt(idx); + if ( ch == '-' ) { + if ( idx == N-1 ) { + // The case of "subtag-" + InternalLangTag.error("Language tag string ends in '-'"); + } + return idx; + } + } + return -1; + } + + // --- + // RFC 5646: regular tags + // Grandfathered tags that (appear to) match the 'langtag' production in + // Figure 1 are considered 'regular' grandfathered tags. These tags + // contain one or more subtags that either do not individually appear in + // the registry or appear but with a different semantic meaning: each + // tag, in its entirety, represents a language or collection of + // languages. 
+ + private static boolean grandfathered(String s) { + s = s.toLowerCase(Locale.ROOT); + return grandfathered.contains(s) || regular.contains(s) ; + } + + // These tags match the 'langtag' production, but their subtags are not extended + // language or variant subtags: their meaning is defined by their registration and + // all of these are deprecated in favor of a more modern subtag or sequence of + // subtags + + private static Set<String> regular = + Set.of("art-lojban", "cel-gaulish", "no-bok", "no-nyn", "zh-guoyu", "zh-hakka", "zh-min", "zh-min-nan", "zh-xiang"); + + // RFC 5646: irregular tags do not match the 'langtag' production and would not be 'well-formed' + // Grandfathered tags that do not match the 'langtag' production in the + // ABNF and would otherwise be invalid are considered 'irregular' + // grandfathered tags. With the exception of "en-GB-oed", which is a + // variant of "en-GB", each of them, in its entirety, represents a + // language. + + private static Set<String> irregular = + Set.of("en-GB-oed", + "i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", "i-klingon", + "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", "i-tsu", + // These are irregular in that they are "primary subtag ("sgn" - sign language) + // then two region-like subtags. + // They do obey the basic formatting rule - two letters non-primary subtag is uppercase. + "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE"); + + // The "i-" irregulars. 
+ private static Set<String> irregular_i = + Set.of("i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", "i-klingon", + "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", "i-tsu"); + + // --- + + private static Set<String> grandfathered = new HashSet<>(2*(regular.size()+irregular.size())); + static { + for ( String s : irregular ) + grandfathered.add(InternalLangTag.lowercase(s)); + for ( String s : regular ) + grandfathered.add(InternalLangTag.lowercase(s)); + } + + // @formatter:off + /* + RFC 5646 Section 2.1 + ABNF definition: https://datatracker.ietf.org/doc/html/rfc5646#section-2.1 + + Language-Tag = langtag ; normal language tags + / privateuse ; private use tag + / grandfathered ; grandfathered tags + + langtag = language + ["-" script] + ["-" region] + *("-" variant) + *("-" extension) + ["-" privateuse] + + language = 2*3ALPHA ; shortest ISO 639 code + ["-" extlang] ; sometimes followed by + ; extended language subtags + / 4ALPHA ; or reserved for future use + / 5*8ALPHA ; or registered language subtag + + extlang = 3ALPHA ; selected ISO 639 codes + *2("-" 3ALPHA) ; permanently reserved + + script = 4ALPHA ; ISO 15924 code + + region = 2ALPHA ; ISO 3166-1 code + / 3DIGIT ; UN M.49 code + + variant = 5*8alphanum ; registered variants + / (DIGIT 3alphanum) + + extension = singleton 1*("-" (2*8alphanum)) + + ; Single alphanumerics + ; "x" reserved for private use + singleton = DIGIT ; 0 - 9 + / %x41-57 ; A - W + / %x59-5A ; Y - Z + / %x61-77 ; a - w + / %x79-7A ; y - z + + privateuse = "x" 1*("-" (1*8alphanum)) + + grandfathered = irregular ; non-redundant tags registered + / regular ; during the RFC 3066 era + + irregular = "en-GB-oed" ; irregular tags do not match + / "i-ami" ; the 'langtag' production and + / "i-bnn" ; would not otherwise be + / "i-default" ; considered 'well-formed' + / "i-enochian" ; These tags are all valid, + / "i-hak" ; but most are deprecated + / "i-klingon" ; in favor of more modern + / "i-lux" ; subtags or subtag + / 
"i-mingo" ; combination + / "i-navajo" + / "i-pwn" + / "i-tao" + / "i-tay" + / "i-tsu" + / "sgn-BE-FR" + / "sgn-BE-NL" + / "sgn-CH-DE" + + regular = "art-lojban" ; these tags match the 'langtag' + / "cel-gaulish" ; production, but their subtags + / "no-bok" ; are not extended language + / "no-nyn" ; or variant subtags: their meaning + / "zh-guoyu" ; is defined by their registration + / "zh-hakka" ; and all of these are deprecated + / "zh-min" ; in favor of a more modern + / "zh-min-nan" ; subtag or sequence of subtags + / "zh-xiang" + + alphanum = (ALPHA / DIGIT) ; letters and numbers + */ + // @formatter:on +} diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java new file mode 100644 index 0000000000..2738298949 --- /dev/null +++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.jena.langtag;
+
+import static org.apache.jena.langtag.InternalLangTag.error;
+import static org.apache.jena.langtag.InternalLangTag.str;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Operations on language tags: creating {@link LangTag} objects, checking and
+ * formatting language tag strings, and splitting them into subtags.
+ */
+public class LangTags {
+
+    /** Index of the language part */
+    public static final int idxLanguage  = 0 ;
+    /** Index of the script part */
+    public static final int idxScript    = 1 ;
+    /** Index of the region part */
+    public static final int idxRegion    = 2 ;
+    /** Index of the variant part */
+    public static final int idxVariant   = 3 ;
+    /** Index of all extensions */
+    public static final int idxExtension = 4 ;
+
+    private static final int partsLength = 5 ;
+
+    /**
+     * Parse a language tag into an array of parts, indexed by the {@code idx*} constants
+     * of this class. Extensions and private use are combined into the
+     * {@link #idxExtension} slot, separated by "-".
+     *
+     * @return the array of parts, or {@code null} if the string is not a valid language tag
+     * @deprecated Compatibility operation (the behaviour of Jena 5.3.0 and earlier). To be removed.
+     */
+    @Deprecated(forRemoval = true)
+    public static String[] parse(String languageTag) {
+        try {
+            LangTag langTag = SysLangTag.create(languageTag);
+            if ( langTag == null )
+                return null;
+            String[] result = new String[partsLength];
+
+            result[idxLanguage] = langTag.getLanguage();
+            result[idxScript] = langTag.getScript();
+            result[idxRegion] = langTag.getRegion();
+            result[idxVariant] = langTag.getVariant();
+            // Legacy compatible: extensions and private use share one slot.
+            if ( langTag.getPrivateUse() == null )
+                result[idxExtension] = langTag.getExtension();
+            else if ( langTag.getExtension() == null )
+                result[idxExtension] = langTag.getPrivateUse();
+            else
+                result[idxExtension] = langTag.getExtension()+"-"+langTag.getPrivateUse();
+            return result;
+        } catch (LangTagException ex) {
+            // Legacy contract: bad syntax is reported as null, not as an exception.
+            return null;
+        }
+    }
+
+    /**
+     * Create a {@link LangTag} from a string
+     * that meets the
+     * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1">syntax of RFC 5646</a>.
+     * <p>
+     * Throws {@link LangTagException} on bad syntax.
+     */
+    public static LangTag of(String string) {
+        LangTag langTag = SysLangTag.create(string);
+        // Implementations should not return null but just in case ...
+        if ( langTag == null )
+            throw new LangTagException("Bad syntax");
+        return langTag;
+    }
+
+    /** Same as {@link #of(String)} */
+    public static LangTag create(String string) {
+        return of(string);
+    }
+
+    /**
+     * Parse the string with {@link #of(String)} and return the string form
+     * of the resulting {@link LangTag}.
+     * <p>
+     * Throws {@link LangTagException} on bad syntax.
+     */
+    public static String canonical(String string) {
+        LangTag langTag = of(string);
+        return langTag.str();
+    }
+
+    /** Check a string is valid as a language tag. */
+    public static boolean check(String languageTag) {
+        try {
+            LangTag langTag = SysLangTag.create(languageTag);
+            return (langTag != null );
+        } catch (LangTagException ex) {
+            return false;
+        }
+    }
+
+    /**
+     * Basic formatter following
+     * <a href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1">RFC 5646 section 2.1.1</a>
+     * <p>
+     * A {@code null} argument returns {@code null}; an empty string is returned unchanged.
+     */
+    public static String basicFormat(String string) {
+        // with the interpretation that "after singleton" means anywhere after the singleton.
+        if ( string == null )
+            return null;
+        if ( string.isEmpty() )
+            return string;
+        List<String> strings = InternalLangTag.splitOnDash(string);
+        if ( strings == null ) {
+            //return lowercase(string);
+            error("Bad language string: %s", string);
+            // error(...) is expected to throw; never fall through and iterate over null.
+            return null;
+        }
+        StringBuilder sb = new StringBuilder(string.length());
+        boolean singleton = false;
+        boolean first = true;
+
+        for ( String s : strings ) {
+            if ( first ) {
+                // language
+                sb.append(InternalLangTag.lowercase(s));
+                first = false;
+                continue;
+            }
+            // All subtags after language
+            sb.append('-');
+            if ( singleton )
+                // Always lowercase
+                sb.append(InternalLangTag.lowercase(s));
+            else {
+                // Case depends on length
+                sb.append(InternalLangTag.strcase(s));
+                if ( s.length() == 1 )
+                    singleton = true;
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Is {@code langTag1} the same as {@code langTag2}?
+     * <p>
+     * The language, script, region, variant, extension and private use parts
+     * are each compared with {@link Objects#equals(Object, Object)}.
+     *
+     * @throws NullPointerException if either argument is null
+     */
+    public static boolean sameLangTagAs(LangTag langTag1, LangTag langTag2) {
+        Objects.requireNonNull(langTag1);
+        Objects.requireNonNull(langTag2);
+        if ( langTag1 == langTag2 )
+            return true;
+        if ( ! Objects.equals(langTag1.getLanguage(),langTag2.getLanguage()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getScript(),langTag2.getScript()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getRegion(),langTag2.getRegion()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getVariant(), langTag2.getVariant()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getExtension(), langTag2.getExtension()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getPrivateUse(), langTag2.getPrivateUse()) )
+            return false;
+        return true;
+    }
+
+    /**
+     * Check a language tag string meets the Turtle(etc) and SPARQL grammar rule
+     * for a language tag without initial text direction.
+     * <p>
+     * Passing this test does not guarantee the string is valid language tag. Use
+     * {@link LangTags#check(String)} for validity checking.
+     *
+     * @return true or false
+     */
+    public static boolean basicCheck(String string) {
+        try {
+            return basicCheckEx(string);
+        } catch (LangTagException ex) {
+            return false;
+        }
+    }
+
+    /**
+     * Check a language tag string meets the Turtle(etc) and SPARQL grammar rule
+     * for a language tag without initial text direction.
+     * <p>
+     * Passing this test does not guarantee the string is valid language tag. Use
+     * {@link LangTags#check(String)} for validity checking.
+     *
+     * @return true if the string passes the check
+     * @throws LangTagException if the string does not meet the grammar rule
+     */
+    public static boolean basicCheckEx(String string) {
+        if ( string.isEmpty() ) {
+            // Report the real problem rather than "Ends in a '-'".
+            error("'%s': empty string", string);
+            return false;
+        }
+        int lastSegmentStart = 0;
+
+        for ( int idx = 0; idx < string.length(); idx++ ) {
+            char ch = string.charAt(idx);
+            if ( InternalLangTag.isA2ZN(ch) )
+                continue;
+            if ( ch == '-' ) {
+                if ( idx == 0 ) {
+                    error("'%s': starts with a '-' character", string);
+                    return false;
+                }
+                if ( idx == lastSegmentStart ) {
+                    // The previous character was also a '-'.
+                    error("'%s': two dashes", string);
+                    return false;
+                }
+                lastSegmentStart = idx+1;
+                continue;
+            }
+            // Not A2ZN, not '-'.
+            error("Bad character: (0x%02X) '%s' index %d", (int)ch, str(ch), idx);
+            return false;
+        }
+        // End of string.
+        if ( lastSegmentStart == string.length() ) {
+            error("'%s': Ends in a '-'", string);
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Split a language tag based on dash separators
+     * <p>
+     * The string should be a legal language tag, at least by the general SPARQL/Turtle(etc) grammar rule.
+     *
+     * @return the subtags, or null on bad input syntax
+     *
+     * @see LangTags#check
+     * @see LangTags#create
+     */
+    public static List<String> splitOnDash(String string) {
+        try {
+            return splitOnDashEx(string);
+        } catch (LangTagException ex) {
+            return null;
+        }
+    }
+
+    /**
+     * Split a language tag into subtags.
+     * <p>
+     * The string should be a legal language tag, at least by the general SPARQL/Turtle(etc) grammar rule.
+     *
+     * @return the subtags, in order of appearance
+     * @throws LangTagException on bad input syntax
+     *
+     * @see LangTags#check
+     * @see LangTags#create
+     */
+    public static List<String> splitOnDashEx(String string) {
+        if ( string.isEmpty() ) {
+            // Report the real problem rather than "Ends in a '-'".
+            error("'%s': empty string", string);
+            return null;
+        }
+        List<String> parts = new ArrayList<>();
+        // Split efficiently based on [a-z][A-Z][0-9] units separated by "-", with meaningful error messages.
+        StringBuilder sb = new StringBuilder();
+
+        for ( int idx = 0; idx < string.length(); idx++ ) {
+            char ch = string.charAt(idx);
+            if ( InternalLangTag.isA2ZN(ch) ) {
+                sb.append(ch);
+                continue;
+            }
+            if ( ch == '-' ) {
+                if ( idx == 0 ) {
+                    error("'%s': starts with a '-' character", string);
+                    return null;
+                }
+                String str = sb.toString();
+                if ( str.isEmpty() ) {
+                    // Nothing accumulated since the previous '-'.
+                    error("'%s': two dashes", string);
+                    return null;
+                }
+                parts.add(str);
+                sb.setLength(0);
+                continue;
+            }
+            error("Bad character: (0x%02X) '%s' index %d", (int)ch, str(ch), idx);
+            return null;
+        }
+        String strLast = sb.toString();
+        if ( strLast.isEmpty() ) {
+            error("'%s': Ends in a '-'", string);
+            return null;
+        }
+        parts.add(strLast);
+        return parts;
+    }
+}
+
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java
new file mode 100644
index 0000000000..8e8835ac90
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+/**
+ * See also {@link LangTags}.
+ */
+public class SysLangTag {
+
+    /**
+     * Build a {@link LangTag} with the system-wide default language tag parser.
+     * The default parser is {@link LangTagRFC5646}.
+     */
+    public static LangTag create(String languageTag) {
+        LangTag parsed = LangTagRFC5646.create(languageTag);
+        return parsed;
+    }
+
+    /**
+     * The system-wide policy for formatting language tags.
+     * <p>
+     * A {@code null} input gives the empty string and an empty input is returned as-is.
+     * Any other input is parsed with {@link #create(String)} and the string form
+     * of the resulting {@link LangTag} is returned.
+     */
+    public static String formatLangTag(String input) {
+        if ( input == null )
+            return "";
+        // Nothing to parse for the empty string: hand back the argument itself.
+        return input.isEmpty() ? input : create(input).str();
+    }
+}
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java b/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java
new file mode 100644
index 0000000000..0b1a46fdec
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag.cmd;
+
+import org.apache.jena.langtag.LangTag;
+import org.apache.jena.langtag.LangTagException;
+import org.apache.jena.langtag.SysLangTag;
+
+/**
+ * Command line tool: parse one language tag with {@link SysLangTag#create(String)}
+ * and print its formatted form and its parts.
+ * <p>
+ * Exits with status 1 on a usage error or a bad language tag.
+ */
+public class CmdLangTag {
+
+    public static void main(String[] args) {
+        if ( args.length != 1 )
+            exitWithError("Requires one argument.");
+
+        String languageTag = args[0];
+        // Cheap checks first so the user gets a specific message.
+        if ( languageTag.isEmpty() )
+            exitWithError("Empty string for language tag");
+        if ( languageTag.isBlank() )
+            exitWithError("Blank string for language tag");
+        if ( languageTag.contains(" ") || languageTag.contains("\t") || languageTag.contains("\n") || languageTag.contains("\r") )
+            exitWithError("Language tag contains white space");
+        if ( languageTag.contains("--") )
+            exitWithError("Illegal language tag. String contains '--'");
+
+        try {
+            System.out.printf("%-16s %s\n", "Input:", languageTag);
+            LangTag langTag = SysLangTag.create(languageTag);
+            System.out.printf("%-16s %s\n", "Formatted:", langTag.str());
+            // Language, script and region are always shown; the rest only when present.
+            print("Language:", langTag.getLanguage(), true);
+            print("Script:", langTag.getScript(), true);
+            print("Region:", langTag.getRegion(), true);
+            print("Variant:", langTag.getVariant(), false);
+            print("Extension:", langTag.getExtension(), false);
+            print("Private Use:", langTag.getPrivateUse(), false);
+        } catch (LangTagException ex) {
+            System.out.println("Bad language tag");
+            System.out.printf("%s\n", ex.getMessage());
+            System.exit(1);
+        }
+    }
+
+    /** Print a message on stderr and terminate the JVM with exit status 1. */
+    private static void exitWithError(String message) {
+        System.err.println(message);
+        System.exit(1);
+    }
+
+    /**
+     * Print one labelled part of the language tag.
+     *
+     * @param label  the label for the part
+     * @param value  the value of the part; may be null
+     * @param always if true, a null value is printed as "-"; if false, a null value prints nothing
+     */
+    private static void print(String label, String value, boolean always) {
+        if ( value == null ) {
+            if ( ! always )
+                return;
+            value = "-";
+        }
+        System.out.printf(" %-14s %s\n", label, value);
+    }
+}
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java
new file mode 100644
index 0000000000..d369d91ace
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import org.junit.platform.suite.api.SelectClasses;
+import org.junit.platform.suite.api.Suite;
+
+@Suite
+@SelectClasses( {
+    TestLangTag.class
+    , TestLangTagFormat.class
+    , TestLangTagsOps.class
+    , TestBasicSyntaxLangTags.class
+})
+
+public class TS_LangTag { }
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java
new file mode 100644
index 0000000000..e746aab548
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.apache.jena.langtag.LangTags.*;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the operations related to basic parsing of language tags (SPARQL and Turtle grammar rules)
+ */
+public class TestBasicSyntaxLangTags {
+    @Test
+    public void basic_01() {
+        basicSplitCheck("en", "en");
+    }
+
+    @Test
+    public void basic_02() {
+        basicSplitCheck("en-GB", "en", "GB");
+    }
+
+    @Test
+    public void basic_03() {
+        basicSplitCheck("en-gb", "en", "gb");
+    }
+
+    @Test
+    public void basic_04() {
+        basicSplitCheck("en", "en");
+    }
+
+    // Showing the split does not allocate subtags to their category. e.g. "x-private" is split.
+    @Test
+    public void basic_05() {
+        basicSplitCheck("en-Latn-GB-boont-r-extended-sequence-x-private",
+                        "en", "Latn", "GB", "boont", "r", "extended", "sequence", "x", "private");
+    }
+
+    @Test
+    public void basic_bad_01() {
+        basicSplitCheckBad("");
+    }
+
+    @Test
+    public void basic_bad_02() {
+        basicSplitCheckBad("-");
+    }
+
+    @Test
+    public void basic_bad_03() {
+        basicSplitCheckBad("--");
+    }
+
+    @Test
+    public void basic_bad_04() {
+        basicSplitCheckBad("abc-xy%20");
+    }
+
+    @Test
+    public void basic_bad_05() {
+        basicSplitCheckBad("abc def");
+    }
+
+    /** Good input: it must split into exactly {@code parts} and pass the basic check. */
+    static void basicSplitCheck(String input, String...parts) {
+        basicSplitTest(input, parts);
+        checkTest(input);
+    }
+
+    /** Bad input: the quiet operations report failure and the "Ex" operations throw. */
+    static void basicSplitCheckBad(String input) {
+        assertFalse(basicCheck(input));
+        assertNull(splitOnDash(input));
+        assertThrows(LangTagException.class, ()->splitOnDashEx(input));
+        assertThrows(LangTagException.class, ()->basicCheckEx(input));
+    }
+
+    /** Both split operations must produce the expected subtags. */
+    public static void basicSplitTest(String input, String...parts) {
+        List<String> wanted;
+        if ( parts == null )
+            wanted = null;
+        else
+            wanted = Arrays.asList(parts);
+        List<String> fromEx = splitOnDashEx(input);
+        assertEquals(wanted, fromEx, "Subject: "+input);
+        List<String> fromQuiet = splitOnDash(input);
+        assertEquals(fromEx, fromQuiet, "Subject(2): "+input);
+    }
+
+    /** Both check operations must accept the input. */
+    private static void checkTest(String input) {
+        boolean resultEx = basicCheckEx(input);
+        assertTrue(resultEx, "Subject: "+input);
+        boolean resultQuiet = basicCheck(input);
+        assertEquals(resultEx, resultQuiet, "Subject(2): "+input);
+    }
+}
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
new file mode 100644
index 0000000000..289406bcc3
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests of language tag parsing. Each good case is run against {@link LangTagRFC5646}
+ * and {@link LangTagRE}, and against {@link LangTagJDK} where the case is marked as
+ * supported by it.
+ */
+public class TestLangTag {
+
+    @Test public void test_lang_parse_00() { testRFC5646("lng-scrp-rg", "lng-Scrp-RG", "lng", "Scrp", "RG", null, null); }
+    @Test public void test_lang_parse_01() { testRFC5646("lng-scrp-rg-variant", "lng-Scrp-RG-variant", "lng", "Scrp", "RG", "variant", null); }
+    @Test public void test_lang_parse_02() { testRFC5646("lng-scrp-rg-variant-e-abc", "lng-Scrp-RG-variant-e-abc", "lng", "Scrp", "RG", "variant", "e-abc"); }
+
+    @Test public void test_lang_basic_01() { testRFC5646("en", "en", "en", null, null, null, null); }
+    @Test public void test_lang_basic_02() { testRFC5646("en-us", "en-US", "en", null, "US", null, null); }
+    @Test public void test_lang_basic_03() { testRFC5646("en-latn-us", "en-Latn-US", "en", "Latn", "US", null, null); }
+    @Test public void test_lang_basic_04() { testRFC5646("en-123", "en-123", "en", null, "123", null, null); }
+    @Test public void test_lang_basic_05() { testRFC5646("en-1234", "en-1234", "en", null, null, "1234", null); }
+    @Test public void test_lang_basic_06() { testRFC5646("en-latn", "en-Latn", "en", "Latn", null, null, null); }
+    @Test public void test_lang_basic_07() { testRFC5646("en-latn-gb", "en-Latn-GB", "en", "Latn", "GB", null, null); }
+    // Language subtags
+    @Test public void test_lang_basic_08() { testNotJDK("en-brs-xxx-latn-gb", "en-brs-xxx-Latn-GB", "en-brs-xxx", "Latn", "GB", null, null, null); }
+    @Test public void test_lang_basic_09() { testRFC5646("de-CH-w-extend", "de-CH-w-extend", "de", null, "CH", null, "w-extend"); }
+    @Test public void test_lang_basic_10() { testRFC5646("de-CH-w-extend-extend", "de-CH-w-extend-extend", "de", null, "CH", null, "w-extend-extend"); }
+
+    @Test public void test_lang_basic_20() { testPrivateUse("de-CH-x-phonebk-morech", "de-CH-x-phonebk-morech", "de", null, "CH", null, null, "x-phonebk-morech"); }
+    // Private use language tag. No language!
+    @Test public void test_lang_basic_21() { testPrivateUse("x-private", "x-private", null, null, null, null, null, "x-private"); }
+    @Test public void test_lang_basic_22() { testPrivateUse("az-Latn-x-latn", "az-Latn-x-latn", "az", "Latn", null, null, null, "x-latn"); }
+    @Test public void test_lang_basic_23() { testPrivateUse("sss-x-y", "sss-x-y", "sss", null, null, null, null, "x-y"); }
+
+
+    @Test public void test_lang_bad_01() { testBad("123"); }
+    @Test public void test_lang_bad_02() { testBad("abcdefghijklmn"); }
+    @Test public void test_lang_bad_03() { testBad("abcdefghijklmn-123"); }
+    @Test public void test_lang_bad_04() { testBad("abcdefghijklmn-latn"); }
+
+    @Test public void test_lang_bad_05() { testBad("a?"); }
+    @Test public void test_lang_bad_06() { testBad("a b"); }
+    @Test public void test_lang_bad_07() { testBad("en--us"); }
+    @Test public void test_lang_bad_08() { testBad("-us"); }
+    @Test public void test_lang_bad_09() { testBad("en-"); }
+    @Test public void test_lang_bad_10() { testBad("en-gb-"); }
+    @Test public void test_lang_bad_11() { testBad("i18n"); }
+
+    // Wrong lengths
+    @Test public void test_lang_bad_20() { testBad("s"); }
+    @Test public void test_lang_bad_21() { testBad("abcdefghi"); }
+    @Test public void test_lang_bad_22() { testBad("en-abcdefghi"); }
+    @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghi"); }
+
+    // Bad extension
+    @Test public void test_lang_bad_31() { testBad("sss-d"); }
+    @Test public void test_lang_bad_32() { testBad("sss-d-"); }
+    @Test public void test_lang_bad_33() { testBad("sss-d-e"); }
+    @Test public void test_lang_bad_34() { testBad("sss-d-ext-"); }
+
+    // Bad private use
+    @Test public void test_lang_bad_45() { testBad("sss-x"); }
+    @Test public void test_lang_bad_46() { testBad("sss-x-"); }
+    @Test public void test_lang_bad_47() { testBad("sss-x-part-"); }
+
+    @Test public void test_lang_bad_repeated_extension() {
+        // "en-a-bbb-a-ccc" is invalid because the subtag 'a' appears twice.
+        testBad("en-a-bbb-a-ccc");
+    }
+
+    // Wikipedia-like -- their private use subtags can be too long
+    @Test public void test_lang_bad_50() { testBad("en-x-Q123456789"); }
+
+    // Special cases. "en-GB-oed" -- "oed" is variant even though it does not match the syntax rule.
+    @Test public void test_langtag_special_01() { testFormatting("en-GB-oed", "en-GB-oed"); }
+    @Test public void test_langtag_special_02() { testNotJDK("en-GB-oed", "en-GB-oed", "en", null, "GB", "oed", null, null); }
+    @Test public void test_langtag_special_03() { testFormatting("EN-gb-OED", "en-GB-oed"); }
+    @Test public void test_langtag_special_04() { testNotJDK("EN-gb-OED", "en-GB-oed", "en", null, "GB", "oed", null, null); }
+
+    // The examples from RFC 5646
+    @Test public void test_lang_10() { testRFC5646("de", "de", "de", null, null, null, null); }
+    @Test public void test_lang_11() { testRFC5646("fr", "fr", "fr", null, null, null, null); }
+    @Test public void test_lang_12() { testRFC5646("ja", "ja", "ja", null, null, null, null); }
+    @Test public void test_lang_13() { testNotJDK("i-enochian", "i-enochian", "i-enochian", null, null, null, null, null); }
+    @Test public void test_lang_14() { testRFC5646("zh-Hant", "zh-Hant", "zh", "Hant", null, null, null); }
+    @Test public void test_lang_15() { testRFC5646("zh-Hans", "zh-Hans", "zh", "Hans", null, null, null); }
+    @Test public void test_lang_16() { testRFC5646("sr-Cyrl", "sr-Cyrl", "sr", "Cyrl", null, null, null); }
+    @Test public void test_lang_17() { testRFC5646("sr-Latn", "sr-Latn", "sr", "Latn", null, null, null); }
+
+    // Extended language subtag (3 letter)
+    @Test public void test_lang_18() { testNotJDK("zh-cmn-Hans-CN", "zh-cmn-Hans-CN", "zh-cmn", "Hans", "CN", null, null, null); }
+    @Test public void test_lang_19() { testRFC5646("cmn-Hans-CN", "cmn-Hans-CN", "cmn", "Hans", "CN", null, null); }
+    @Test public void test_lang_20() { testNotJDK("zh-yue-HK", "zh-yue-HK", "zh-yue", null, "HK", null, null, null); }
+    @Test public void test_lang_21() { testRFC5646("yue-HK", "yue-HK", "yue", null, "HK", null, null); }
+    @Test public void test_lang_22() { testRFC5646("zh-Hans-CN", "zh-Hans-CN", "zh", "Hans", "CN", null, null); }
+
+    @Test public void test_lang_23() { testRFC5646("sr-Latn-RS", "sr-Latn-RS", "sr", "Latn", "RS", null, null); }
+    @Test public void test_lang_24() { testRFC5646("sl-rozaj", "sl-rozaj", "sl", null, null, "rozaj", null); }
+    @Test public void test_lang_25() { testNotJDK("sl-rozaj-biske", "sl-rozaj-biske", "sl", null, null, "rozaj-biske", null, null); }
+    @Test public void test_lang_26() { testRFC5646("sl-nedis", "sl-nedis", "sl", null, null, "nedis", null); }
+    @Test public void test_lang_27() { testRFC5646("de-CH-1901", "de-CH-1901", "de", null, "CH", "1901", null); }
+    @Test public void test_lang_28() { testRFC5646("sl-IT-nedis", "sl-IT-nedis", "sl", null, "IT", "nedis", null); }
+    @Test public void test_lang_29() { testRFC5646("hy-Latn-IT-arevela", "hy-Latn-IT-arevela", "hy", "Latn", "IT", "arevela", null); }
+    @Test public void test_lang_30() { testRFC5646("de-DE", "de-DE", "de", null, "DE", null, null); }
+    @Test public void test_lang_31() { testRFC5646("en-US", "en-US", "en", null, "US", null, null); }
+    @Test public void test_lang_32() { testRFC5646("es-419", "es-419", "es", null, "419", null, null); }
+
+    @Test public void test_lang_33() { testPrivateUse("de-CH-x-phonebk", "de-CH-x-phonebk", "de", null, "CH", null, null, "x-phonebk"); }
+    @Test public void test_lang_34() { testPrivateUse("az-Arab-x-AZE-derbend", "az-Arab-x-aze-derbend", "az", "Arab", null, null, null, "x-aze-derbend"); }
+    @Test public void test_lang_35() { testPrivateUse("x-whatever-a-abc-x-xyz", "x-whatever-a-abc-x-xyz", null, null, null, null, null, "x-whatever-a-abc-x-xyz"); }
+    @Test public void test_lang_36() { testPrivateUse("qaa-Qaaa-QM-x-southern", "qaa-Qaaa-QM-x-southern", "qaa", "Qaaa", "QM", null, null, "x-southern"); }
+
+    @Test public void test_lang_37() { testRFC5646("de-Qaaa", "de-Qaaa", "de", "Qaaa", null, null, null); }
+    @Test public void test_lang_38() { testRFC5646("sr-Latn-QM", "sr-Latn-QM", "sr", "Latn", "QM", null, null); }
+    @Test public void test_lang_39() { testRFC5646("sr-Qaaa-RS", "sr-Qaaa-RS", "sr", "Qaaa", "RS", null, null); }
+    @Test public void test_lang_40() { testRFC5646("en-US-u-islamcal", "en-US-u-islamcal", "en", null, "US", null, "u-islamcal"); }
+    @Test public void test_lang_41() { testPrivateUse("zh-CN-a-myext-x-private", "zh-CN-a-myext-x-private", "zh", null, "CN", null, "a-myext", "x-private"); }
+    @Test public void test_lang_42() { testRFC5646("en-a-myext-b-another", "en-a-myext-b-another", "en", null, null, null, "a-myext-b-another"); }
+
+    @Test public void test_lang_50() { testPrivateUse("en-x-private", "en-x-private", "en", null, null, null, null, "x-private"); }
+
+    @Test public void test_lang_51() { testPrivateUse("en-x-US", "en-x-us", "en", null, null, null, null, "x-us"); }
+    // "Note that the tag "en-a-bbb-x-a-ccc" is valid because the second appearance of
+    // the singleton 'a' is in a private use sequence."
+    @Test public void test_lang_52() { testPrivateUse("en-a-bbb-x-a-ccc", "en-a-bbb-x-a-ccc", "en", null, null, null, "a-bbb", "x-a-ccc"); }
+
+    // Mentioned in RFC 5646
+    @Test public void test_lang_60() { testPrivateUse("en-Latn-GB-boont-r-extended-sequence-x-private", "en-Latn-GB-boont-r-extended-sequence-x-private",
+                                                      "en", "Latn", "GB", "boont", "r-extended-sequence", "x-private"); }
+
+    @Test public void test_lang_61() { testPrivateUse("en-Latn-GB-boont-r-extended-sequence-s-another-x-private", "en-Latn-GB-boont-r-extended-sequence-s-another-x-private",
+                                                      "en", "Latn", "GB", "boont", "r-extended-sequence-s-another", "x-private"); }
+
+
+    /** General test, no private use part. The JDK-based implementation is included. */
+    private static void testRFC5646(String langString, String formatted, String lang, String script, String region, String variant, String extension) {
+        runTest(langString, formatted, lang, script, region, variant, extension, null, true);
+    }
+
+    /** Test of a language tag that has a private use part. The JDK-based implementation is included. */
+    private static void testPrivateUse(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateUse) {
+        // Private use is supported by LangTagJDK by extracting the "x" extension
+        runTest(langString, formatted, lang, script, region, variant, extension, privateUse, true);
+    }
+
+
+    /** Run a test which is not properly supported by the JDK-Locale based implementation. */
+    private static void testNotJDK(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateUse) {
+        runTest(langString, formatted, lang, script, region, variant, extension, privateUse, false);
+    }
+
+    /** Run a test on a string which is illegal by RFC 5646. */
+    private void testBad(String string) {
+        try {
+            LangTag parsed = LangTagRFC5646.create(string);
+            // Parser throws an exception. In case that changes ...
+            assertNull(parsed);
+            fail("Expected a LangTagException");
+        } catch (LangTagException expected) {
+            // This is the outcome the test wants.
+        }
+    }
+
+    /**
+     * Run one test case: the RFC 5646 parser on the input as given and in all lowercase
+     * and all uppercase, then the two formatters, then the JDK-based parser
+     * (when {@code jdkSupported}) and the regex-based parser on the input as given.
+     */
+    private static void runTest(String langString, String formatted,
+                                String lang, String script, String region, String variant, String extension, String privateuse,
+                                boolean jdkSupported) {
+        // Run the test with varied case of the input string.
+        String[] inputs = { langString, langString.toLowerCase(), langString.toUpperCase() };
+        for ( String input : inputs )
+            test1(input, formatted, lang, script, region, variant, extension, privateuse);
+
+        // Formatting.
+        testFormatting(langString, formatted);
+
+        // JDK
+        if ( jdkSupported )
+            assertParts(LangTagJDK.create(langString), lang, script, region, variant, extension, privateuse);
+
+        // Regular expression based parser: always run.
+        assertParts(LangTagRE.create(langString), lang, script, region, variant, extension, privateuse);
+    }
+
+    /** Check each part of a parsed language tag against its expected value. */
+    private static void assertParts(LangTag actual, String lang, String script, String region, String variant, String extension, String privateuse) {
+        assertEquals(lang, actual.getLanguage());
+        assertEquals(script, actual.getScript());
+        assertEquals(region, actual.getRegion());
+        assertEquals(variant, actual.getVariant());
+        assertEquals(extension, actual.getExtension());
+        assertEquals(privateuse, actual.getPrivateUse());
+    }
+
+    // Test execution for LangTagRFC5646 on one exact input string.
+    private static void test1(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateuse) {
+        LangTag parsed = LangTagRFC5646.create(langString);
+        assertNotNull(parsed);
+        assertEquals(lang, parsed.getLanguage(), "Lang");
+        assertEquals(script, parsed.getScript(), "Script");
+        assertEquals(region, parsed.getRegion(), "Region");
+        assertEquals(variant, parsed.getVariant(), "Variant");
+        assertEquals(extension, parsed.getExtension(), "Extension");
+        assertEquals(privateuse, parsed.getPrivateUse(), "Private use");
+        assertEquals(formatted, parsed.str(), "String formatted");
+    }
+
+    /** Both formatters, the parser-based one and the basic algorithm, must give {@code expected}. */
+    private static void testFormatting(String langString, String expected) {
+        // Already in test1 but redoing it allows a check between the two formatters.
+        // Formatted language tag built by the parser.
+        String byParser = LangTagRFC5646.create(langString).str();
+        assertEquals(expected, byParser, "RFC5646 parser format");
+        // Formatting using the general algorithm of RFC5646.
+        String byAlgorithm = LangTags.basicFormat(langString);
+        assertEquals(expected, byAlgorithm, "RFC5646 basic algorithm");
+    }
+}
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java
new file mode 100644
index 0000000000..db7c4b0377
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.jena.langtag; + + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Function; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +// JUnit4 +// Junit5 is missing @ParameterizedClass which may arrive eventually +@RunWith(Parameterized.class) +public class TestLangTagFormat { + + private static Function<String, String> formatter1 = (s)-> LangTagRFC5646.create(s).str(); + private static Function<String, String> formatter2 = (s)-> LangTags.basicFormat(s); + + @Parameters(name = "{index}: {0}") + public static Iterable<Object[]> data() { + List<Object[]> x = new ArrayList<>() ; + + x.add(new Object[] {"LangTagRFC5646", formatter1}); + x.add(new Object[] {"LangTagOps", formatter2}); + return x ; + } + + private final String formatterName; + private final Function<String, String> formatter; + + public TestLangTagFormat(String name, Function<String, String> formatter) { + this.formatterName = name; + this.formatter = formatter; + } + + @Test public void testBasicFormat01() { test("de", "de"); } + @Test public void testBasicFormat02() { test("FR", "fr"); } + @Test public void testBasicFormat03() { test("jA", "ja"); } + @Test public void testBasicFormat04() { test("de-DE", "de-DE"); } + @Test public void testBasicFormat05() { test("en-US", "en-US"); } + @Test public void testBasicFormat06() { test("en-US-variant", "en-US-variant"); } + + // 419 is a region. 
+ @Test public void testBasicFormat10() { test("es-419", "es-419"); } + @Test public void testBasicFormat11() { test("es-latn-419", "es-Latn-419"); } + + @Test public void testBasicFormat90() { test("en-GB-oed", "en-GB-oed"); } + @Test public void testBasicFormat91() { test("EN-gb-OED", "en-GB-oed"); } + + // Taken from the examples in RFC 5646 + @Test public void testBasicFormat20() { test("zh-hant", "zh-Hant"); } + @Test public void testBasicFormat21() { test("sr-cyrl", "sr-Cyrl"); } + @Test public void testBasicFormat22() { test("sr-latn", "sr-Latn"); } + @Test public void testBasicFormat23() { test("zh-cmn-hans-cn", "zh-cmn-Hans-CN"); } + @Test public void testBasicFormat24() { test("cmn-hans-cn", "cmn-Hans-CN"); } + @Test public void testBasicFormat25() { test("zh-yue-hk", "zh-yue-HK"); } + @Test public void testBasicFormat26() { test("yue-hk", "yue-HK"); } + @Test public void testBasicFormat27() { test("zh-hans-cn", "zh-Hans-CN"); } + @Test public void testBasicFormat28() { test("sr-latn-rs", "sr-Latn-RS"); } + @Test public void testBasicFormat29() { test("sl-rozaj", "sl-rozaj"); } + @Test public void testBasicFormat30() { test("sl-rozaj-biske", "sl-rozaj-biske"); } + @Test public void testBasicFormat31() { test("de-ch-1901", "de-CH-1901"); } + @Test public void testBasicFormat32() { test("sl-it-nedis", "sl-IT-nedis"); } + @Test public void testBasicFormat33() { test("hy-latn-it-arevela", "hy-Latn-IT-arevela"); } + @Test public void testBasicFormat34() { test("de-ch-x-phonebk", "de-CH-x-phonebk"); } + @Test public void testBasicFormat35() { test("az-arab-x-aze-derbend", "az-Arab-x-aze-derbend"); } + @Test public void testBasicFormat36() { test("x-whatever", "x-whatever"); } + @Test public void testBasicFormat37() { test("qaa-qaaa-qm-x-southern", "qaa-Qaaa-QM-x-southern"); } + @Test public void testBasicFormat38() { test("de-qaaa", "de-Qaaa"); } + @Test public void testBasicFormat39() { test("en-us-u-islamcal", "en-US-u-islamcal"); } + @Test public void 
testBasicFormat40() { test("zh-cn-a-myext-x-private", "zh-CN-a-myext-x-private"); } + @Test public void testBasicFormat41() { test("en-a-myext-b-another", "en-a-myext-b-another"); } + @Test public void testBasicFormat42() { test("en-123", "en-123"); } + @Test public void testBasicFormat43() { test("en-1234", "en-1234"); } + @Test public void testBasicFormat44() { test("en-brs-xxx-latn-gb", "en-brs-xxx-Latn-GB"); } + @Test public void testBasicFormat45() { test("EN-LATN", "en-Latn"); } + @Test public void testBasicFormat46() { test("en-latn-gb", "en-Latn-GB"); } + @Test public void testBasicFormat47() { test("de-ch-w-extend", "de-CH-w-extend"); } + @Test public void testBasicFormat48() { test("de-ch-x-phonebk-morech", "de-CH-x-phonebk-morech"); } + @Test public void testBasicFormat49() { test("x-private", "x-private"); } + @Test public void testBasicFormat50() { test("az-latn-x-latn", "az-Latn-x-latn"); } + @Test public void testBasicFormat51() { test("en-latn-X-DaTa", "en-Latn-x-data"); } + + @Test public void irregular_01() { test("SGN-BE-FR", "sgn-BE-FR"); } + @Test public void irregular_02() { test("sgn-be-fr", "sgn-BE-FR"); } + @Test public void irregular_03() { test("sgn-be-nl", "sgn-BE-NL"); } + @Test public void irregular_04() { test("sgn-ch-de", "sgn-CH-DE"); } + @Test public void irregular_05() { test("i-klingon", "i-klingon"); } + + // Mentioned in RFC 4646 + @Test public void parseCanonical_01() { test("en-ca-x-ca", "en-CA-x-ca"); } + @Test public void parseCanonical_02() { test("EN-ca-X-Ca", "en-CA-x-ca"); } + @Test public void parseCanonical_03() { test("En-Ca-X-Ca", "en-CA-x-ca"); } + @Test public void parseCanonical_04() { test("AZ-latn-x-LATN", "az-Latn-x-latn"); } + @Test public void parseCanonical_05() { test("Az-latn-X-Latn", "az-Latn-x-latn"); } + + @Test public void parseCanonical_10() { test("zh-hant", "zh-Hant"); } + @Test public void parseCanonical_11() { test("zh-latn-wadegile", "zh-Latn-wadegile"); } + @Test public void parseCanonical_12() 
{ test("zh-latn-pinyin", "zh-Latn-pinyin"); } + @Test public void parseCanonical_13() { test("en-us", "en-US"); } + @Test public void parseCanonical_14() { test("EN-Gb", "en-GB"); } + @Test public void parseCanonical_15() { test("qqq-002", "qqq-002"); } + @Test public void parseCanonical_16() { test("ja-latn", "ja-Latn"); } + @Test public void parseCanonical_17() { test("x-local", "x-local"); } + @Test public void parseCanonical_18() { test("he-latn", "he-Latn"); } + @Test public void parseCanonical_19() { test("und", "und"); } + @Test public void parseCanonical_20() { test("nn", "nn"); } + @Test public void parseCanonical_21() { test("ko-latn", "ko-Latn"); } + @Test public void parseCanonical_22() { test("ar-latn", "ar-Latn"); } + @Test public void parseCanonical_23() { test("la-x-liturgic", "la-x-liturgic"); } + @Test public void parseCanonical_24() { test("fa-x-middle", "fa-x-middle"); } + @Test public void parseCanonical_25() { test("qqq-142", "qqq-142"); } + @Test public void parseCanonical_26() { test("bnt", "bnt"); } + @Test public void parseCanonical_27() { test("grc-x-liturgic", "grc-x-liturgic"); } + @Test public void parseCanonical_28() { test("egy-Latn", "egy-Latn"); } + @Test public void parseCanonical_29() { test("la-x-medieval", "la-x-medieval"); } + + private void test(String langString, String expected) { + String result = formatter.apply(langString); + // JUnit4 argument order. + org.junit.Assert.assertEquals(formatterName+"("+langString+"): ", expected, result); + } +} diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java new file mode 100644 index 0000000000..9db126f90f --- /dev/null +++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.langtag;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

/**
 * Tests comparing pairs of {@link LangTag} objects obtained from {@code LangTags.of}:
 * {@code LangTags.sameLangTagAs}, {@code equals} and {@code hashCode}.
 */
public class TestLangTagsOps {

    /** Identical input strings: same tag, equal, same hash code. */
    @Test
    public void sameLangTag_01() {
        sameLangTag(LangTags.of("en-GB"), LangTags.of("en-GB"), true, true, true);
    }

    /** Inputs differing only by case: same tag, but not {@code equals}. */
    @Test
    public void sameLangTag_02() {
        sameLangTag(LangTags.of("en-GB"), LangTags.of("en-gb"), true, false, false);
    }

    /** Different tags: neither the same tag nor {@code equals}. */
    @Test
    public void sameLangTag_03() {
        sameLangTag(LangTags.of("en-GB-Latn"), LangTags.of("en-gb"), false, false, false);
    }

    /**
     * Checks the relationship between two language tags.
     *
     * @param langTag1 first tag
     * @param langTag2 second tag
     * @param sameAs   expected result of {@code LangTags.sameLangTagAs(langTag1, langTag2)}
     * @param equals   expected result of {@code langTag1.equals(langTag2)}
     * @param sameHash when {@code true}, the hash codes must be equal;
     *                 when {@code false}, the hash codes are not checked
     */
    private static void sameLangTag(LangTag langTag1, LangTag langTag2, boolean sameAs, boolean equals, boolean sameHash) {
        expect(sameAs, LangTags.sameLangTagAs(langTag1, langTag2));
        expect(equals, langTag1.equals(langTag2));
        // Only "same hash" is verified: there is no "hash must be different" check.
        if ( sameHash )
            assertEquals(langTag1.hashCode(), langTag2.hashCode());
    }

    /** Asserts that {@code actual} is true when {@code expected} is true, and false otherwise. */
    private static void expect(boolean expected, boolean actual) {
        if ( expected )
            assertTrue(actual);
        else
            assertFalse(actual);
    }
}
