(jena) 01/02: GH-3086: Module jena-langtag

andy Wed, 26 Mar 2025 03:01:44 -0700

This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git


commit 73bfa864cd2973dd2c280df44892c0d68baab43e
Author: Andy Seaborne <[email protected]>
AuthorDate: Tue Mar 25 18:39:09 2025 +0000

    GH-3086: Module jena-langtag
---
 jena-langtag/pom.xml                               |  95 +++
 jena-langtag/src/main/java/module-info.java        |  22 +
 .../org/apache/jena/langtag/InternalLangTag.java   | 179 +++++
 .../java/org/apache/jena/langtag/LangExamples.java |  79 +++
 .../main/java/org/apache/jena/langtag/LangTag.java |  67 ++
 .../org/apache/jena/langtag/LangTagException.java  |  23 +
 .../java/org/apache/jena/langtag/LangTagJDK.java   | 154 ++++
 .../java/org/apache/jena/langtag/LangTagRE.java    | 393 +++++++++++
 .../org/apache/jena/langtag/LangTagRFC5646.java    | 781 +++++++++++++++++++++
 .../java/org/apache/jena/langtag/LangTags.java     | 289 ++++++++
 .../java/org/apache/jena/langtag/SysLangTag.java   |  46 ++
 .../org/apache/jena/langtag/cmd/CmdLangTag.java    |  76 ++
 .../java/org/apache/jena/langtag/TS_LangTag.java   |  32 +
 .../jena/langtag/TestBasicSyntaxLangTags.java      |  76 ++
 .../java/org/apache/jena/langtag/TestLangTag.java  | 240 +++++++
 .../org/apache/jena/langtag/TestLangTagFormat.java | 143 ++++
 .../org/apache/jena/langtag/TestLangTagsOps.java   |  62 ++
 17 files changed, 2757 insertions(+)

diff --git a/jena-langtag/pom.xml b/jena-langtag/pom.xml
new file mode 100644
index 0000000000..4016701806
--- /dev/null
+++ b/jena-langtag/pom.xml
@@ -0,0 +1,95 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+ 
+   See the NOTICE file distributed with this work for additional
+   information regarding copyright ownership.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>jena-langtag</artifactId>
+  <name>Apache Jena - Language tags</name>
+
+  <parent>
+    <groupId>org.apache.jena</groupId>
+    <artifactId>jena</artifactId>
+    <version>5.4.0-SNAPSHOT</version>
+  </parent>
+
+  <description>Implementation of RFC 5646 (BCP-47) Language tags</description>
+
+  <properties>
+    <automatic.module.name>org.apache.jena.langtag</automatic.module.name>
+  </properties>
+
+  <dependencies>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-api</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.junit.platform</groupId>
+      <artifactId>junit-platform-suite-engine</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <!--
+         Test-only, like the other JUnit artifacts in this module.
+         The scope is stated explicitly so the artifact is not pulled in
+         with the default (compile) scope.
+    -->
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-params</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <!--
+         Needed for @Parameterized test suite
+         JUnit5 will eventually have @ParameterizedClass.
+    -->
+    <dependency>
+      <groupId>org.junit.vintage</groupId>
+      <artifactId>junit-vintage-engine</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-resources-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-source-plugin</artifactId>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+      </plugin> 
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-javadoc-plugin</artifactId>
+      </plugin>
+    </plugins>
+
+  </build>
+
+</project>
diff --git a/jena-langtag/src/main/java/module-info.java 
b/jena-langtag/src/main/java/module-info.java
new file mode 100644
index 0000000000..326731548b
--- /dev/null
+++ b/jena-langtag/src/main/java/module-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Module descriptor for the language tag library.
+// Two packages are exported: the main package and its "cmd" subpackage.
+module org.apache.jena.langtag {
+    exports org.apache.jena.langtag;
+    exports org.apache.jena.langtag.cmd;
+}
diff --git 
a/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java
new file mode 100644
index 0000000000..93281be82a
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/InternalLangTag.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * LangTag processing support.
+ */
+class InternalLangTag {
+
+    static List<String> splitOnDash(String x) {
+        List<String> strings = new ArrayList<>(6);
+        // Split efficiently(?) based on [a-z][A-Z][0-9] units separated by 
"-"s
+        StringBuilder sb = new StringBuilder();
+
+        boolean start = true;
+        for ( int idx = 0; idx < x.length(); idx++ ) {
+            char ch = x.charAt(idx);
+            if ( isA2ZN(ch) ) {
+                sb.append(ch);
+                continue;
+            }
+            if ( ch == '-' ) {
+                String str = sb.toString();
+                strings.add(str);
+                sb.setLength(0);
+                continue;
+            }
+            error("Bad character: (0x%02X) '%c' index %d", (int)ch, str(ch), 
idx);
+        }
+        String strLast = sb.toString();
+        if ( strLast.isEmpty() ) {
+            return null;
+            //throw new LangTagException("Empty part: "+x);
+        }
+        strings.add(strLast);
+        return strings;
+    }
+
+    /*package*/ static String strcase(String string) {
+        if ( string == null )
+            return null;
+        if ( string.length() == 2 )
+            return uppercase(string);
+        if ( string.length() == 4 )
+            return titlecase(string);
+        return lowercase(string);
+    }
+
+    /*package*/static String lowercase(String string) {
+        if ( string == null )
+            return null;
+        return string.toLowerCase(Locale.ROOT);
+    }
+
+    /*package*/static String uppercase(String string) {
+        if ( string == null )
+            return null;
+        return string.toUpperCase(Locale.ROOT);
+    }
+
+    /*package*/static String titlecase(String string) {
+        if ( string == null )
+            return null;
+        char ch1 = string.charAt(0);
+        ch1 = Character.toUpperCase(ch1);
+        string = lowercase(string.substring(1));
+        return ch1 + string;
+    }
+
+    /** ASCII A-Z */
+    /*package*/ static boolean isA2Z(int ch) {
+        return range(ch, 'a', 'z') || range(ch, 'A', 'Z');
+    }
+
+    /** ASCII A-Z or 0-9 */
+    /*package*/ static boolean isA2ZN(int ch) {
+        return range(ch, 'a', 'z') || range(ch, 'A', 'Z') || range(ch, '0', 
'9');
+    }
+
+    static void checkDigits(String string, int N, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isNum(ch) )
+                error("Not a DIGIT (%s, posn = %s) in '%s'", str(ch), (i+1), 
string);
+        }
+    }
+
+    static void checkAlpha(String string, int N, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isAlpha(ch) )
+                // 1-based error message
+                error("Not an ALPHA (%s, posn = %s) in '%s'", str(ch), (i+1), 
string);
+        }
+    }
+
+    static boolean isAlpha(String string, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isAlpha(ch) )
+                return false;
+        }
+        return true;
+    }
+
+    static void checkAlphaMinus(String string, int N, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isAlpha(ch) && ! isMinus(ch) )
+                error("Not an ALPHA or MINUS (%s, posn = %s) in '%s'", 
str(ch), (i+1), string);
+        }
+    }
+
+    static void checkAlphaNum(String string, int N, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isAlpha(ch) && ! isNum(ch) )
+                error("Not an ALPHA or DIGITS (%s, posn = %s) in '%s'", 
str(ch), (i+1), string);
+        }
+    }
+
+    static void checkAlphaNumMinus(String string, int N, int start, int end) {
+        for ( int i = start ; i < end ; i++ ) {
+            char ch = string.charAt(i);
+            if ( ! isAlpha(ch) && ! isNum(ch) && ! isMinus(ch) )
+                error("Not an ALPHA, DIGITS or MINUS (%s, posn = %s) in '%s'", 
str(ch), (i+1), string);
+        }
+    }
+
+    /*package*/ static String str(char ch) {
+        return String.format("'%s' U+%04X", Character.valueOf(ch), (int)ch);
+    }
+
+    static boolean isAlpha(char ch) {
+        return ( ch >= 'a' && ch <= 'z' ) || ( ch >= 'A' && ch <= 'Z' );
+    }
+
+    static boolean isNum(char ch) {
+        return ( ch >= '0' && ch <= '9' );
+    }
+
+    static boolean isMinus(char ch) {
+        return ( ch == '-' );
+    }
+
+    /*package*/ static void error(String msg, Object...args) {
+        String x = String.format(msg, args);
+        throw new LangTagException(x);
+    }
+
+    private static boolean range(int ch, char a, char b) {
+        return (ch >= a && ch <= b);
+    }
+
+    /** Case insensitive test of whether a string has a prefix. */
+    static boolean caseInsensitivePrefix(String string, String prefix) {
+        return string.regionMatches(true, 0, prefix, 0, prefix.length());
+    }
+}
diff --git 
a/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java
new file mode 100644
index 0000000000..f1fb4d12a5
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangExamples.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+/**
+ * Example language tags: one array of examples taken from RFC 5646
+ * and one array of tags that RFC 5646 lists as invalid.
+ */
+public class LangExamples {
+
+    // Examples from RFC 5646
+    static String[] examples5646 = {
+       "de",
+       "fr",
+       "ja",
+       "i-enochian",        // (example of a grandfathered tag)
+       "zh-Hant",           // (Chinese written using the Traditional Chinese script)
+       "zh-Hans",           // (Chinese written using the Simplified Chinese script)
+       "sr-Cyrl",           // (Serbian written using the Cyrillic script)
+       "sr-Latn",           // (Serbian written using the Latin script)
+
+    //Extended language subtags and their primary language subtag counterparts:
+       "zh-cmn-Hans-CN",    // (Chinese, Mandarin, Simplified script, as used in China)
+       "cmn-Hans-CN",       // (Mandarin Chinese, Simplified script, as used in China)
+       "zh-yue-HK",         // (Chinese, Cantonese, as used in Hong Kong SAR)
+       "yue-HK",            // (Cantonese, Chinese, as used in Hong Kong SAR)
+    //Language-Script-Region:
+       "zh-Hans-CN",        // (Chinese written using the Simplified script as used in mainland China)
+       "sr-Latn-RS",        // (Serbian written using the Latin script as used in Serbia)
+    //Language-Variant:
+       "sl-rozaj",          // (Resian dialect of Slovenian)
+       "sl-rozaj-biske",    // (San Giorgio dialect of Resian dialect of Slovenian)
+       "sl-nedis",          // (Nadiza dialect of Slovenian)
+    //Language-Region-Variant:
+       "de-CH-1901",        // (German as used in Switzerland using the 1901 variant [orthography])
+       "sl-IT-nedis",       // (Slovenian as used in Italy, Nadiza dialect)
+    //Language-Script-Region-Variant:
+       "hy-Latn-IT-arevela",    // (Eastern Armenian written in Latin script, as used in Italy)
+    //Language-Region:
+       "de-DE",             // (German for Germany)
+       "en-US",             // (English as used in the United States)
+       "es-419",            // (Spanish appropriate for the Latin America and Caribbean region using the UN region code)
+    //Private use subtags:
+       "de-CH-x-phonebk",
+       "az-Arab-x-AZE-derbend",
+    //Private use registry values:
+       "x-whatever",        // (private use using the singleton 'x')
+       "qaa-Qaaa-QM-x-southern", // (all private tags)
+       "de-Qaaa",           // (German, with a private script)
+       "sr-Latn-QM",        // (Serbian, Latin script, private region)
+       "sr-Qaaa-RS",        // (Serbian, private script, for Serbia)
+    //Tags that use extensions
+    // (examples ONLY -- extensions MUST be defined by revision or update to this document, or by RFC):
+       "en-US-u-islamcal",
+       "zh-CN-a-myext-x-private",
+       "en-a-myext-b-another"
+    };
+
+    static String[] examples5646_bad = {
+    //Some Invalid Tags:
+       "de-419-DE", // (two region tags)
+       "a-DE" // (use of a single-character subtag in primary position; note
+              // that there are a few grandfathered tags that start with "i-" that
+              // are valid)
+       // Commented out, so not part of the array:
+       //"ar-a-aaa-b-bbb-a-ccc" // (two extensions with same single-letter prefix)
+    };
+}
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java
new file mode 100644
index 0000000000..541e2c504b
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTag.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import java.util.Locale;
+
+/**
+ * A language tag viewed as its component strings: language, script, region,
+ * variant, extension and private use.
+ * <p>
+ * See {@link LangTagRFC5646} for generating {@code LangTag}s.
+ * Note: the language returned may be the old ISO code;
+ * see the javadoc of {@link Locale#getLanguage()}.
+ * <p>
+ * {@link LangTagJDK} is an alternative version which uses the Java locale
+ * built-in functionality and does not canonicalize language names (replace one name by another).
+ * The JDK {@code Locale} is not fully RFC 5646 compliant.
+ * <p>
+ * Language tags are BCP 47.
+ * <p>
+ * RFCs:
+ * <ul>
+ * <li><a href="https://www.rfc-editor.org/rfc/rfc5646">RFC 5646</a> "Tags for Identifying Languages"
+ * <li><a href="https://www.rfc-editor.org/rfc/rfc4646">RFC 4646</a> "Tags for Identifying Languages"
+ * <li><a href="https://www.rfc-editor.org/rfc/rfc3066">RFC 3066</a> "Tags for the Identification of Languages"
+ * </ul>
+ * Related:
+ * <ul>
+ * <li><a href="https://www.rfc-editor.org/rfc/rfc4647">RFC 4647</a> "Matching of Language Tags"
+ * <li><a href="https://www.rfc-editor.org/rfc/rfc4234">RFC 4234</a> "Augmented BNF for Syntax Specifications: ABNF"
+ * </ul>
+ */
+public sealed interface LangTag permits LangTagJDK, LangTagRFC5646, LangTagRE {
+
+    /**
+     * Formatted according to the RFC 5646 rules.
+     * <p>
+     * {@code toString()} should return the language tag with the same case as it was originally.
+     */
+    public String str();
+
+    // Accessors for the individual parts of the tag.
+    // NOTE(review): LangTagJDK returns null for a part that is absent or empty;
+    // confirm that the other implementations follow the same convention.
+    public String getLanguage();
+    public String getScript();
+    public String getRegion();
+    public String getVariant();
+    public String getExtension();
+    public String getPrivateUse();
+
+    // Redeclared so that implementations are reminded to provide value-based versions.
+    @Override public int hashCode();
+    @Override public boolean equals(Object other);
+    @Override public String toString();
+}
diff --git 
a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java
new file mode 100644
index 0000000000..c68073b934
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagException.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+public class LangTagException extends RuntimeException {
+    public LangTagException(String msg) { super(msg); }
+}
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java
new file mode 100644
index 0000000000..60ce5e910f
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagJDK.java
@@ -0,0 +1,154 @@
+/*
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  See the NOTICE file distributed with this work for additional
+ *  information regarding copyright ownership.
+ */
+
+package org.apache.jena.langtag;
+
+import java.util.IllformedLocaleException;
+import java.util.Locale;
+import java.util.Locale.Builder;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * LangTag parsing.
+ * <p>
+ * A layer over the JDK {@link Locale} and {@link Builder} to introduce a 
class without legacy langtag conversion.
+ * {@link LangTag}.
+ * <p>
+ * This is not RFC 5646 compliant.
+ * <ul>
+ * <li>Does not handle language subtags (e.g. "zh-cmn-Hans-CN")</li>
+ * <li>Does not handle grandfathered language tags e.g. "i-enochian"</li>
+ * <li>Multiple variant subtags</li>
+ * <li>Legacy "en-GB-oed" - "oed" is a 3 letter script (script is 4 by the 
grammar/)</li>
+ * <ul>
+ */
+public final class LangTagJDK implements LangTag {
+    private final String langTagAsGiven;
+    private final String fmtString;
+    private final String lang;
+    private final String script;
+    private final String region;
+    private final String variant;
+    private final String extension;
+    // Not supported by the JDK (part of extensions).
+    private final String privateUse;
+
+    private static Locale.Builder locBuild = new Locale.Builder();
+
+    public static LangTag create(String string) {
+        try {
+            locBuild.clear();
+            locBuild.setLanguageTag(string);
+            return asLangTag(string, locBuild);
+        } catch (IllformedLocaleException ex) {
+            return null;
+        }
+    }
+
+    private LangTagJDK(String langTagAsGiven, String fmtString, String 
language, String script, String region, String variant, String extension, 
String privateUse) {
+        this.langTagAsGiven = langTagAsGiven;
+        this.fmtString  = Objects.requireNonNull(fmtString);
+        this.lang       = maybe(language);
+        this.script     = maybe(script);
+        this.region     = maybe(region);
+        this.variant    = maybe(variant);
+        this.extension  = maybe(extension);
+        this.privateUse = maybe(privateUse);
+    }
+
+    private static String maybe(String x) {
+        // Choice.
+        if ( x == null )
+            return null;
+        if ( x.isEmpty() )
+            return null;
+        return x;
+    }
+
+    @Override public String str() { return fmtString; }
+
+    @Override public String getLanguage() { return lang; }
+    @Override public String getScript() { return script; }
+    @Override public String getRegion() { return region; }
+    @Override public String getVariant() { return variant; }
+    @Override public String getExtension() { return extension; }
+    @Override public String getPrivateUse() { return privateUse; }
+
+    public static String canonical(String str) {
+        try {
+            // Does not do conversion of language for ISO 639 codes that have 
changed.
+            return locBuild.setLanguageTag(str).build().toLanguageTag();
+        } catch (IllformedLocaleException ex) {
+            return str;
+        }
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(langTagAsGiven, fmtString,
+                            lang, script, region, variant,
+                            extension, privateUse);
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if ( this == obj )
+            return true;
+        if ( !(obj instanceof LangTagJDK) )
+            return false;
+        LangTagJDK other = (LangTagJDK)obj;
+        return Objects.equals(lang, other.lang)
+                && Objects.equals(script, other.script)
+                && Objects.equals(region, other.region)
+                && Objects.equals(variant, other.variant)
+                && Objects.equals(extension, other.extension)
+                && Objects.equals(privateUse, other.privateUse)
+                && Objects.equals(langTagAsGiven, other.langTagAsGiven)
+                && Objects.equals(fmtString, other.fmtString);
+    }
+
+    private static Character privateUseSingleton = Character.valueOf('x');
+
+    private static LangTag asLangTag(String string, Locale.Builder locBuild) {
+        Locale locale = locBuild.build();
+        Set<Character> extkeys = locale.getExtensionKeys();
+        StringBuilder sb1 = new StringBuilder();
+        StringBuilder sb2 = new StringBuilder();
+        for ( Character k : extkeys ) {
+            String ext = locale.getExtension(k);
+            StringBuilder sb = sb1;
+            if ( privateUseSingleton.equals(k) )
+                sb = sb2;
+            if ( sb.length() != 0 )
+                sb.append('-');
+            sb.append(k);
+            sb.append('-');
+            sb.append(ext);
+        }
+        String extension = sb1.toString();
+        String privateUse = sb2.toString();
+        return new LangTagJDK(string,
+                              locale.toLanguageTag(),
+                              locale.getLanguage(),
+                              locale.getScript(),
+                              locale.getCountry(),
+                              locale.getVariant(),
+                              extension,
+                              privateUse);
+    }
+}
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java
new file mode 100644
index 0000000000..760fd75c97
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRE.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public final class LangTagRE implements LangTag {
+
+    public static LangTag create(String string) {
+        LangTag langTagRE = new LangTagRE(string);
+        return langTagRE;
+    }
+
+    private final String string;
+    private final String[] parts;
+
+    private LangTagRE(String string) {
+        this.string = string;
+        this.parts = LangTagByRE.parse(string);
+    }
+
+    @Override
+    public String str() {
+        return null;
+    }
+
+    @Override
+    public String getLanguage() {
+        return parts[idxLanguage];
+    }
+
+    @Override
+    public String getScript() {
+        return parts[idxScript];
+    }
+
+    @Override
+    public String getRegion() {
+        return parts[idxRegion];
+    }
+
+    @Override
+    public String getVariant() {
+        return parts[idxVariant];
+    }
+
+    @Override
+    public String getExtension() {
+        return parts[idxExtension];
+    }
+
+    @Override
+    public String getPrivateUse() {
+        return parts[idxPrivateUse];
+    }
+
+    /*package*/ static final int  idxLanguage  = 0;
+    /*package*/ static final int  idxScript    = 1;
+    /*package*/ static final int  idxRegion    = 2;
+    /*package*/ static final int  idxVariant   = 3;
+    /*package*/ static final int  idxExtension = 4;
+    /*package*/ static final int  idxPrivateUse = 5;
+
+    /** Language tag handled with regular expressions. */
+    static class LangTagByRE {
+        /**
+         * Language tags: support for parsing and canonicalization of case.
+         * Grandfathered forms ("i-") are left untouched. Unsupported or 
syntactically
+         * illegal forms are handled in canonicalization by doing nothing.
+         * <ul>
+         * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC 4646</a></li>
+         * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC 4647</a></li>
+         * <li>Language tags syntax (BCP 47): <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a></li>
+         * </ul>
+         */
+
+        // Valid language tag, not irregular, not grand-fathered.
+
+        private static final int partsLength  = 6;
+
+        private LangTagByRE() {}
+
+        // Defined by BCP 47 which is currently RFC 5646 and which obsoletes RFC 4646.
+
+        // Canonical forms:
+        /*
+         * RFC 4646 In this format, all non-initial two-letter subtags are
+         * uppercase, all non-initial four-letter subtags are titlecase, and 
all
+         * other subtags are lowercase.
+         */
+        /*
+         * RFC 5646 An implementation can reproduce this format without 
accessing
+         * the registry as follows. All subtags, including extension and 
private use
+         * subtags, use lowercase letters with two exceptions: two-letter and
+         * four-letter subtags that neither appear at the start of the tag nor 
occur
+         * after singletons. Such two-letter subtags are all uppercase (as in 
the
+         * tags "en-CA-x-ca" or "sgn-BE-FR") and four- letter subtags are 
titlecase
+         * (as in the tag "az-Latn-x-latn").
+         */
+
+        /*
+         * ABNF definition: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a>
+         *
+Language-Tag  = langtag            ; normal language tags
+               / privateuse         ; private use tag
+               / grandfathered      ; grandfathered tags
+
+ langtag       = language
+                 ["-" script]
+                 ["-" region]
+                 *("-" variant)
+                 *("-" extension)
+                 ["-" privateuse]
+
+ language      = 2*3ALPHA           ; shortest ISO 639 code
+                 ["-" extlang]      ; sometimes followed by
+                                    ; extended language subtags
+               / 4ALPHA             ; or reserved for future use
+               / 5*8ALPHA           ; or registered language subtag
+
+ extlang       = 3ALPHA             ; selected ISO 639 codes
+                 *2("-" 3ALPHA)     ; permanently reserved
+
+ script        = 4ALPHA             ; ISO 15924 code
+
+ region        = 2ALPHA             ; ISO 3166-1 code
+               / 3DIGIT             ; UN M.49 code
+
+ variant       = 5*8alphanum        ; registered variants
+               / (DIGIT 3alphanum)
+
+ extension     = singleton 1*("-" (2*8alphanum))
+
+                                    ; Single alphanumerics
+                                    ; "x" reserved for private use
+ singleton     = DIGIT              ; 0 - 9
+               / %x41-57            ; A - W
+               / %x59-5A            ; Y - Z
+               / %x61-77            ; a - w
+               / %x79-7A            ; y - z
+
+ privateuse    = "x" 1*("-" (1*8alphanum))
+
+ grandfathered = irregular          ; non-redundant tags registered
+               / regular            ; during the RFC 3066 era
+
+ irregular     = "en-GB-oed"        ; irregular tags do not match
+               / "i-ami"            ; the 'langtag' production and
+               / "i-bnn"            ; would not otherwise be
+               / "i-default"        ; considered 'well-formed'
+               / "i-enochian"       ; These tags are all valid,
+               / "i-hak"            ; but most are deprecated
+               / "i-klingon"        ; in favor of more modern
+               / "i-lux"            ; subtags or subtag
+               / "i-mingo"          ; combination
+               / "i-navajo"
+               / "i-pwn"
+               / "i-tao"
+               / "i-tay"
+               / "i-tsu"
+               / "sgn-BE-FR"
+               / "sgn-BE-NL"
+               / "sgn-CH-DE"
+
+ regular       = "art-lojban"       ; these tags match the 'langtag'
+               / "cel-gaulish"      ; production, but their subtags
+               / "no-bok"           ; are not extended language
+               / "no-nyn"           ; or variant subtags: their meaning
+               / "zh-guoyu"         ; is defined by their registration
+               / "zh-hakka"         ; and all of these are deprecated
+               / "zh-min"           ; in favor of a more modern
+               / "zh-min-nan"       ; subtag or sequence of subtags
+               / "zh-xiang"
+
+ alphanum      = (ALPHA / DIGIT)    ; letters and numbers
+          */
+
+        // Regular expression fragments, one per part of the 'langtag' production above.
+        // language: 2-3 letters with up to three 3-letter extlang subtags, or 4 letters, or 5-8 letters.
+        private static final String languageRE_1         = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})";
+        private static final String languageRE_2         = "[a-zA-Z]{4}";
+        private static final String languageRE_3         = "[a-zA-Z]{5,8}";
+        // An alternation: only use inside a group.
+        private static final String language             = languageRE_1 + "|" + languageRE_2 + "|" + languageRE_3;
+
+        private static final String script               = "[a-zA-Z]{4}";
+        // An alternation: only use inside a group.
+        private static final String region               = "[a-zA-Z]{2}|[0-9]{3}";
+
+        private static final String variant1             = "(?:[a-zA-Z0-9]{5,8}|[0-9][a-zA-Z0-9]{3})";
+        private static final String variant              = variant1 + "(?:-" + variant1 + ")*";
+
+        private static final String extension1           = "(?:[a-wyzA-WYZ0-9](?:-[a-zA-Z0-9]{2,8})+)"; // Not 'x'
+        private static final String extension            = extension1 + "(?:-" + extension1 + ")*";
+
+        private static final String privateuse           = "[xX](?:-[a-zA-Z0-9]{1,8})+";
+
+        // Six capturing groups, in the order of the idx* constants.
+        private static final String langtag              = String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))*(?:-(%s))?(?:-(%s))?$",
+                                                                         language, script, region, variant, extension, privateuse);
+
+        // This is for the "i-" forms only. The leading 'i' is matched in lower case only.
+        private static final String grandfatheredRE      = "^i(?:-[a-zA-Z0-9]{2,8}){1,2}$";
+        private static final String privateUseLangRE     = "^"+privateuse+"$";
+
+        // Compiled once; Pattern objects are immutable so the fields are final.
+        private static final Pattern pattern              = Pattern.compile(langtag);
+        private static final Pattern patternGrandfathered = Pattern.compile(grandfatheredRE);
+        private static final Pattern privateUseLang       = Pattern.compile(privateUseLangRE);
+        // Not anchored: the caller decides between whole-string and substring matching.
+        private static final Pattern enOED                = Pattern.compile("en-GB-oed", Pattern.CASE_INSENSITIVE);
+
+        /**
+         * Validate - basic syntax check for a language tags: [a-zA-Z]+ 
('-'[a-zA-Z0-9]+)*
+         */
+        /*package*/ static boolean check(String languageTag) {
+            // Subtags must be non-empty and separated by single '-' characters.
+            // The first subtag is checked as letters only (see checkPart).
+            // NOTE(review): an empty string falls out of the loop and returns true - confirm this is intended.
+            final int end = languageTag.length();
+            int pos = 0;
+            boolean leading = true;
+            while ( pos < end ) {
+                final int partEnd = checkPart(languageTag, pos, leading);
+                leading = false;
+                if ( partEnd == pos )
+                    // Zero length subtag (leading dash, double dash or illegal character).
+                    return false;
+                if ( partEnd == end )
+                    // Last subtag ran to the end of the string.
+                    return true;
+                if ( languageTag.charAt(partEnd) != '-' )
+                    return false;
+                // Step over the separator.
+                pos = partEnd + 1;
+                if ( pos == end )
+                    // Trailing dash.
+                    return false;
+            }
+            return true;
+        }
+
+        /**
+         * Scan one subtag starting at {@code idx}.
+         *
+         * @param languageTag the string being checked
+         * @param idx start index of the subtag
+         * @param leader true for the first subtag, which is tested with {@code InternalLangTag.isA2Z};
+         *     other subtags are tested with {@code InternalLangTag.isA2ZN}
+         * @return index of the first character that is not acceptable, or the string length
+         */
+        private static int checkPart(String languageTag, int idx, boolean leader) {
+            final int end = languageTag.length();
+            int pos = idx;
+            while ( pos < end ) {
+                int ch = languageTag.charAt(pos);
+                boolean acceptable = leader ? InternalLangTag.isA2Z(ch) : InternalLangTag.isA2ZN(ch);
+                if ( !acceptable )
+                    break;
+                pos++;
+            }
+            return pos;
+        }
+
+        /**
+         * Parse a langtag string and return its parts. See the {@code idx*}
+         * constants for the array contents. Parts not present cause a null in
+         * the return array.
+         * <p>
+         * For a string matching the main 'langtag' pattern, each part is put into
+         * canonical case: language, variant, extension and private use in lower case,
+         * script in title case, region in upper case.
+         * <p>
+         * Otherwise, a grandfathered "i-" form is returned as given in the language slot,
+         * a whole-tag private use form ("x-...") is returned as given in the private use slot,
+         * and "en-GB-oed" (in any case) is returned as language "en", region "GB", variant "oed".
+         *
+         * @param languageTag the language tag string
+         * @return Langtag parts, or null if the input string does not parse as a lang tag.
+         */
+        /*package*/ static String[] parse(String languageTag) {
+            String[] parts = new String[partsLength];
+
+            Matcher m = pattern.matcher(languageTag);
+            if ( !m.find() ) {
+                m = patternGrandfathered.matcher(languageTag);
+                if ( m.find() ) {
+                    parts[idxLanguage] = m.group(0);
+                    return parts;
+                }
+                // Private use language, not extension.
+                m = privateUseLang.matcher(languageTag);
+                if ( m.find() ) {
+                    parts[idxPrivateUse] = m.group(0);
+                    return parts;
+                }
+
+                // Irregular
+                // The enOED pattern is not anchored, so require the whole string to match.
+                // Using find() would accept any string that merely contains "en-GB-oed"
+                // and silently drop the rest of the input.
+                m = enOED.matcher(languageTag);
+                if ( m.matches() ) {
+                    parts[idxLanguage] = "en";
+                    parts[idxRegion] = "GB";
+                    parts[idxVariant] = "oed";
+                    return parts;
+                }
+
+                // Give up.
+                return null;
+            }
+
+            // Capturing groups 1..6 correspond to array slots 0..5.
+            int gc = m.groupCount();
+            for (int i = 0; i < gc; i++)
+                parts[i] = m.group(i + 1);
+
+            parts[idxLanguage] = lowercase(parts[idxLanguage]);
+            parts[idxScript] = titlecase(parts[idxScript]);
+            parts[idxRegion] = uppercase(parts[idxRegion]);
+            parts[idxVariant] = lowercase(parts[idxVariant]);
+            parts[idxExtension] = lowercase(parts[idxExtension]);
+            parts[idxPrivateUse] = lowercase(parts[idxPrivateUse]);
+            return parts;
+        }
+
+        /**
+         * Canonicalize with the rules of RFC 4646, or RFC 5646 without replacement of preferred form.
+         * If no canonical form is produced, the input string is returned unchanged.
+         *
+         * @param str the language tag string; may be null
+         * @return the formatted language tag, the input string, or null for null input
+         */
+        /*package*/ static String canonical(String str) {
+            if ( str == null )
+                return null;
+            // Could try to apply the case-setting rules
+            // even though it's not a conforming langtag.
+            final String formatted = canonical(parse(str));
+            return ( formatted != null ) ? formatted : str;
+        }
+
+        /**
+         * Build the language tag string from an array of parts, as produced by
+         * {@code parse}: the non-null parts are joined with "-".
+         * <p>
+         * RFC 4646: "In this format, all non-initial two-letter subtags are uppercase,
+         * all non-initial four-letter subtags are titlecase, and all other subtags
+         * are lowercase." The case-setting itself is done when parsing.
+         * <p>
+         * This is the same as RFC5646 without replacement of preferred form
+         * or consulting the registry.
+         *
+         * @param parts array of parts indexed by the {@code idx*} constants; may be null
+         * @return the joined string; null if {@code parts} is null;
+         *     the extension slot if there is no language part
+         */
+        /*package*/ static String canonical(String[] parts) {
+            // We canonicalised parts on parsing.
+            if ( parts == null )
+                return null;
+
+            final String head = parts[0];
+            if ( head == null ) {
+                // No language part.
+                // NOTE(review): parse() leaves the language slot empty only for whole-tag
+                // private use, which it stores in the private use slot, so this returns
+                // null for such input - confirm the intended slot.
+                return parts[idxExtension];
+            }
+
+            StringBuilder out = new StringBuilder(head);
+            for ( int i = 1 ; i < parts.length ; i++ ) {
+                final String part = parts[i];
+                if ( part == null )
+                    continue;
+                out.append("-").append(part);
+            }
+            return out.toString();
+        }
+
+        // The basic formatting rule: length 2 is uppercase, length 4 is titlecase, otherwise lowercase.
+        private static String strcase_unused(String string) {
+            if ( string == null )
+                return null;
+            return switch ( string.length() ) {
+                case 2 -> uppercase(string);
+                case 4 -> titlecase(string);
+                default -> lowercase(string);
+            };
+        }
+
+        /** Lower case, independent of the default locale; null for null input. */
+        private static String lowercase(String string) {
+            return ( string == null ) ? null : string.toLowerCase(Locale.ROOT);
+        }
+
+        /** Upper case, independent of the default locale; null for null input. */
+        private static String uppercase(String string) {
+            return ( string == null ) ? null : string.toUpperCase(Locale.ROOT);
+        }
+
+        /**
+         * Title case: first character in upper case, the remainder in lower case.
+         * Returns null for null input. The string must not be empty.
+         */
+        private static String titlecase(String string) {
+            if ( string == null )
+                return null;
+            final char initial = Character.toUpperCase(string.charAt(0));
+            final String rest = string.substring(1).toLowerCase(Locale.ROOT);
+            return initial + rest;
+        }
+    }
+}
diff --git 
a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
new file mode 100644
index 0000000000..f75659831c
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
@@ -0,0 +1,781 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+/**
+ * An implementation of parsing and formatting.
+ * <a href="https://datatracker.ietf.org/doc/html/rfc5646">RFC 5646</a>
+ * <p>
+ * This implementation does not replace languages by their preferred form (e.g.
+ * "i-klingon" has preferred form of "tlh", "zh-xiang" has a preferred form of "hsn").
+ * </p>
+ * <p>
+ * <a href="https://www.rfc-editor.org/info/rfc5646">RFC 5646: Tags for Identifying Languages</a>
+ * </p>
+ */
+public final  class LangTagRFC5646 implements LangTag{
+    // The language tag as given.
+    private final String langTagString;
+
+    // Grandfathered
+    private boolean isGrandfathered = false;
+    // Private use of the whole Language-Tag
+    private boolean isPrivateUseLanguage = false;
+
+    /* Formatting: https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1
+     *
+     * All subtags, including extension and private use subtags,
+     * use lowercase letters with two exceptions: two-letter
+     * and four-letter subtags that neither appear at the start of the tag
+     * nor occur after singletons.  Such two-letter subtags are all
+     * uppercase (as in the tags "en-CA-x-ca" or "sgn-BE-FR") and four-
+     * letter subtags are titlecase (as in the tag "az-Latn-x-latn").
+     *
+     * See str()
+     */
+
+    // Helpers
+    private enum CaseRule { TITLE, LOWER, UPPER }
+    private enum CharSet { ALPHA, ALPHANUM }
+
+    /**
+     * Create a {@link LangTag} by parsing the given string.
+     *
+     * @param string the language tag string
+     * @return the parsed language tag
+     */
+    public static LangTag create(String string) {
+        return parser(string);
+    }
+
+    // Start/Finish indexes, excluding the initial '-'
+    private int language0 = -1 ;
+    private int language1 = -1 ;
+
+    private int script0 = -1 ;
+    private int script1 = -1 ;
+
+    private int region0 = -1 ;
+    private int region1 = -1 ;
+
+    private int variant0 = -1 ;
+    private int variant1 = -1 ;
+
+    // All extensions.
+    private int extension0 = -1 ;
+    private int extension1 = -1 ;
+
+    // Private use sub tag (not private use of the whole language tag, which 
starts "x-").
+    private int privateuse0 = -1 ;
+    private int privateuse1 = -1 ;
+
+    /**
+     * The language part in lower case, or null if no language was recorded.
+     * For a grandfathered tag other than "en-GB-oed", the parser records the whole string as the language.
+     */
+    @Override
+    public String getLanguage() {
+        return getSubTag("Language", langTagString, language0, language1, 
CaseRule.LOWER);
+    }
+
+    /** The script subtag in title case, or null if no script was recorded. */
+    @Override
+    public String getScript() {
+        return getSubTag("Script", langTagString, script0, script1, 
CaseRule.TITLE);
+    }
+
+    /** The region subtag in upper case, or null if no region was recorded. */
+    @Override
+    public String getRegion() {
+        return getSubTag("Region", langTagString, region0, region1, 
CaseRule.UPPER);
+    }
+
+    /**
+     * The variant part in lower case, or null if no variant was recorded.
+     * The recorded range runs from the start of the first variant subtag to the end of the last one.
+     */
+    @Override
+    public String getVariant() {
+        return getSubTag("Variant", langTagString, variant0, variant1, 
CaseRule.LOWER);
+    }
+
+    /**
+     * The extension part in lower case, or null if no extension was recorded.
+     * The recorded range covers all extensions, from the first singleton onwards.
+     */
+    @Override
+    public String getExtension() {
+        return getSubTag("Extension", langTagString, extension0, extension1, 
CaseRule.LOWER);
+    }
+
+    /**
+     * The private use part in lower case, including its leading "x", or null if none was recorded.
+     * For a tag that is private use as a whole ("x-..."), the parser records the range from index 0.
+     */
+    @Override
+    public String getPrivateUse() {
+        return getSubTag("Private", langTagString, privateuse0, privateuse1, 
CaseRule.LOWER);
+    }
+
+    /**
+     * Hash code over the language tag string as given, every parse point and both flags.
+     * This is the same set of fields that {@link #equals(Object)} compares,
+     * including the region indexes.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(langTagString,
+                            language0, language1, script0, script1, region0, region1,
+                            variant0, variant1, extension0, extension1, privateuse0, privateuse1,
+                            isGrandfathered, isPrivateUseLanguage);
+    }
+
+    /**
+     * {@code .equals} and {@code .hashCode}
+     * provide "same immutable object" semantics.
+     * The language tags are treated case-sensitively: two objects are equal only if
+     * they have the same parse points, the same flags and the same string as given.
+     * <p>
+     * See {@code LangTagOps.sameLangTagAs} for equivalent language tags.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if ( this == obj )
+            return true;
+        if ( !(obj instanceof LangTagRFC5646 other) )
+            return false;
+        // Cheap comparisons first: flags and parse points, in order of appearance in a tag.
+        if ( isGrandfathered != other.isGrandfathered || isPrivateUseLanguage != other.isPrivateUseLanguage )
+            return false;
+        if ( language0 != other.language0 || language1 != other.language1 )
+            return false;
+        if ( script0 != other.script0 || script1 != other.script1 )
+            return false;
+        if ( region0 != other.region0 || region1 != other.region1 )
+            return false;
+        if ( variant0 != other.variant0 || variant1 != other.variant1 )
+            return false;
+        if ( extension0 != other.extension0 || extension1 != other.extension1 )
+            return false;
+        if ( privateuse0 != other.privateuse0 || privateuse1 != other.privateuse1 )
+            return false;
+        // Finally, the string itself.
+        return Objects.equals(langTagString, other.langTagString);
+    }
+
+    /**
+     * Return the lang tag exactly as given.
+     * Use {@link #str()} for the language tag formatted by the rules of RFC 
5646.
+     */
+    @Override
+    public String toString() {
+        return langTagString;
+    }
+
+    /**
+     * Return the language tag with the case of each part set by the formatting rules
+     * of RFC 5646 (section 2.1.1).
+     * <p>
+     * A tag that is private use as a whole is returned in lower case.
+     * The irregular tags "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE" and the irregular "i-" tags
+     * are returned in their registered form. Otherwise the result is the parts,
+     * each in its own case, joined by "-".
+     */
+    @Override
+    public String str() {
+        if ( isPrivateUseLanguage )
+            return InternalLangTag.lowercase(langTagString);
+
+        // Some irregular special cases.
+        if ( InternalLangTag.caseInsensitivePrefix(langTagString, "sgn-") ) {
+            // "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE"
+            if ( langTagString.equalsIgnoreCase("sgn-BE-FR") )
+                return "sgn-BE-FR";
+            if ( langTagString.equalsIgnoreCase("sgn-BE-NL") )
+                return "sgn-BE-NL";
+            if ( langTagString.equalsIgnoreCase("sgn-CH-DE") )
+                return "sgn-CH-DE";
+        }
+
+        if ( langTagString.startsWith("i-") || langTagString.startsWith("I-") ) {
+            String lcLangTagStr = InternalLangTag.lowercase(langTagString);
+            if ( irregular_i.contains(lcLangTagStr) )
+                return lcLangTagStr;
+        }
+
+        // Local, single-threaded use: StringBuilder, not the synchronized StringBuffer.
+        StringBuilder sb = new StringBuilder();
+        add(sb, getLanguage());
+        add(sb, getScript());
+        add(sb, getRegion());
+        add(sb, getVariant());
+        add(sb, getExtension());
+        add(sb, getPrivateUse());
+        return sb.toString();
+    }
+
+    /** Append a subtag, preceded by '-' if the builder already has content. A null subtag is skipped. */
+    private void add(StringBuilder sb, String subtag) {
+        if ( subtag == null )
+            return;
+        if ( ! sb.isEmpty() )
+            sb.append('-');
+        sb.append(subtag);
+    }
+
+    /**
+     * Extract one part of a language tag string and set its case.
+     *
+     * @param label name of the part, used in error messages
+     * @param string the language tag string
+     * @param start start index of the part, or -1 if the part is not present
+     * @param finish index after the last character of the part
+     * @param format the case rule to apply
+     * @return the part in the requested case, or null if {@code start} is -1
+     * @throws InternalError if the indexes are inconsistent
+     */
+    private static String getSubTag(String label, String string, int start, int finish, CaseRule format) {
+        // Part not present.
+        if ( start == -1 )
+            return null;
+        // Consistency checks on the recorded indexes.
+        if ( finish == -1 ) {
+            String msg = InternalLangTag.titlecase(label)+" start is set but not subtag end: "+string;
+            throw new InternalError(msg);
+        }
+        if ( start >= finish ) {
+            String msg = InternalLangTag.titlecase(label)+" start index is after "+InternalLangTag.lowercase(label)+" end index: "+string;
+            throw new InternalError(msg);
+        }
+        final String subtag = string.substring(start, finish);
+        final String formatted = switch(format) {
+            case LOWER -> InternalLangTag.lowercase(subtag);
+            case UPPER -> InternalLangTag.uppercase(subtag);
+            case TITLE -> InternalLangTag.titlecase(subtag);
+        };
+        return formatted;
+    }
+
+    /**
+     * Parse a language tag string following the syntax of RFC 5646.
+     * <p>
+     * The returned object keeps the string as given and records, for each part
+     * (language, script, region, variants, extensions, private use), the start and
+     * finish index of that part within the string. Parts that are not present keep
+     * the index value -1.
+     * <p>
+     * Syntax errors are reported by calling {@code InternalLangTag.error}.
+     *
+     * @param string the language tag string; must not be null
+     * @return the parsed language tag
+     */
+    private static LangTagRFC5646 parser(String string) {
+
+        // A segment is a sequence of A2ZN characters separated by '-'.
+
+        LangTagRFC5646 langtag = new LangTagRFC5646(string);
+        final int N = string.length();
+        // Language-Tag  = langtag             ; normal language tags
+        //               / privateuse          ; private use tag
+        //               / grandfathered       ; grandfathered tags
+
+        // langtag       = language
+        //                 ["-" script]
+        //                 ["-" region]
+        //                 *("-" variant)
+        //                 *("-" extension)
+        //                 ["-" privateuse]
+
+        // script        = 4ALPHA              ; ISO 15924 code
+        // region        = 2ALPHA              ; ISO 3166-1 code
+        //               / 3DIGIT              ; UN M.49 code
+        // variant       = 5*8alphanum         ; registered variants
+        //               / (DIGIT 3alphanum)
+        // extension     = singleton 1*("-" (2*8alphanum))
+
+        if ( N == 0 )
+            InternalLangTag.error("Empty string");
+
+        // -------------------
+        // language      = (2*3ALPHA [ extlang ]); shortest ISO 639 code
+        //               / 4ALPHA                ; reserved for future use
+        //               / 5*8ALPHA              ; registered language subtag
+        // extlang       = 3ALPHA              ; selected ISO 639 codes
+        //                 *2("-" 3ALPHA)      ; permanently reserved
+
+        // Grandfathered
+        // Must check first because the whole string (except "en-GB-oed") is the "language"
+
+        if ( grandfathered(string) ) {
+            // Regular:
+            // "each tag, in its entirety, represents a language or collection of languages."
+            //
+            // Irregular:
+            // With the exception of "en-GB-oed", which is a
+            // variant of "en-GB", each of them, in its entirety,
+            // represents a language.
+            //
+            langtag.language0 = 0;
+            langtag.language1 = N;
+            langtag.isGrandfathered = true;
+            // Exception.
+            if ( string.equalsIgnoreCase("en-GB-oed") ) {
+                // "oed" is "Oxford English Dictionary spelling"
+                // Better is the replacement "en-GB-oxendict"
+                langtag.language0 = 0;
+                langtag.language1 = 2;
+                langtag.region0 = 3;
+                langtag.region1 = 5;
+                // Non-standard variant.
+                langtag.variant0 = 6;
+                langtag.variant1 = N;
+            }
+            return langtag;
+        }
+
+        // -- language
+
+        int idx = 0;
+        int idx2 = segmentNextFinish(string, N, idx);
+        int segLen = segmentLength(N, idx, idx2);
+
+        // Private use in the language position.
+        if ( segLen == 1 ) {
+            if ( string.startsWith("x-") || string.startsWith("X-") ) {
+                /*
+                The primary language subtag is the first subtag in a language tag and
+                cannot be omitted, with two exceptions:
+
+                o  The single-character subtag 'x' as the primary subtag indicates
+                   that the language tag consists solely of subtags whose meaning is
+                   defined by private agreement.  For example, in the tag "x-fr-CH",
+                   the subtags 'fr' and 'CH' do not represent the French language or
+                   the country of Switzerland (or any other value in the IANA
+                   registry) unless there is a private agreement in place to do so.
+                   See Section 4.6.
+                */
+                langtag.isPrivateUseLanguage = true;
+                int idxPrivateUseStart = 0;
+                int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart+segLen, 1, 8);
+                langtag.privateuse0 = idxPrivateUseStart;
+                langtag.privateuse1 = idxPrivateUseEnd;
+                if ( langtag.privateuse1 < N )
+                    InternalLangTag.error("Trailing characters in private langtag: '%s'", string.substring(langtag.privateuse1));
+                return langtag;
+            }
+            InternalLangTag.error("Language part is 1 character: it must be 2-3 characters (4-8 reserved for future use), \"x-\", or a recognized grandfathered tag");
+        }
+
+        if ( idx2 < 0 ) {
+            // language only.
+            if ( segLen > 8 )
+                InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
+            langtag.language0 = 0;
+            langtag.language1 = N;
+            InternalLangTag.checkAlpha(string, N, langtag.language0, langtag.language1);
+            return langtag;
+        }
+
+        if ( idx == idx2 )
+            InternalLangTag.error("Can not find the language subtag: '%s'", string);
+
+        if ( segLen < 2 || segLen > 4 )
+            InternalLangTag.error("Language: '%s'", string);
+
+        langtag.language0 = idx;
+
+        if ( segLen == 2 || segLen == 3 ) {
+            // -- Language extension subtags/
+//            language      = 2*3ALPHA            ; shortest ISO 639 code
+//                            ["-" extlang]
+//            extlang       = 3ALPHA              ; selected ISO 639 codes
+//                            *2("-" 3ALPHA)      ; permanently reserved
+            int extStart = idx+segLen;
+            InternalLangTag.checkAlpha(string, N, langtag.language0, extStart);
+            // Extensions are 1 to 3 3ALPHA subtags
+            int extEnd = maybeSubtags(string, N, extStart, 3, 3);
+            if ( extEnd > extStart ) {
+                idx2 = extEnd;
+                // NOTE(review): langtag.language1 is still -1 at this point (it is assigned
+                // after this block), so the range passed here does not cover the extlang
+                // subtags. extEnd looks like the intended finish index - confirm against
+                // InternalLangTag.checkAlphaMinus and maybeSubtags before changing.
+                InternalLangTag.checkAlphaMinus(string, N, extStart, langtag.language1);
+            }
+        } else if ( segLen > 8 ) {
+            // NOTE(review): not reachable - a segment longer than 4 has already been rejected above.
+            InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
+        }
+        // -- extlang
+        langtag.language1 = idx2;
+        // Info
+        noteSegment("language", string, langtag.language0, langtag.language1);
+
+        // Move on - next subtag
+        idx = segmentNextStart(N, idx, idx2);
+        idx2 = segmentNextFinish(string, N, idx);
+        segLen = segmentLength(N, idx, idx2);
+        // -- End langtag
+
+        // ---- script
+        // script        = 4ALPHA              ; ISO 15924 code
+        if ( segLen == 4 && InternalLangTag.isAlpha(string.charAt(idx)) ) {
+            // Script
+            // Not a digit - which is a variant.
+            // variant       = ... / (DIGIT 3alphanum)
+            langtag.script0 = idx;
+            langtag.script1 = idx+segLen;
+            InternalLangTag.checkAlpha(string, N, langtag.script0, langtag.script1);
+            noteSegment("script", string, langtag.script0, langtag.script1);
+
+            // Move on.
+            idx = segmentNextStart(N, idx, idx2);
+            idx2 = segmentNextFinish(string, N, idx);
+            segLen = segmentLength(N, idx, idx2);
+        }
+        // -- End script
+
+        // ---- region
+        // region        = 2ALPHA              ; ISO 3166-1 code
+        //               / 3DIGIT              ; UN M.49 code
+        if ( segLen == 2 || segLen == 3 ) {
+            // Region
+            langtag.region0 = idx;
+            langtag.region1 = idx+segLen;
+            if ( segLen == 2 )
+                InternalLangTag.checkAlpha(string, N, langtag.region0, langtag.region1);
+            else
+                InternalLangTag.checkDigits(string, N, langtag.region0, langtag.region1);
+            noteSegment("region", string, langtag.region0, langtag.region1);
+
+            // Move on.
+            idx = segmentNextStart(N, idx, idx2);
+            idx2 = segmentNextFinish(string, N, idx);
+            segLen = segmentLength(N, idx, idx2);
+        }
+        // -- End region
+
+        // ---- variant
+        // variant       = 5*8alphanum         ; registered variants
+        //               / (DIGIT 3alphanum)
+        for ( ;; ) {
+            if ( segLen >= 5 && segLen <= 8) {
+                // variant 5*8alphanum
+                if ( langtag.variant0 == -1 )
+                    langtag.variant0 = idx;
+                langtag.variant1 = idx+segLen;
+                InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1);
+                noteSegment("variant", string, langtag.variant0, langtag.variant1);
+                // Move on.
+                idx = segmentNextStart(N, idx, idx2);
+                idx2 = segmentNextFinish(string, N, idx);
+                segLen = segmentLength(N, idx, idx2);
+                continue;
+            }
+
+            if ( segLen == 4 ) {
+                // variant
+                // DIGIT 3alphanum
+                char ch = string.charAt(idx);
+                if ( ch < '0' || ch > '9' )
+                    // A 4 character variant must start with a digit.
+                    // Leave idx at this segment: it is not an extension or private use
+                    // singleton, so it is reported below as trailing characters.
+                    break;
+                if ( langtag.variant0 == -1 )
+                    langtag.variant0 = idx;
+                langtag.variant1 = idx+segLen;
+                InternalLangTag.checkAlphaNum(string, N, idx, langtag.variant1);
+                noteSegment("variant", string, langtag.variant0, langtag.variant1);
+                // Move on.
+                idx = segmentNextStart(N, idx, idx2);
+                idx2 = segmentNextFinish(string, N, idx);
+                segLen = segmentLength(N, idx, idx2);
+                continue;
+            }
+            break;
+        }
+        // -- End variant
+
+        // ---- extension and private use
+        // extension     = singleton 1*("-" (2*8alphanum))
+        // privateuse    = "x" 1*("-" (1*8alphanum))
+        boolean inPrivateUseSubtag = false;
+        // Extension singletons seen so far, in lower case.
+        Set<Character> extSingletons = new HashSet<>();
+        while ( segLen == 1 ) {
+            char singleton = string.charAt(idx);
+            if ( singleton == 'x' || singleton == 'X' ) {
+                inPrivateUseSubtag = true;
+                break;
+            }
+            // Language tags are case-insensitive, so 'a' and 'A' are the same singleton.
+            boolean newEntry = extSingletons.add(Character.toLowerCase(singleton));
+            if ( ! newEntry )
+                InternalLangTag.error("Duplicate extension singleton: '"+singleton+"'");
+
+            if ( langtag.extension0 == -1 )
+                langtag.extension0 = idx;
+            // Extension.
+            // 2*8 alphanum
+            int idxExtStart = idx+segLen;
+            int idxEndExtra = maybeSubtags(string, N, idxExtStart, 2, 8);
+
+            // Expecting at least one subtag.
+            if ( idxExtStart == idxEndExtra )
+                InternalLangTag.error("Ill-formed extension");
+
+            if ( idxEndExtra > idxExtStart )
+                idx2 = idxEndExtra;
+            langtag.extension1 = idx2;
+            InternalLangTag.checkAlphaNumMinus(string, N, langtag.extension0, langtag.extension1);
+
+            noteSegment("extension", string, langtag.extension0, langtag.extension1);
+            // Move on.
+            idx = segmentNextStart(N, idx, idx2);
+            idx2 = segmentNextFinish(string, N, idx);
+            segLen = segmentLength(N, idx, idx2);
+            if ( segLen == 0 )
+                InternalLangTag.error("Ill-formed extension. Trailing dash.");
+        }
+
+        // ---- private use
+        if ( inPrivateUseSubtag ) {
+            langtag.privateuse0 = idx;
+            // privateuse    = "x" 1*("-" (1*8alphanum))
+            int idxPrivateUseStart = idx+segLen;
+            int idxPrivateUseEnd = maybeSubtags(string, N, idxPrivateUseStart, 1, 8);
+
+            // Expecting at least one subtag.
+            if ( idxPrivateUseStart == idxPrivateUseEnd )
+                InternalLangTag.error("Ill-formed private use component");
+
+            if ( idxPrivateUseEnd > idxPrivateUseStart )
+                idx2 = idxPrivateUseEnd;
+            langtag.privateuse1 = idx2;
+            InternalLangTag.checkAlphaNumMinus(string, N, langtag.privateuse0, langtag.privateuse1);
+
+            noteSegment("private use", string, langtag.privateuse0, langtag.privateuse1);
+            // Private use runs to end of string. But do checking.
+            // Move on.
+            idx = segmentNextStart(N, idx, idx2);
+            idx2 = segmentNextFinish(string, N, idx);
+            segLen = segmentLength(N, idx, idx2);
+            if ( segLen == 0 )
+                InternalLangTag.error("Ill-formed private use subtag. Trailing dash.");
+        }
+
+        // -- End extension and privateuse
+
+        // Did we process everything? No segment: idx == -1 idx2 == -1  seglen == -1
+
+        if ( idx != -1 && idx < N )
+            InternalLangTag.error("Trailing characters: '%s'", string.substring(idx));
+        if ( idx2 >= 0 )
+            InternalLangTag.error("Bad string: '%s'", string);
+        return langtag;
+    }
+
+    /**
+     * Create a language tag object that records only the original string.
+     * No part indexes are assigned by this constructor.
+     */
+    private LangTagRFC5646(String string) {
+        this.langTagString = string;
+    }
+
+    /**
+     * Create a language tag from the original string and the index pair of each part.
+     * Every "0"/"1" pair is stored as given; the parser in this class uses -1 for
+     * a part that is not present.
+     *
+     * @param string          the language tag string the indexes refer to
+     * @param language0       start index of the language part
+     * @param language1       finish index of the language part
+     * @param script0         start index of the script part
+     * @param script1         finish index of the script part
+     * @param region0         start index of the region part
+     * @param region1         finish index of the region part
+     * @param variant0        start index of the variant part
+     * @param variant1        finish index of the variant part
+     * @param extension0      start index of the extension part
+     * @param extension1      finish index of the extension part
+     * @param privateuse0     start index of the private use part
+     * @param privateuse1     finish index of the private use part
+     * @param isGrandfathered whether the string is a grandfathered tag
+     */
+    private LangTagRFC5646(String string,
+                           int language0, int language1,
+                           int script0, int script1,
+                           int region0, int region1,
+                           int variant0, int variant1,
+                           int extension0, int extension1,
+                           int privateuse0, int privateuse1,
+                           boolean isGrandfathered) {
+        this.langTagString = string;
+        this.isGrandfathered = isGrandfathered;
+        this.language0 = language0;
+        this.language1 = language1;
+        this.script0 = script0;
+        this.script1 = script1;
+        // The region pair was previously accepted but never stored.
+        this.region0 = region0;
+        this.region1 = region1;
+        this.variant0 = variant0;
+        this.variant1 = variant1;
+        this.extension0 = extension0;
+        this.extension1 = extension1;
+        this.privateuse0 = privateuse0;
+        this.privateuse1 = privateuse1;
+    }
+
+    /**
+     * Scan zero or more subtags, each between min and max length.
+     * <p>
+     * On entry {@code idxStart} is expected to be at a "-" or at the end of the string.
+     *
+     * @param string   the language tag string
+     * @param N        the length of the string
+     * @param idxStart index at which to start looking
+     * @param min      minimum length of an acceptable subtag
+     * @param max      maximum length of an acceptable subtag
+     * @return the index just after the last accepted subtag (N if the scan reached the
+     *         end of the string); {@code idxStart} if no subtag was accepted
+     */
+    private static int maybeSubtags(String string, int N, int idxStart, int min, int max) {
+        int x = idxStart;
+        // Each iteration expects to be looking at the "-" that introduces a subtag.
+        while ( x >= 0 && x < N ) {
+            if ( string.charAt(x) != '-' )
+                break;
+            int x1 = maybeSubtag1(string, N, x+1, min, max);
+            if ( x1 <= 0 )
+                // The next segment is not an acceptable subtag. Stay at its "-".
+                break;
+            // x1 is N (end of string) or the index of the next "-";
+            // the loop condition ends the scan when x1 is N.
+            x = x1;
+        }
+        return x;
+    }
+
+    /**
+     * Look ahead at one segment and test whether it is a subtag of between
+     * min and max characters. The "-" before {@code idxStart} has already been read.
+     *
+     * @return the index just after the segment if it is acceptable; -1 if there is
+     *         no segment, its length is out of range, or it fails the character check
+     */
+    private static int maybeSubtag1(String string, int N, int idxStart, int min, int max) {
+        if ( idxStart >= N )
+            return -1;
+        int segEnd = segmentNextFinish(string, N, idxStart);
+        int len = segmentLength(N, idxStart, segEnd);
+        if ( len == 0 )
+            InternalLangTag.error("Bad langtag. Found '--'");
+
+        boolean lengthOK = ( min <= len && len <= max );
+        if ( ! lengthOK )
+            return -1;
+        int end = idxStart+len;
+        // NOTE(review): the ABNF for extension and private use subtags is "alphanum";
+        // confirm InternalLangTag.isAlpha accepts digits as intended.
+        return InternalLangTag.isAlpha(string, idxStart, end) ? end : -1;
+    }
+
+    // Extract the text of a segment given its start/finish indexes (excluding the initial '-').
+    // Both indexes undefined: no segment, return null. Only one undefined: report an error.
+    private static String getSegment(String string, int x0, int x1) {
+        boolean noStart = ( x0 < 0 );
+        boolean noFinish = ( x1 < 0 );
+        if ( noStart || noFinish ) {
+            if ( noStart != noFinish )
+                InternalLangTag.error("Segment one undef index");
+            return null;
+        }
+        return string.substring(x0,  x1);
+    }
+
+    /**
+     * Length of a segment, excluding any "-".
+     * Returns -1 when there is no segment (negative start index);
+     * a negative finish index means the segment runs to the end of the string.
+     */
+    private static int segmentLength(int N, int idx, int idx2) {
+        if ( idx < 0 )
+            return -1;
+        int finish = ( idx2 < 0 ) ? N : idx2;
+        return finish-idx;
+    }
+
+    /**
+     * Index of the start of the next segment: one past the "-" at {@code idx2},
+     * or -1 if {@code idx2} is -1 (no further "-").
+     */
+    private static int segmentNextStart(int N, int idx, int idx2) {
+        // Step over the '-' found at idx2.
+        return ( idx2 == -1 ) ? -1 : idx2+1;
+    }
+
+    /**
+     * Note segment - development aid.
+     * Currently does nothing: the tracing statements in the body are commented out.
+     */
+    private static void noteSegment(String label, String string, int idx, int 
idx2) {
+//        if ( idx2 < 0 ) {
+//            System.out.printf("%-10s [%d,%d) '%s'\n", label, idx, idx2, 
string.substring(idx));
+//            return;
+//        }
+//        System.out.printf("%-10s [%d,%d) '%s'\n",label, idx, idx2, 
string.substring(idx,  idx2));
+    }
+
+    /**
+     * Return the index of the next '-' at or after {@code idx}, or -1 if there is none.
+     * An index of -1 or N (nothing left to scan) also gives -1.
+     * A '-' as the last character of the string is reported as an error.
+     */
+    private static int segmentNextFinish(String x, int N, int idx) {
+        if ( idx == -1 || idx == N )
+            return -1;
+        int i = idx;
+        while ( i < N ) {
+            if ( x.charAt(i) == '-' ) {
+                // The case of "subtag-"
+                if ( i == N-1 )
+                    InternalLangTag.error("Language tag string ends in '-'");
+                return i;
+            }
+            i++;
+        }
+        return -1;
+    }
+
+    // ---
+    // RFC 5646: regular tags
+    // Grandfathered tags that (appear to) match the 'langtag' production in
+    // Figure 1 are considered 'regular' grandfathered tags.  These tags
+    // contain one or more subtags that either do not individually appear in
+    // the registry or appear but with a different semantic meaning: each
+    // tag, in its entirety, represents a language or collection of
+    // languages.
+
+    /** Test, ignoring case, whether a string is one of the grandfathered or regular tags. */
+    private static boolean grandfathered(String s) {
+        String key = s.toLowerCase(Locale.ROOT);
+        if ( grandfathered.contains(key) )
+            return true;
+        return regular.contains(key);
+    }
+
+    // These tags match the 'langtag' production, but their subtags are not 
extended
+    // language or variant subtags: their meaning is defined by their 
registration and
+    // all of these are deprecated in favor of a more modern subtag or 
sequence of
+    // subtags
+
+    private static Set<String> regular =
+            Set.of("art-lojban", "cel-gaulish", "no-bok", "no-nyn", 
"zh-guoyu", "zh-hakka", "zh-min", "zh-min-nan", "zh-xiang");
+
+    // RFC 5646: irregular tags do not match the 'langtag' production and 
would not be 'well-formed'
+    // Grandfathered tags that do not match the 'langtag' production in the
+    // ABNF and would otherwise be invalid are considered 'irregular'
+    // grandfathered tags.  With the exception of "en-GB-oed", which is a
+    // variant of "en-GB", each of them, in its entirety, represents a
+    // language.
+
+    private static Set<String> irregular =
+            Set.of("en-GB-oed",
+                   "i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", 
"i-klingon",
+                   "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", 
"i-tsu",
+                   // These are irregular in that they are "primary subtag 
("sgn" - sign language)
+                   // then two region-like subtags.
+                   // They do obey the basic formatting rule - two letters 
non-primary subtag is uppercase.
+                   "sgn-BE-FR", "sgn-BE-NL", "sgn-CH-DE");
+
+    // The "i-" irregulars.
+    private static Set<String> irregular_i =
+            Set.of("i-ami", "i-bnn", "i-default", "i-enochian", "i-hak", 
"i-klingon",
+                   "i-lux", "i-mingo", "i-navajo", "i-pwn", "i-tao", "i-tay", 
"i-tsu");
+
+    // ---
+
+    private static Set<String> grandfathered = new 
HashSet<>(2*(regular.size()+irregular.size()));
+    static {
+        for ( String s : irregular )
+            grandfathered.add(InternalLangTag.lowercase(s));
+        for ( String s : regular )
+            grandfathered.add(InternalLangTag.lowercase(s));
+    }
+
+    // @formatter:off
+    /*
+     RFC 5646 Section 2.1
+     ABNF definition: https://datatracker.ietf.org/doc/html/rfc5646#section-2.1
+
+     Language-Tag  = langtag             ; normal language tags
+                   / privateuse          ; private use tag
+                   / grandfathered       ; grandfathered tags
+
+     langtag       = language
+                     ["-" script]
+                     ["-" region]
+                     *("-" variant)
+                     *("-" extension)
+                     ["-" privateuse]
+
+     language      = 2*3ALPHA            ; shortest ISO 639 code
+                     ["-" extlang]       ; sometimes followed by
+                                         ; extended language subtags
+                   / 4ALPHA              ; or reserved for future use
+                   / 5*8ALPHA            ; or registered language subtag
+
+     extlang       = 3ALPHA              ; selected ISO 639 codes
+                     *2("-" 3ALPHA)      ; permanently reserved
+
+     script        = 4ALPHA              ; ISO 15924 code
+
+     region        = 2ALPHA              ; ISO 3166-1 code
+                   / 3DIGIT              ; UN M.49 code
+
+     variant       = 5*8alphanum         ; registered variants
+                   / (DIGIT 3alphanum)
+
+     extension     = singleton 1*("-" (2*8alphanum))
+
+                                         ; Single alphanumerics
+                                         ; "x" reserved for private use
+     singleton     = DIGIT               ; 0 - 9
+                   / %x41-57             ; A - W
+                   / %x59-5A             ; Y - Z
+                   / %x61-77             ; a - w
+                   / %x79-7A             ; y - z
+
+     privateuse    = "x" 1*("-" (1*8alphanum))
+
+     grandfathered = irregular           ; non-redundant tags registered
+                   / regular             ; during the RFC 3066 era
+
+     irregular     = "en-GB-oed"         ; irregular tags do not match
+                   / "i-ami"             ; the 'langtag' production and
+                   / "i-bnn"             ; would not otherwise be
+                   / "i-default"         ; considered 'well-formed'
+                   / "i-enochian"        ; These tags are all valid,
+                   / "i-hak"             ; but most are deprecated
+                   / "i-klingon"         ; in favor of more modern
+                   / "i-lux"             ; subtags or subtag
+                   / "i-mingo"           ; combination
+                   / "i-navajo"
+                   / "i-pwn"
+                   / "i-tao"
+                   / "i-tay"
+                   / "i-tsu"
+                   / "sgn-BE-FR"
+                   / "sgn-BE-NL"
+                   / "sgn-CH-DE"
+
+     regular       = "art-lojban"        ; these tags match the 'langtag'
+                   / "cel-gaulish"       ; production, but their subtags
+                   / "no-bok"            ; are not extended language
+                   / "no-nyn"            ; or variant subtags: their meaning
+                   / "zh-guoyu"          ; is defined by their registration
+                   / "zh-hakka"          ; and all of these are deprecated
+                   / "zh-min"            ; in favor of a more modern
+                   / "zh-min-nan"        ; subtag or sequence of subtags
+                   / "zh-xiang"
+
+     alphanum      = (ALPHA / DIGIT)     ; letters and numbers
+     */
+    // @formatter:on
+}
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java
new file mode 100644
index 0000000000..2738298949
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTags.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.apache.jena.langtag.InternalLangTag.error;
+import static org.apache.jena.langtag.InternalLangTag.str;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+public class LangTags {
+
+    /** Index of the language part */
+    public static final int  idxLanguage  = 0 ;
+    /** Index of the script part */
+    public static final int  idxScript    = 1 ;
+    /** Index of the region part */
+    public static final int  idxRegion    = 2 ;
+    /** Index of the variant part */
+    public static final int  idxVariant   = 3 ;
+    /** Index of all extensions */
+    public static final int  idxExtension = 4 ;
+
+    private static final int partsLength  = 5 ;
+
+    /** @deprecated Compatibility operation (the behaviour of Jena 5.3.0 and 
earlier). To be removed. */
+    @Deprecated(forRemoval = true)
+    public static String[] parse(String languageTag) {
+        try {
+            LangTag langTag = SysLangTag.create(languageTag);
+            if (langTag == null )
+                return null;
+            String result[] = new String[partsLength];
+
+            result[idxLanguage] = langTag.getLanguage();
+            result[idxScript] = langTag.getScript();
+            result[idxRegion] = langTag.getRegion();
+            result[idxVariant] = langTag.getVariant();
+            // Legacy compatible.
+            if ( langTag.getPrivateUse() == null )
+                result[idxExtension] = langTag.getExtension();
+            else if ( langTag.getExtension() == null )
+                result[idxExtension] = langTag.getPrivateUse();
+            else
+                result[idxExtension] = 
langTag.getExtension()+"-"+langTag.getPrivateUse();
+            return result;
+        } catch (LangTagException ex) {
+            return null;
+        }
+    }
+
+    /**
+     * Create a {@link LangTag} from a string
+     * that meets the
+     * <a 
href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1";>syntax of RFC 
5646</a>.
+     * <p>
+     * Throws {@link LangTagException} on bad syntax.
+     */
+    public static LangTag of(String string) {
+        LangTag langTag =  SysLangTag.create(string);
+        // Implements should not return null but just in case ...
+        if ( langTag == null )
+            throw new LangTagException("Bad syntax");
+        return langTag;
+    }
+
+    /** Same as {@link #of(String)} */
+    public static LangTag create(String string) {
+        return of(string);
+    }
+
+    public static String canonical(String string) {
+        LangTag langTag =  of(string);
+        return langTag.str();
+    }
+
+    /** Check a string is valid as a language tag. */
+    public static boolean check(String languageTag) {
+        try {
+            LangTag langTag = SysLangTag.create(languageTag);
+            return (langTag != null );
+        } catch (LangTagException ex) {
+            return false;
+        }
+    }
+
+    /**
+     * Basic formatter following
+     * <a 
href="https://datatracker.ietf.org/doc/html/rfc5646#section-2.1.1";>RFC 5646 
section 2.1.1</a>
+     */
+    public static String basicFormat(String string) {
+        // with the interpretation that "after singleton" means anywhere after 
the singleton.
+        if ( string == null )
+            return null;
+        if ( string.isEmpty() )
+            return string;
+        List<String> strings = InternalLangTag.splitOnDash(string);
+        if ( strings == null ) {
+            //return lowercase(string);
+            error("Bad language string: %s", string);
+        }
+        StringBuilder sb = new StringBuilder(string.length());
+        boolean singleton = false;
+        boolean first = true;
+
+        for ( String s : strings ) {
+            if ( first ) {
+                // language
+                sb.append(InternalLangTag.lowercase(s));
+                first = false;
+                continue;
+            }
+            first = false;
+            // All subtags after language
+            sb.append('-');
+            if ( singleton )
+                // Always lowercase
+                sb.append(InternalLangTag.lowercase(s));
+            else {
+                // case depends on ;length
+                sb.append(InternalLangTag.strcase(s));
+                if ( s.length() == 1 )
+                    singleton = true;
+            }
+        }
+        return sb.toString();
+    }
+
+    /** Is @code{langTag1} the same as @code{langTag2}? */
+    public static boolean sameLangTagAs(LangTag langTag1, LangTag langTag2) {
+        Objects.requireNonNull(langTag1);
+        Objects.requireNonNull(langTag2);
+        if ( langTag1 == langTag2 )
+            return true;
+        if ( ! Objects.equals(langTag1.getLanguage(),langTag2.getLanguage()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getScript(),langTag2.getScript()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getRegion(),langTag2.getRegion()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getVariant(), langTag2.getVariant()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getExtension(), 
langTag2.getExtension()) )
+            return false;
+        if ( ! Objects.equals(langTag1.getPrivateUse(), 
langTag2.getPrivateUse()) )
+            return false;
+        return true;
+    }
+
+    /**
+     * Check a language tag string meets the Turtle(etc) and SPARQL grammar 
rule
+     * for a language tag without initial text direction.
+     * <p>
+     * Passing this test does not guarantee the string is valid language tag. 
Use
+     * {@link LangTags#check(String)} for validity checking.
+     *
+     * @returns true or false
+     */
+    public static boolean basicCheck(String string) {
+        try {
+            return basicCheckEx(string);
+        } catch (LangTagException ex) {
+            return false;
+        }
+    }
+
+    /**
+     * Check a language tag string meets the Turtle(etc) and SPARQL grammar 
rule
+     * for a language tag without initial text direction.
+     * <p>
+     * Passing this test does not guarantee the string is valid language tag. 
Use
+     * {@link LangTags#check(String)} for validity checking.
+     *
+     * @throws LangTagException
+     */
+    public static boolean basicCheckEx(String string) {
+        boolean start = true;
+        int lastSegmentStart = 0;
+
+        for ( int idx = 0; idx < string.length(); idx++ ) {
+            char ch = string.charAt(idx);
+            if ( InternalLangTag.isA2ZN(ch) )
+                continue;
+            if ( ch == '-' ) {
+                if ( idx == 0 ) {
+                    error("'%s': starts with a '-' character", string);
+                    return false;
+                }
+                if ( idx == lastSegmentStart ) {
+                    error("'%s': two dashes", string);
+                    return false;
+                }
+                lastSegmentStart = idx+1;
+                continue;
+            }
+            // Not A2ZN, not '-'.
+            error("Bad character: (0x%02X) '%s' index %d", (int)ch, str(ch), 
idx);
+            return false;
+        }
+        // End of string.
+        if ( lastSegmentStart == string.length() ) {
+            error("'%s': Ends in a '-'", string);
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Split a language tag based on dash separators
+     * <p>
+     * The string should be a legal language tag, at least by the general 
SPARQL/Turtle(etc) grammar rule.
+     * @returns null on bad input syntax
+     *
+     * @see LangTags#check
+     * @see LangTags#create
+     */
+    public static List<String> splitOnDash(String string) {
+        try {
+            return splitOnDashEx(string);
+        } catch (LangTagException ex) {
+            return null;
+        }
+    }
+
+    /**
+     * Split a language tag into subtags.
+     * <p>
+     * The string should be a legal language tag, at least by the general 
SPARQL/Turtle(etc) grammar rule.
+     * @throw {@link LangTagException}
+     *
+     * @see LangTags#check
+     * @see LangTags#create
+     */
+    public static List<String> splitOnDashEx(String string) {
+        List<String> parts = new ArrayList<>();
+        // Split efficiently based on [a-z][A-Z][0-9] units separated by "-", 
with meaning error messages.
+        StringBuilder sb = new StringBuilder();
+
+        boolean start = true;
+        for ( int idx = 0; idx < string.length(); idx++ ) {
+            char ch = string.charAt(idx);
+            if ( InternalLangTag.isA2ZN(ch) ) {
+                sb.append(ch);
+                continue;
+            }
+            if ( ch == '-' ) {
+                if ( idx == 0 ) {
+                    error("'%s': starts with a '-' character", string);
+                    return null;
+                }
+                String str = sb.toString();
+                if ( str.isEmpty() ) {
+                    error("'%s': two dashes", string);
+                    return null;
+                }
+                parts.add(str);
+                sb.setLength(0);
+                continue;
+            }
+            error("Bad character: (0x%02X) '%s' index %d", (int)ch, str(ch), 
idx);
+            return null;
+        }
+        String strLast = sb.toString();
+        if ( strLast.isEmpty() ) {
+            error("'%s': Ends in a '-'", string);
+            return null;
+        }
+        parts.add(strLast);
+        return parts;
+    }
+}
+
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java
new file mode 100644
index 0000000000..8e8835ac90
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/SysLangTag.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+/**
+ * System-wide entry points for parsing and formatting language tags.
+ * See also {@link LangTags}.
+ */
+public class SysLangTag {
+
+    /**
+     * Create a {@link LangTag} using the system-wide default language tag parser,
+     * which is {@link LangTagRFC5646}.
+     */
+    public static LangTag create(String languageTag) {
+        return LangTagRFC5646.create(languageTag);
+    }
+
+    /**
+     * Format language tag.
+     * This is the system-wide policy for formatting language tags:
+     * null gives the empty string, the empty string is returned as given,
+     * and any other input is parsed and returned in its {@code str()} form.
+     */
+    public static String formatLangTag(String input) {
+        if ( input == null )
+            return "";
+        return input.isEmpty() ? input : create(input).str();
+    }
+}
diff --git 
a/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java 
b/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java
new file mode 100644
index 0000000000..0b1a46fdec
--- /dev/null
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/cmd/CmdLangTag.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag.cmd;
+
+import org.apache.jena.langtag.LangTag;
+import org.apache.jena.langtag.LangTagException;
+import org.apache.jena.langtag.SysLangTag;
+
+/**
+ * Command line tool: parse one language tag with the system parser and
+ * print its formatted form and its parts.
+ */
+public class CmdLangTag {
+
+    /**
+     * Entry point. Expects exactly one argument, the language tag.
+     * Exits with status 1 on a usage error or when the tag does not parse.
+     */
+    public static void main(String[] args) {
+        if ( args.length != 1 ) {
+            System.err.println("Requires one argument.");
+            System.exit(1);
+        }
+
+        String languageTag = args[0];
+        if ( languageTag.isEmpty() ) {
+            System.err.println("Empty string for language tag");
+            System.exit(1);
+        }
+        if ( languageTag.isBlank() ) {
+            System.err.println("Blank string for language tag");
+            System.exit(1);
+        }
+        if ( languageTag.contains(" ") || languageTag.contains("\t") || languageTag.contains("\n") || languageTag.contains("\r") ) {
+            System.err.println("Language tag contains white space");
+            System.exit(1);
+        }
+        if ( languageTag.contains("--") ) {
+            System.err.println("Illegal language tag. String contains '--'");
+            System.exit(1);
+        }
+
+        try {
+            System.out.printf("%-16s %s\n", "Input:", languageTag);
+            LangTag langTag = SysLangTag.create(languageTag);
+            System.out.printf("%-16s %s\n", "Formatted:", langTag.str());
+            // Language, script and region are always shown; the rest only when present.
+            print("Language:",    langTag.getLanguage(), true);
+            print("Script:",      langTag.getScript(), true);
+            print("Region:",      langTag.getRegion(), true);
+            print("Variant:",     langTag.getVariant(), false);
+            print("Extension:",   langTag.getExtension(), false);
+            print("Private Use:", langTag.getPrivateUse(), false);
+        } catch (LangTagException ex) {
+            System.out.println("Bad language tag");
+            System.out.printf("%s\n", ex.getMessage());
+            System.exit(1);
+        }
+    }
+
+    /**
+     * Print one labelled part. A null value is shown as "-" when {@code always}
+     * is true and is skipped otherwise.
+     */
+    private static void print(String label, String value, boolean always) {
+        if ( value == null ) {
+            if ( ! always )
+                return;
+            value = "-";
+        }
+        System.out.printf("  %-14s %s\n", label, value);
+    }
+}
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java
new file mode 100644
index 0000000000..d369d91ace
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TS_LangTag.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import org.junit.platform.suite.api.SelectClasses;
+import org.junit.platform.suite.api.Suite;
+
+/** Test suite that runs the language tag test classes listed in {@code @SelectClasses}. */
+@Suite
+@SelectClasses( {
+    TestLangTag.class
+    , TestLangTagFormat.class
+    , TestLangTagsOps.class
+    , TestBasicSyntaxLangTags.class
+})
+
+public class TS_LangTag { }
diff --git 
a/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java
 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java
new file mode 100644
index 0000000000..e746aab548
--- /dev/null
+++ 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TestBasicSyntaxLangTags.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.apache.jena.langtag.LangTags.*;
+import static org.junit.jupiter.api.Assertions.*;
+
+
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the operations related to basic parsing of language tags (SPARQL and Turtle grammar rules)
+ */
+public class TestBasicSyntaxLangTags {
+    @Test public void basic_01() { basicSplitCheck("en", "en"); }
+    @Test public void basic_02() { basicSplitCheck("en-GB", "en", "GB"); }
+    @Test public void basic_03() { basicSplitCheck("en-gb", "en", "gb"); }
+    @Test public void basic_04() { basicSplitCheck("en", "en"); }
+
+    // Showing the split does not allocate subtags to their category. e.g. "x-private" is split.
+    @Test public void basic_05() { 
basicSplitCheck("en-Latn-GB-boont-r-extended-sequence-x-private",
+                                                   "en","Latn", "GB", "boont", 
"r", "extended", "sequence", "x", "private"); }
+
+    @Test public void basic_bad_01() { basicSplitCheckBad(""); }
+    @Test public void basic_bad_02() { basicSplitCheckBad("-"); }
+    @Test public void basic_bad_03() { basicSplitCheckBad("--"); }
+    @Test public void basic_bad_04() { basicSplitCheckBad("abc-xy%20"); }
+    @Test public void basic_bad_05() { basicSplitCheckBad("abc def"); }
+
+    /** Good input: the split gives the expected parts and the basic check passes. */
+    static void basicSplitCheck(String input, String...parts) {
+        basicSplitTest(input, parts);
+        checkTest(input);
+    }
+
+    /**
+     * Bad input: the non-throwing operations report failure (false / null)
+     * and the "Ex" operations throw {@link LangTagException}.
+     */
+    static void basicSplitCheckBad(String input) {
+        assertFalse(basicCheck(input));
+        assertNull(splitOnDash(input));
+        assertThrows(LangTagException.class, ()->splitOnDashEx(input));
+        assertThrows(LangTagException.class, ()->basicCheckEx(input));
+    }
+
+    /** Check both split operations return the expected list of parts. */
+    public static void basicSplitTest(String input, String...parts) {
+        List<String> expected = (parts == null) ? null : Arrays.asList(parts);
+        List<String> actual = splitOnDashEx(input);
+        assertEquals(expected, actual, "Subject: "+input);
+        List<String> actual2 = splitOnDash(input);
+        assertEquals(actual, actual2, "Subject(2): "+input);
+    }
+
+    /** Check both basic-check operations accept the input. */
+    private static void checkTest(String input) {
+        boolean actual =  basicCheckEx(input);
+        assertTrue(actual, "Subject: "+input);
+        boolean actual2 =  basicCheck(input);
+        assertEquals(actual, actual2, "Subject(2): "+input);
+    }
+}
diff --git 
a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
new file mode 100644
index 0000000000..289406bcc3
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.fail;
+
+import org.junit.jupiter.api.Test;
+
+public class TestLangTag {
+
+    @Test public void test_lang_parse_00() { testRFC5646("lng-scrp-rg", 
"lng-Scrp-RG", "lng", "Scrp", "RG", null, null); }
+    @Test public void test_lang_parse_01() { 
testRFC5646("lng-scrp-rg-variant", "lng-Scrp-RG-variant", "lng", "Scrp", "RG", 
"variant", null); }
+    @Test public void test_lang_parse_02() { 
testRFC5646("lng-scrp-rg-variant-e-abc", "lng-Scrp-RG-variant-e-abc", "lng", 
"Scrp", "RG", "variant", "e-abc"); }
+
+    @Test public void test_lang_basic_01() { testRFC5646("en", "en",           
    "en", null, null, null, null); }
+    @Test public void test_lang_basic_02() { testRFC5646("en-us", "en-US",     
       "en", null, "US", null, null); }
+    @Test public void test_lang_basic_03() { testRFC5646("en-latn-us", 
"en-Latn-US",  "en", "Latn", "US", null, null); }
+    @Test public void test_lang_basic_04() { testRFC5646("en-123", "en-123", 
"en", null, "123", null, null); }
+    @Test public void test_lang_basic_05() { testRFC5646("en-1234", "en-1234", 
"en", null, null, "1234", null); }
+    @Test public void test_lang_basic_06() { testRFC5646("en-latn", "en-Latn", 
"en", "Latn", null, null, null); }
+    @Test public void test_lang_basic_07() { testRFC5646("en-latn-gb", 
"en-Latn-GB", "en", "Latn", "GB", null, null); }
+    // Language subtags
+    @Test public void test_lang_basic_08() { testNotJDK("en-brs-xxx-latn-gb", 
"en-brs-xxx-Latn-GB", "en-brs-xxx", "Latn", "GB", null, null, null); }
+    @Test public void test_lang_basic_09() { testRFC5646("de-CH-w-extend", 
"de-CH-w-extend", "de", null, "CH", null, "w-extend"); }
+    @Test public void test_lang_basic_10() { 
testRFC5646("de-CH-w-extend-extend", "de-CH-w-extend-extend", "de", null, "CH", 
null, "w-extend-extend"); }
+
+    @Test public void test_lang_basic_20() { 
testPrivateUse("de-CH-x-phonebk-morech", "de-CH-x-phonebk-morech", "de", null, 
"CH", null, null, "x-phonebk-morech"); }
+    // Private use language tag. No language!
+    @Test public void test_lang_basic_21() { testPrivateUse("x-private", 
"x-private", null, null, null, null, null, "x-private"); }
+    @Test public void test_lang_basic_22() { testPrivateUse("az-Latn-x-latn", 
"az-Latn-x-latn", "az", "Latn", null, null, null, "x-latn"); }
+    @Test public void test_lang_basic_23() { testPrivateUse("sss-x-y", 
"sss-x-y", "sss", null, null, null, null, "x-y"); }
+
+
+    @Test public void test_lang_bad_01() { testBad("123"); }
+    @Test public void test_lang_bad_02() { testBad("abcdefghijklmn"); }
+    @Test public void test_lang_bad_03() { testBad("abcdefghijklmn-123"); }
+    @Test public void test_lang_bad_04() { testBad("abcdefghijklmn-latn"); }
+
+    @Test public void test_lang_bad_05() { testBad("a?"); }
+    @Test public void test_lang_bad_06() { testBad("a b"); }
+    @Test public void test_lang_bad_07() { testBad("en--us"); }
+    @Test public void test_lang_bad_08() { testBad("-us"); }
+    @Test public void test_lang_bad_09() { testBad("en-"); }
+    @Test public void test_lang_bad_10() { testBad("en-gb-"); }
+    @Test public void test_lang_bad_11() { testBad("i18n"); }
+
+    // Wrong lengths
+    @Test public void test_lang_bad_20() { testBad("s"); }
+    @Test public void test_lang_bad_21() { testBad("abcdefghi"); }
+    @Test public void test_lang_bad_22() { testBad("en-abcdefghi"); }
+    @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghi"); }
+
+    // Bad extension
+    @Test public void test_lang_bad_31() { testBad("sss-d"); }
+    @Test public void test_lang_bad_32() { testBad("sss-d-"); }
+    @Test public void test_lang_bad_33() { testBad("sss-d-e"); }
+    @Test public void test_lang_bad_34() { testBad("sss-d-ext-"); }
+
+    // Bad private use
+    @Test public void test_lang_bad_45() { testBad("sss-x"); }
+    @Test public void test_lang_bad_46() { testBad("sss-x-"); }
+    @Test public void test_lang_bad_47() { testBad("sss-x-part-"); }
+
+    @Test public void test_lang_bad_repeated_extension() {
+        // "en-a-bbb-a-ccc" is invalid because the subtag 'a' appears twice.
+        testBad("en-a-bbb-a-ccc");
+    }
+
+    // Wikipedia-like -- their private use subtags can be too long
+    @Test public void test_lang_bad_50() { testBad("en-x-Q123456789"); }
+
+    // Special cases. "en-GB-oed" -- "oed" is a variant even though it does not match the syntax rule.
+    @Test public void test_langtag_special_01() { testFormatting("en-GB-oed", 
"en-GB-oed"); }
+    @Test public void test_langtag_special_02() { testNotJDK("en-GB-oed", 
"en-GB-oed", "en", null, "GB", "oed",  null, null); }
+    @Test public void test_langtag_special_03() { testFormatting("EN-gb-OED", 
"en-GB-oed"); }
+    @Test public void test_langtag_special_04() { testNotJDK("EN-gb-OED", 
"en-GB-oed", "en", null, "GB", "oed",  null, null); }
+
+    // The examples from RFC 5646
+    @Test public void test_lang_10() { testRFC5646("de", "de", "de", null, 
null, null, null); }
+    @Test public void test_lang_11() { testRFC5646("fr", "fr", "fr", null, 
null, null, null); }
+    @Test public void test_lang_12() { testRFC5646("ja", "ja", "ja", null, 
null, null, null); }
+    @Test public void test_lang_13() { testNotJDK("i-enochian", "i-enochian", 
"i-enochian", null, null, null, null, null); }
+    @Test public void test_lang_14() { testRFC5646("zh-Hant", "zh-Hant", "zh", 
"Hant", null, null, null); }
+    @Test public void test_lang_15() { testRFC5646("zh-Hans", "zh-Hans", "zh", 
"Hans", null, null, null); }
+    @Test public void test_lang_16() { testRFC5646("sr-Cyrl", "sr-Cyrl", "sr", 
"Cyrl", null, null, null); }
+    @Test public void test_lang_17() { testRFC5646("sr-Latn", "sr-Latn", "sr", 
"Latn", null, null, null); }
+
+    // Extended language subtag (3 letter)
+    @Test public void test_lang_18() { testNotJDK("zh-cmn-Hans-CN", 
"zh-cmn-Hans-CN", "zh-cmn", "Hans", "CN", null, null, null); }
+    @Test public void test_lang_19() { testRFC5646("cmn-Hans-CN", 
"cmn-Hans-CN", "cmn", "Hans", "CN", null, null); }
+    @Test public void test_lang_20() { testNotJDK("zh-yue-HK", "zh-yue-HK", 
"zh-yue", null, "HK", null, null, null); }
+    @Test public void test_lang_21() { testRFC5646("yue-HK", "yue-HK", "yue", 
null, "HK", null, null); }
+    @Test public void test_lang_22() { testRFC5646("zh-Hans-CN", "zh-Hans-CN", 
"zh", "Hans", "CN", null, null); }
+
+    @Test public void test_lang_23() { testRFC5646("sr-Latn-RS", "sr-Latn-RS", 
"sr", "Latn", "RS", null, null); }
+    @Test public void test_lang_24() { testRFC5646("sl-rozaj", "sl-rozaj", 
"sl", null, null, "rozaj", null); }
+    @Test public void test_lang_25() { testNotJDK("sl-rozaj-biske", 
"sl-rozaj-biske", "sl", null, null, "rozaj-biske", null, null); }
+    @Test public void test_lang_26() { testRFC5646("sl-nedis", "sl-nedis", 
"sl", null, null, "nedis", null); }
+    @Test public void test_lang_27() { testRFC5646("de-CH-1901", "de-CH-1901", 
"de", null, "CH", "1901", null); }
+    @Test public void test_lang_28() { testRFC5646("sl-IT-nedis", 
"sl-IT-nedis", "sl", null, "IT", "nedis", null); }
+    @Test public void test_lang_29() { testRFC5646("hy-Latn-IT-arevela", 
"hy-Latn-IT-arevela", "hy", "Latn", "IT", "arevela", null); }
+    @Test public void test_lang_30() { testRFC5646("de-DE", "de-DE", "de", 
null, "DE", null, null); }
+    @Test public void test_lang_31() { testRFC5646("en-US", "en-US", "en", 
null, "US", null, null); }
+    @Test public void test_lang_32() { testRFC5646("es-419", "es-419", "es", 
null, "419", null, null); }
+
+    @Test public void test_lang_33() { testPrivateUse("de-CH-x-phonebk", 
"de-CH-x-phonebk", "de", null, "CH", null, null, "x-phonebk"); }
+    @Test public void test_lang_34() { testPrivateUse("az-Arab-x-AZE-derbend", 
"az-Arab-x-aze-derbend", "az", "Arab", null, null, null, "x-aze-derbend"); }
+    @Test public void test_lang_35() { 
testPrivateUse("x-whatever-a-abc-x-xyz", "x-whatever-a-abc-x-xyz", null, null, 
null, null, null, "x-whatever-a-abc-x-xyz"); }
+    @Test public void test_lang_36() { 
testPrivateUse("qaa-Qaaa-QM-x-southern", "qaa-Qaaa-QM-x-southern", "qaa", 
"Qaaa", "QM", null, null, "x-southern"); }
+
+    @Test public void test_lang_37() { testRFC5646("de-Qaaa", "de-Qaaa", "de", 
"Qaaa", null, null, null); }
+    @Test public void test_lang_38() { testRFC5646("sr-Latn-QM", "sr-Latn-QM", 
"sr", "Latn", "QM", null, null); }
+    @Test public void test_lang_39() { testRFC5646("sr-Qaaa-RS", "sr-Qaaa-RS", 
"sr", "Qaaa", "RS", null, null); }
+    @Test public void test_lang_40() { testRFC5646("en-US-u-islamcal", 
"en-US-u-islamcal", "en", null, "US", null, "u-islamcal"); }
+    @Test public void test_lang_41() { 
testPrivateUse("zh-CN-a-myext-x-private", "zh-CN-a-myext-x-private", "zh", 
null, "CN", null, "a-myext", "x-private"); }
+    @Test public void test_lang_42() { testRFC5646("en-a-myext-b-another", 
"en-a-myext-b-another", "en", null, null, null, "a-myext-b-another"); }
+
+    @Test public void test_lang_50() { testPrivateUse("en-x-private", 
"en-x-private",    "en", null, null, null, null, "x-private"); }
+
+    @Test public void test_lang_51() { testPrivateUse( "en-x-US",  "en-x-us",  
  "en", null, null, null, null, "x-us"); }
+    // "Note that the tag "en-a-bbb-x-a-ccc" is valid because the second appearance of
+    // the singleton 'a' is in a private use sequence."
+    @Test public void test_lang_52() { testPrivateUse( "en-a-bbb-x-a-ccc" ,  
"en-a-bbb-x-a-ccc" ,    "en", null, null, null, "a-bbb", "x-a-ccc"); }
+
+    // Mentioned in RFC 5646
+    @Test public void test_lang_60() { 
testPrivateUse("en-Latn-GB-boont-r-extended-sequence-x-private", 
"en-Latn-GB-boont-r-extended-sequence-x-private",
+                                                      "en","Latn", "GB", 
"boont", "r-extended-sequence", "x-private"); }
+
+    @Test public void test_lang_61() { 
testPrivateUse("en-Latn-GB-boont-r-extended-sequence-s-another-x-private", 
"en-Latn-GB-boont-r-extended-sequence-s-another-x-private",
+                                                       "en","Latn", "GB", 
"boont", "r-extended-sequence-s-another", "x-private"); }
+
+
+    /**
+     * General test, including the JDK-based implementation.
+     * Delegates to {@code runTest} with no private use part expected ({@code null})
+     * and {@code jdkSupported} set to {@code true}.
+     */
+    private static void testRFC5646(String langString, String formatted, 
String lang, String script, String region, String variant, String extension) {
+        runTest(langString, formatted, lang, script, region, variant, 
extension, null, true);
+    }
+
+    /**
+     * Test a language tag that has a private use part.
+     * Delegates to {@code runTest} with {@code jdkSupported} set to {@code true}.
+     */
+    private static void testPrivateUse(String langString, String formatted, String lang, String script, String region, String variant, String extension, String privateUse) {
+        // Private use is supported by LangTagJDK by extracting the "x" extension
+        runTest(langString, formatted, lang, script, region, variant, extension, privateUse, true);
+    }
+
+
+    /**
+     * Run a test which is not properly supported by the JDK-Locale based implementation.
+     * Delegates to {@code runTest} with {@code jdkSupported} set to {@code false}.
+     */
+    private static void testNotJDK(String langString, String formatted, String 
lang, String script, String region, String variant, String extension, String 
privateUse) {
+        runTest(langString, formatted, lang, script, region, variant, 
extension, privateUse, false);
+    }
+
+    /**
+     * Run a test on a string which is illegal by RFC 5646.
+     * {@code LangTagRFC5646.create} is expected to throw {@link LangTagException};
+     * any other outcome, including a returned {@code null}, fails the test.
+     */
+    private void testBad(String string) {
+        try {
+            LangTag langTag = LangTagRFC5646.create(string);
+            // The parser throws an exception on bad input. In case that changes
+            // (e.g. to returning null), still fail and name the input that was accepted.
+            assertNull(langTag, "Expected a LangTagException for: "+string);
+            fail("Expected a LangTagException for: "+string);
+        } catch (LangTagException expected) {
+            // Expected outcome: the tag was rejected.
+        }
+    }
+
+    /**
+     * Run one test case against each implementation.
+     * <ul>
+     * <li>{@code LangTagRFC5646}: parse the input as given, lower-cased and upper-cased (see {@code test1}).</li>
+     * <li>Formatting of the input as given (see {@code testFormatting}).</li>
+     * <li>{@code LangTagJDK}: only when {@code jdkSupported} is {@code true}.</li>
+     * <li>{@code LangTagRE}: always.</li>
+     * </ul>
+     * A {@code null} expected value means the corresponding getter must return {@code null}.
+     */
+    private static void runTest(String langString, String formatted,
+                                String lang, String script, String region, String variant, String extension, String privateuse,
+                                boolean jdkSupported) {
+        // Run the test with varied case of the input string.
+        // Locale.ROOT keeps the case conversion locale-independent: with a Turkish or Azeri
+        // default locale, "i"/"I" would otherwise map to non-ASCII letters and corrupt the tag.
+        test1(langString,                                     formatted, lang, script, region, variant, extension, privateuse);
+        test1(langString.toLowerCase(java.util.Locale.ROOT), formatted, lang, script, region, variant, extension, privateuse);
+        test1(langString.toUpperCase(java.util.Locale.ROOT), formatted, lang, script, region, variant, extension, privateuse);
+
+        // Formatting.
+        testFormatting(langString, formatted);
+
+        // JDK
+        if ( jdkSupported )
+            assertParts("JDK", LangTagJDK.create(langString), lang, script, region, variant, extension, privateuse);
+
+        // Regular expression
+        assertParts("RE", LangTagRE.create(langString), lang, script, region, variant, extension, privateuse);
+    }
+
+    /** Check each part of a parsed language tag; {@code impl} labels the implementation in failure messages. */
+    private static void assertParts(String impl, LangTag langTag,
+                                    String lang, String script, String region, String variant, String extension, String privateuse) {
+        assertEquals(lang, langTag.getLanguage(), impl+": Lang");
+        assertEquals(script, langTag.getScript(), impl+": Script");
+        assertEquals(region, langTag.getRegion(), impl+": Region");
+        assertEquals(variant, langTag.getVariant(), impl+": Variant");
+        assertEquals(extension, langTag.getExtension(), impl+": Extension");
+        assertEquals(privateuse, langTag.getPrivateUse(), impl+": Private use");
+    }
+
+    /**
+     * Test execution for LangTagRFC5646 on one exact input string:
+     * parse it, check each part against the expected value, then check
+     * that {@code str()} gives the expected formatted string.
+     */
+    private static void test1(String langString, String formatted, String 
lang, String script, String region, String variant, String extension, String 
privateuse) {
+        LangTag langTag = LangTagRFC5646.create(langString);
+        assertNotNull(langTag);
+        assertEquals(lang, langTag.getLanguage(), "Lang");
+        assertEquals(script, langTag.getScript(), "Script");
+        assertEquals(region, langTag.getRegion(), "Region");
+        assertEquals(variant, langTag.getVariant(), "Variant");
+        assertEquals(extension, langTag.getExtension(), "Extension");
+        assertEquals(privateuse, langTag.getPrivateUse(), "Private use");
+        String f = langTag.str();
+        assertEquals(formatted, f, "String formatted");
+    }
+
+    /**
+     * Check that {@code langString} formats to {@code expected}, using both the
+     * {@code LangTagRFC5646} parser ({@code str()}) and {@code LangTags.basicFormat}.
+     */
+    private static void testFormatting(String langString, String expected) {
+        // Formatting.
+        // Already in test1 but redoing it allows a check between the two formatters.
+        LangTag langTag = LangTagRFC5646.create(langString);
+        // Build formatted language tag.
+        String fmt1 = langTag.str();
+        assertEquals(expected, fmt1, "RFC5646 parser format");
+        // Formatting using the general algorithm of RFC5646.
+        String fmt2 = LangTags.basicFormat(langString);
+        assertEquals(expected, fmt2, "RFC5646 basic algorithm");
+    }
+}
diff --git 
a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java
new file mode 100644
index 0000000000..db7c4b0377
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagFormat.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Function;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+// JUnit4
+// Junit5 is missing @ParameterizedClass which may arrive eventually
+@RunWith(Parameterized.class)
+public class TestLangTagFormat {
+
+    private static Function<String, String> formatter1 = (s)-> 
LangTagRFC5646.create(s).str();
+    private static Function<String, String> formatter2 = (s)-> 
LangTags.basicFormat(s);
+
+    /**
+     * Parameters for the test class: one row per formatter, each row being
+     * a display name (used in the test name via {@code {0}}) and the formatter function.
+     */
+    @Parameters(name = "{index}: {0}")
+    public static Iterable<Object[]> data() {
+        List<Object[]> x = new ArrayList<>() ;
+
+        x.add(new Object[] {"LangTagRFC5646", formatter1});
+        // NOTE(review): formatter2 calls LangTags.basicFormat; the label "LangTagOps"
+        // does not match a class name in this module's file list -- confirm the intended label.
+        x.add(new Object[] {"LangTagOps", formatter2});
+        return x ;
+    }
+
+    private final String formatterName;
+    private final Function<String, String> formatter;
+
+    /**
+     * Called by the Parameterized runner with one row from {@code data()}.
+     *
+     * @param name      display name of the formatter, used as a prefix in assertion messages
+     * @param formatter function from an input language tag string to its formatted form
+     */
+    public TestLangTagFormat(String name, Function<String, String> formatter) {
+        this.formatterName = name;
+        this.formatter = formatter;
+    }
+
+    @Test public void testBasicFormat01() { test("de", "de"); }
+    @Test public void testBasicFormat02() { test("FR", "fr"); }
+    @Test public void testBasicFormat03() { test("jA", "ja"); }
+    @Test public void testBasicFormat04() { test("de-DE", "de-DE"); }
+    @Test public void testBasicFormat05() { test("en-US", "en-US"); }
+    @Test public void testBasicFormat06() { test("en-US-variant", 
"en-US-variant"); }
+
+    // 419 is a region.
+    @Test public void testBasicFormat10() { test("es-419", "es-419"); }
+    @Test public void testBasicFormat11() { test("es-latn-419", 
"es-Latn-419"); }
+
+    @Test public void testBasicFormat90() { test("en-GB-oed", "en-GB-oed"); }
+    @Test public void testBasicFormat91() { test("EN-gb-OED", "en-GB-oed"); }
+
+    // Taken from the examples in RFC 5646
+    @Test public void testBasicFormat20() { test("zh-hant",         
"zh-Hant"); }
+    @Test public void testBasicFormat21() { test("sr-cyrl",         
"sr-Cyrl"); }
+    @Test public void testBasicFormat22() { test("sr-latn",         
"sr-Latn"); }
+    @Test public void testBasicFormat23() { test("zh-cmn-hans-cn",  
"zh-cmn-Hans-CN"); }
+    @Test public void testBasicFormat24() { test("cmn-hans-cn",     
"cmn-Hans-CN"); }
+    @Test public void testBasicFormat25() { test("zh-yue-hk",       
"zh-yue-HK"); }
+    @Test public void testBasicFormat26() { test("yue-hk",          "yue-HK"); 
}
+    @Test public void testBasicFormat27() { test("zh-hans-cn",      
"zh-Hans-CN"); }
+    @Test public void testBasicFormat28() { test("sr-latn-rs",      
"sr-Latn-RS"); }
+    @Test public void testBasicFormat29() { test("sl-rozaj",        
"sl-rozaj"); }
+    @Test public void testBasicFormat30() { test("sl-rozaj-biske",  
"sl-rozaj-biske"); }
+    @Test public void testBasicFormat31() { test("de-ch-1901",      
"de-CH-1901"); }
+    @Test public void testBasicFormat32() { test("sl-it-nedis",     
"sl-IT-nedis"); }
+    @Test public void testBasicFormat33() { test("hy-latn-it-arevela",      
"hy-Latn-IT-arevela"); }
+    @Test public void testBasicFormat34() { test("de-ch-x-phonebk",         
"de-CH-x-phonebk"); }
+    @Test public void testBasicFormat35() { test("az-arab-x-aze-derbend",   
"az-Arab-x-aze-derbend"); }
+    @Test public void testBasicFormat36() { test("x-whatever",              
"x-whatever"); }
+    @Test public void testBasicFormat37() { test("qaa-qaaa-qm-x-southern",  
"qaa-Qaaa-QM-x-southern"); }
+    @Test public void testBasicFormat38() { test("de-qaaa",         
"de-Qaaa"); }
+    @Test public void testBasicFormat39() { test("en-us-u-islamcal",        
"en-US-u-islamcal"); }
+    @Test public void testBasicFormat40() { test("zh-cn-a-myext-x-private", 
"zh-CN-a-myext-x-private"); }
+    @Test public void testBasicFormat41() { test("en-a-myext-b-another",    
"en-a-myext-b-another"); }
+    @Test public void testBasicFormat42() { test("en-123",          "en-123"); 
}
+    @Test public void testBasicFormat43() { test("en-1234",         
"en-1234"); }
+    @Test public void testBasicFormat44() { test("en-brs-xxx-latn-gb",      
"en-brs-xxx-Latn-GB"); }
+    @Test public void testBasicFormat45() { test("EN-LATN",         
"en-Latn"); }
+    @Test public void testBasicFormat46() { test("en-latn-gb",      
"en-Latn-GB"); }
+    @Test public void testBasicFormat47() { test("de-ch-w-extend",  
"de-CH-w-extend"); }
+    @Test public void testBasicFormat48() { test("de-ch-x-phonebk-morech",  
"de-CH-x-phonebk-morech"); }
+    @Test public void testBasicFormat49() { test("x-private",       
"x-private"); }
+    @Test public void testBasicFormat50() { test("az-latn-x-latn",  
"az-Latn-x-latn"); }
+    @Test public void testBasicFormat51() { test("en-latn-X-DaTa",  
"en-Latn-x-data"); }
+
+    @Test public void irregular_01() { test("SGN-BE-FR",    "sgn-BE-FR"); }
+    @Test public void irregular_02() { test("sgn-be-fr",    "sgn-BE-FR"); }
+    @Test public void irregular_03() { test("sgn-be-nl",    "sgn-BE-NL"); }
+    @Test public void irregular_04() { test("sgn-ch-de",    "sgn-CH-DE"); }
+    @Test public void irregular_05() { test("i-klingon",    "i-klingon"); }
+
+    // Mentioned in RFC 4646
+    @Test public void parseCanonical_01() { test("en-ca-x-ca",          
"en-CA-x-ca"); }
+    @Test public void parseCanonical_02() { test("EN-ca-X-Ca",          
"en-CA-x-ca"); }
+    @Test public void parseCanonical_03() { test("En-Ca-X-Ca",          
"en-CA-x-ca"); }
+    @Test public void parseCanonical_04() { test("AZ-latn-x-LATN",      
"az-Latn-x-latn"); }
+    @Test public void parseCanonical_05() { test("Az-latn-X-Latn",      
"az-Latn-x-latn"); }
+
+    @Test public void parseCanonical_10() { test("zh-hant",             
"zh-Hant"); }
+    @Test public void parseCanonical_11() { test("zh-latn-wadegile",    
"zh-Latn-wadegile"); }
+    @Test public void parseCanonical_12() { test("zh-latn-pinyin",      
"zh-Latn-pinyin"); }
+    @Test public void parseCanonical_13() { test("en-us",               
"en-US"); }
+    @Test public void parseCanonical_14() { test("EN-Gb",               
"en-GB"); }
+    @Test public void parseCanonical_15() { test("qqq-002",             
"qqq-002"); }
+    @Test public void parseCanonical_16() { test("ja-latn",             
"ja-Latn"); }
+    @Test public void parseCanonical_17() { test("x-local",             
"x-local"); }
+    @Test public void parseCanonical_18() { test("he-latn",             
"he-Latn"); }
+    @Test public void parseCanonical_19() { test("und",                 
"und"); }
+    @Test public void parseCanonical_20() { test("nn",                  "nn"); 
}
+    @Test public void parseCanonical_21() { test("ko-latn",             
"ko-Latn"); }
+    @Test public void parseCanonical_22() { test("ar-latn",             
"ar-Latn"); }
+    @Test public void parseCanonical_23() { test("la-x-liturgic",       
"la-x-liturgic"); }
+    @Test public void parseCanonical_24() { test("fa-x-middle",         
"fa-x-middle"); }
+    @Test public void parseCanonical_25() { test("qqq-142",             
"qqq-142"); }
+    @Test public void parseCanonical_26() { test("bnt",                 
"bnt"); }
+    @Test public void parseCanonical_27() { test("grc-x-liturgic",      
"grc-x-liturgic"); }
+    @Test public void parseCanonical_28() { test("egy-Latn",            
"egy-Latn"); }
+    @Test public void parseCanonical_29() { test("la-x-medieval",       
"la-x-medieval"); }
+
+    /**
+     * Apply the formatter under test to {@code langString} and check the result is {@code expected}.
+     * The assertion message is prefixed with the formatter name and the input.
+     */
+    private void test(String langString, String expected) {
+        String result = formatter.apply(langString);
+        // JUnit4 argument order.
+        org.junit.Assert.assertEquals(formatterName+"("+langString+"): ", 
expected, result);
+    }
+}
diff --git 
a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java 
b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java
new file mode 100644
index 0000000000..9db126f90f
--- /dev/null
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTagsOps.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.langtag;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.junit.jupiter.api.Test;
+
+public class TestLangTagsOps {
+    @Test
+    public void sameLangTag_01() {
+        LangTag langTag1 = LangTags.of("en-GB");
+        LangTag langTag2 = LangTags.of("en-GB");
+        sameLangTag(langTag1, langTag2, true,  true, true);
+    }
+
+    @Test
+    public void sameLangTag_02() {
+        LangTag langTag1 = LangTags.of("en-GB");
+        LangTag langTag2 = LangTags.of("en-gb");
+        sameLangTag(langTag1, langTag2, true,  false, false);
+    }
+
+    @Test
+    public void sameLangTag_03() {
+        LangTag langTag1 = LangTags.of("en-GB-Latn");
+        LangTag langTag2 = LangTags.of("en-gb");
+        sameLangTag(langTag1, langTag2, false,  false, false);
+    }
+
+    private static void sameLangTag(LangTag langTag1, LangTag langTag2, 
boolean sameAs, boolean equals, boolean sameHash) {
+        if ( sameAs )
+            assertTrue(LangTags.sameLangTagAs(langTag1, langTag2));
+        else
+            assertFalse(LangTags.sameLangTagAs(langTag1, langTag2));
+        if ( equals )
+            assertTrue(langTag1.equals(langTag2));
+        else
+            assertFalse(langTag1.equals(langTag2));
+        if ( sameHash )
+            assertEquals(langTag1.hashCode(), langTag2.hashCode());
+        // No "hash must be different"
+    }
+}

(jena) 01/02: GH-3086: Module jena-langtag

Reply via email to