This is an automated email from the ASF dual-hosted git repository.

andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git


The following commit(s) were added to refs/heads/main by this push:
     new 1691d7d2db GH-3137: Accept language primary tag with 4 to 8 characters
1691d7d2db is described below

commit 1691d7d2db6d1aa47b73536e0f748388ea18e0ad
Author: Andy Seaborne <[email protected]>
AuthorDate: Mon Apr 21 18:02:22 2025 +0100

    GH-3137: Accept language primary tag with 4 to 8 characters
---
 .../org/apache/jena/langtag/LangTagRFC5646.java    | 72 ++++++++++++++--------
 .../java/org/apache/jena/langtag/TestLangTag.java  | 12 +++-
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
index 783df7e5ea..23e33a62f4 100644
--- a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
@@ -237,23 +237,43 @@ public final  class LangTagRFC5646 implements LangTag{
 
         LangTagRFC5646 langtag = new LangTagRFC5646(string);
         final int N = string.length();
-        // Language-Tag  = langtag             ; normal language tags
-        //               / privateuse          ; private use tag
-        //               / grandfathered       ; grandfathered tags
-
-        // langtag       = language
-        //                 ["-" script]
-        //                 ["-" region]
-        //                 *("-" variant)
-        //                 *("-" extension)
-        //                 ["-" privateuse]
-
-        // script        = 4ALPHA              ; ISO 15924 code
-        // region        = 2ALPHA              ; ISO 3166-1 code
-        //               / 3DIGIT              ; UN M.49 code
-        // variant       = 5*8alphanum         ; registered variants
-        //               / (DIGIT 3alphanum)
-        // extension     = singleton 1*("-" (2*8alphanum))
+        // @formatter:off
+        //         langtag       = language
+        //                         ["-" script]
+        //                         ["-" region]
+        //                         *("-" variant)
+        //                         *("-" extension)
+        //                         ["-" privateuse]
+        //
+        //         language      = 2*3ALPHA            ; shortest ISO 639 code
+        //                         ["-" extlang]       ; sometimes followed by
+        //                                             ; extended language subtags
+        //                       / 4ALPHA              ; or reserved for future use
+        //                       / 5*8ALPHA            ; or registered language subtag
+        //
+        //         extlang       = 3ALPHA              ; selected ISO 639 codes
+        //                         *2("-" 3ALPHA)      ; permanently reserved
+        //
+        //         script        = 4ALPHA              ; ISO 15924 code
+        //
+        //         region        = 2ALPHA              ; ISO 3166-1 code
+        //                       / 3DIGIT              ; UN M.49 code
+        //
+        //         variant       = 5*8alphanum         ; registered variants
+        //                       / (DIGIT 3alphanum)
+        //
+        //         extension     = singleton 1*("-" (2*8alphanum))
+        //
+        //                                             ; Single alphanumerics
+        //                                             ; "x" reserved for private use
+        //         singleton     = DIGIT               ; 0 - 9
+        //                       / %x41-57             ; A - W
+        //                       / %x59-5A             ; Y - Z
+        //                       / %x61-77             ; a - w
+        //                       / %x79-7A             ; y - z
+        //
+        //         privateuse    = "x" 1*("-" (1*8alphanum))
+        // @formatter:on
 
         if ( N == 0 )
             InternalLangTag.error("Empty string");
@@ -325,13 +345,15 @@ public final  class LangTagRFC5646 implements LangTag{
                     InternalLangTag.error("Trailing characters in private langtag: '%s'", string.substring(langtag.privateuse1));
                 return langtag;
             }
+            // else
             InternalLangTag.error("Language part is 1 character: it must be 2-3 characters (4-8 reserved for future use), \"x-\", or a recognized grandfathered tag");
         }
 
+        if ( segLen > 8 )
+            InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
+
         if ( idx2 < 0 ) {
             // language only.
-            if ( segLen > 8 )
-                InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
             langtag.language0 = 0;
             langtag.language1 = N;
             InternalLangTag.checkAlpha(string, N, langtag.language0, langtag.language1);
@@ -341,9 +363,6 @@ public final  class LangTagRFC5646 implements LangTag{
         if ( idx == idx2 )
             InternalLangTag.error("Can not find the language subtag: '%s'", string);
 
-        if ( segLen < 2 || segLen > 4 )
-            InternalLangTag.error("Language: '%s'", string);
-
         langtag.language0 = idx;
 
         if ( segLen == 2 || segLen == 3 ) {
@@ -360,10 +379,15 @@ public final  class LangTagRFC5646 implements LangTag{
                 idx2 = extEnd;
                 InternalLangTag.checkAlphaMinus(string, N, extStart, langtag.language1);
             }
-        } else if ( segLen > 8 ) {
+        } else if ( segLen >= 4 && segLen <= 8 ) {
+            //                       / 4ALPHA              ; or reserved for future use
+            //                       / 5*8ALPHA            ; or registered language subtag
+            // Dubious.
+            InternalLangTag.checkAlpha(string, N, langtag.language0, idx2);
+        } else {
             InternalLangTag.error("Language too long (2-3 characters, 4-8 reserved for future use)");
         }
-        // -- extlang
+
         langtag.language1 = idx2;
         // Info
         noteSegment("language", string, langtag.language0, langtag.language1);
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
index c71ae4d605..2b2c35bddd 100644
--- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
@@ -49,6 +49,12 @@ public class TestLangTag {
     @Test public void test_lang_basic_22() { testPrivateUse("az-Latn-x-latn", "az-Latn-x-latn", "az", "Latn", null, null, null, "x-latn"); }
     @Test public void test_lang_basic_23() { testPrivateUse("sss-x-y", "sss-x-y", "sss", null, null, null, null, "x-y"); }
 
+
+    // 4 chars reserved
+    // 5-8 characters
+    @Test public void test_lang_basic_30() { testRFC5646("abcd", "abcd",            "abcd", null, null, null, null); }
+    @Test public void test_lang_basic_31() { testRFC5646("abcdefgh", "abcdefgh",    "abcdefgh", null, null, null, null); }
+
     @Test public void test_lang_bad_01() { testBad("123"); }
     @Test public void test_lang_bad_02() { testBad("abcdefghijklmn"); }
     @Test public void test_lang_bad_03() { testBad("abcdefghijklmn-123"); }
@@ -64,9 +70,9 @@ public class TestLangTag {
 
     // Wrong lengths
     @Test public void test_lang_bad_20() { testBad("s"); }
-    @Test public void test_lang_bad_21() { testBad("abcdefghi"); }
-    @Test public void test_lang_bad_22() { testBad("en-abcdefghi"); }
-    @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghi"); }
+    @Test public void test_lang_bad_21() { testBad("abcdefghz"); }
+    @Test public void test_lang_bad_22() { testBad("en-abcdefghz"); }
+    @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghz"); }
 
     // Bad extension
     @Test public void test_lang_bad_31() { testBad("sss-d"); }

Reply via email to