This is an automated email from the ASF dual-hosted git repository.
andy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/jena.git
The following commit(s) were added to refs/heads/main by this push:
new 1691d7d2db GH-3137: Accept language primary tag with 4 to 8 characters
1691d7d2db is described below
commit 1691d7d2db6d1aa47b73536e0f748388ea18e0ad
Author: Andy Seaborne <[email protected]>
AuthorDate: Mon Apr 21 18:02:22 2025 +0100
GH-3137: Accept language primary tag with 4 to 8 characters
---
.../org/apache/jena/langtag/LangTagRFC5646.java | 72 ++++++++++++++--------
.../java/org/apache/jena/langtag/TestLangTag.java | 12 +++-
2 files changed, 57 insertions(+), 27 deletions(-)
diff --git a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
index 783df7e5ea..23e33a62f4 100644
--- a/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
+++ b/jena-langtag/src/main/java/org/apache/jena/langtag/LangTagRFC5646.java
@@ -237,23 +237,43 @@ public final class LangTagRFC5646 implements LangTag{
LangTagRFC5646 langtag = new LangTagRFC5646(string);
final int N = string.length();
- // Language-Tag = langtag ; normal language tags
- // / privateuse ; private use tag
- // / grandfathered ; grandfathered tags
-
- // langtag = language
- // ["-" script]
- // ["-" region]
- // *("-" variant)
- // *("-" extension)
- // ["-" privateuse]
-
- // script = 4ALPHA ; ISO 15924 code
- // region = 2ALPHA ; ISO 3166-1 code
- // / 3DIGIT ; UN M.49 code
- // variant = 5*8alphanum ; registered variants
- // / (DIGIT 3alphanum)
- // extension = singleton 1*("-" (2*8alphanum))
+ // @formatter:off
+ // langtag = language
+ // ["-" script]
+ // ["-" region]
+ // *("-" variant)
+ // *("-" extension)
+ // ["-" privateuse]
+ //
+ // language = 2*3ALPHA ; shortest ISO 639 code
+ // ["-" extlang] ; sometimes followed by
+ // ; extended language subtags
+ // / 4ALPHA ; or reserved for future use
+ // / 5*8ALPHA ; or registered language subtag
+ //
+ // extlang = 3ALPHA ; selected ISO 639 codes
+ // *2("-" 3ALPHA) ; permanently reserved
+ //
+ // script = 4ALPHA ; ISO 15924 code
+ //
+ // region = 2ALPHA ; ISO 3166-1 code
+ // / 3DIGIT ; UN M.49 code
+ //
+ // variant = 5*8alphanum ; registered variants
+ // / (DIGIT 3alphanum)
+ //
+ // extension = singleton 1*("-" (2*8alphanum))
+ //
+ // ; Single alphanumerics
+ // ; "x" reserved for private use
+ // singleton = DIGIT ; 0 - 9
+ // / %x41-57 ; A - W
+ // / %x59-5A ; Y - Z
+ // / %x61-77 ; a - w
+ // / %x79-7A ; y - z
+ //
+ // privateuse = "x" 1*("-" (1*8alphanum))
+ // @formatter:on
if ( N == 0 )
InternalLangTag.error("Empty string");
@@ -325,13 +345,15 @@ public final class LangTagRFC5646 implements LangTag{
InternalLangTag.error("Trailing characters in private
langtag: '%s'", string.substring(langtag.privateuse1));
return langtag;
}
+ // else
InternalLangTag.error("Language part is 1 character: it must be
2-3 characters (4-8 reserved for future use), \"x-\", or a recognized
grandfathered tag");
}
+ if ( segLen > 8 )
+ InternalLangTag.error("Language too long (2-3 characters, 4-8
reserved for future use)");
+
if ( idx2 < 0 ) {
// language only.
- if ( segLen > 8 )
- InternalLangTag.error("Language too long (2-3 characters, 4-8
reserved for future use)");
langtag.language0 = 0;
langtag.language1 = N;
InternalLangTag.checkAlpha(string, N, langtag.language0,
langtag.language1);
@@ -341,9 +363,6 @@ public final class LangTagRFC5646 implements LangTag{
if ( idx == idx2 )
InternalLangTag.error("Can not find the language subtag: '%s'",
string);
- if ( segLen < 2 || segLen > 4 )
- InternalLangTag.error("Language: '%s'", string);
-
langtag.language0 = idx;
if ( segLen == 2 || segLen == 3 ) {
@@ -360,10 +379,15 @@ public final class LangTagRFC5646 implements LangTag{
idx2 = extEnd;
InternalLangTag.checkAlphaMinus(string, N, extStart,
langtag.language1);
}
- } else if ( segLen > 8 ) {
+ } else if ( segLen >= 4 && segLen <= 8 ) {
+ // / 4ALPHA ; or reserved for future use
+ // / 5*8ALPHA ; or registered language subtag
+ // Dubious.
+ InternalLangTag.checkAlpha(string, N, langtag.language0, idx2);
+ } else {
InternalLangTag.error("Language too long (2-3 characters, 4-8
reserved for future use)");
}
- // -- extlang
+
langtag.language1 = idx2;
// Info
noteSegment("language", string, langtag.language0, langtag.language1);
diff --git a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
index c71ae4d605..2b2c35bddd 100644
--- a/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
+++ b/jena-langtag/src/test/java/org/apache/jena/langtag/TestLangTag.java
@@ -49,6 +49,12 @@ public class TestLangTag {
@Test public void test_lang_basic_22() { testPrivateUse("az-Latn-x-latn",
"az-Latn-x-latn", "az", "Latn", null, null, null, "x-latn"); }
@Test public void test_lang_basic_23() { testPrivateUse("sss-x-y",
"sss-x-y", "sss", null, null, null, null, "x-y"); }
+
+ // 4 chars reserved
+ // 5-8 characters
+ @Test public void test_lang_basic_30() { testRFC5646("abcd", "abcd",
"abcd", null, null, null, null); }
+ @Test public void test_lang_basic_31() { testRFC5646("abcdefgh",
"abcdefgh", "abcdefgh", null, null, null, null); }
+
@Test public void test_lang_bad_01() { testBad("123"); }
@Test public void test_lang_bad_02() { testBad("abcdefghijklmn"); }
@Test public void test_lang_bad_03() { testBad("abcdefghijklmn-123"); }
@@ -64,9 +70,9 @@ public class TestLangTag {
// Wrong lengths
@Test public void test_lang_bad_20() { testBad("s"); }
- @Test public void test_lang_bad_21() { testBad("abcdefghi"); }
- @Test public void test_lang_bad_22() { testBad("en-abcdefghi"); }
- @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghi"); }
+ @Test public void test_lang_bad_21() { testBad("abcdefghz"); }
+ @Test public void test_lang_bad_22() { testBad("en-abcdefghz"); }
+ @Test public void test_lang_bad_23() { testBad("en-Latn-x-abcdefghz"); }
// Bad extension
@Test public void test_lang_bad_31() { testBad("sss-d"); }