Re: [PR] Case-insensitive language tags [jena]

via GitHub Tue, 26 Dec 2023 09:04:43 -0800


kinow commented on code in PR #2134:
URL: https://github.com/apache/jena/pull/2134#discussion_r1436532343



##########
jena-arq/src/test/java/org/apache/jena/riot/TestRDFParser.java:
##########
@@ -231,17 +231,17 @@ private RDFParserBuilder builder() {
         testNormalization("+123.00e0", "1.23E2", 
builder().canonicalValues(true));
     }
 
-    @Test public void canonical_langTag_1() {
-        testNormalization("'abc'@En-gB", "'abc'@En-gB", 
builder().langTagAsGiven());
-    }
-
-    @Test public void canonical_langTag_2() {
-        testNormalization("'abc'@En-gB", "'abc'@en-gb", 
builder().langTagLowerCase());
-    }
-
-    @Test public void canonical_langTag_3() {
-        testNormalization("'abc'@En-gB", "'abc'@en-GB", 
builder().langTagCanonical());
-    }
+//    @Test public void canonical_langTag_1() {
+//        testNormalization("'abc'@En-gB", "'abc'@En-gB", 
builder().langTagAsGiven());
+//    }
+//
+//    @Test public void canonical_langTag_2() {
+//        testNormalization("'abc'@En-gB", "'abc'@en-gb", 
builder().langTagLowerCase());
+//    }
+//
+//    @Test public void canonical_langTag_3() {
+//        testNormalization("'abc'@En-gB", "'abc'@en-GB", 
builder().langTagCanonical());
+//    }

Review Comment:
   Delete? :point_up: 



##########
jena-arq/src/main/java/org/apache/jena/sparql/util/StringUtils.java:
##########
@@ -48,28 +48,28 @@ public static String str(float value)
     {
         return decimalFormat.format(value) ;
     }
-    
+
     public static String str(double value)
     {
         return decimalFormat.format(value) ;
     }
-    
+
     public static <T> String str(T[] array)
     {
         return Arrays.asList(array).toString() ;
     }
 
-    private static Pattern p = Pattern.compile("http:[^ \n]*[#/]([^/ \n]*)") ;
-    /** Abbreviate, crudely, URI in strings, leaving only their last 
component. */ 
+    private static Pattern p = Pattern.compile("https?:[^ \n]*[#/]([^/ \n]*)") 
;

Review Comment:
   Was it fixed by the IDE or is it correct/intentional? Shouldn't the `s` be 
optional, perhaps?



##########
jena-core/src/main/java/org/apache/jena/datatypes/xsd/impl/RDFDirLangString.java:
##########
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.jena.datatypes.xsd.impl;
+
+import org.apache.jena.datatypes.BaseDatatype ;
+import org.apache.jena.datatypes.RDFDatatype ;
+import org.apache.jena.graph.impl.LiteralLabel ;
+
+/**
+ * rdf:dirLangString (literal with language and initial text direction)
+ * This covers the unusual case of "foo"^^rdf:langString or 
"foo"^^rdf:dirLangString.
+ * When there is a language tag, there is a lexical form but it is in two 
parts lex@lang or lex@lang--ltr
+ */
+
+public class RDFDirLangString extends BaseDatatype implements RDFDatatype {
+
+    /** Singleton instance */
+    // Include the string for the RDF namespace, not use RDF.getURI(), to 
avoid an initializer circularity

Review Comment:
   s/not/do not



##########
jena-arq/src/test/java/org/apache/jena/sparql/expr/TestExpressions.java:
##########
@@ -346,9 +346,29 @@ public class TestExpressions
     @Test public void boolean_126() { testBoolean("datatype('fred') = 
<"+XSD.xstring.getURI()+">", true) ; }
     @Test public void boolean_127() { 
testBoolean("datatype('fred'^^<urn:test:foo>) = <urn:test:foo>", true) ; }
     @Test public void boolean_128() { testBoolean("datatype('fred'^^<foo>) = 
<Foo>", false) ; }
-    @Test public void string_15() { testString("lang('fred'@en)", "en") ; }
-    @Test public void string_16() { testString("lang('fred'@en-uk)", "en-uk") 
; }
-    @Test public void string_17() { testString("lang('fred')", "") ; }
+
+    @Test public void lang_01() { testString("LANG('tea time'@en)", "en") ; }
+    // Aside For some strange reason, the language code is GB not UK.
+    // The state is UK! "The United Kingdom of Great Britain and Norther 
Ireland."
+    // The four countries England, Scotland, Wales and Northern Ireland (since 
1922).
+    // It's complicated: https://en.wikipedia.org/wiki/United_Kingdom

Review Comment:
   I don't even want to start digging into that. I tried learning how that all 
works... then there is London that has a special economy (is that a state? A 
special state?). I think there are also overseas territories like Falklands and 
Gibraltar... not sure if the language code there is GB too? Must be, right?



##########
jena-base/src/main/java/org/apache/jena/atlas/lib/Lib.java:
##########
@@ -112,25 +113,42 @@ static public UnsupportedOperationException 
unsupportedMethod(Object object, Str
     /** Do two lists have the same elements without considering the order of 
the lists nor duplicates? */
     public static <T> boolean equalsListAsSet(List<T> list1, List<T> list2) {
         if ( list1 == null && list2 == null )
-            return true ;
-        if ( list1 == null ) return false ;
-        if ( list2 == null ) return false ;
-        return list1.containsAll(list2) && list2.containsAll(list1) ;
+            return true;
+        if ( list1 == null ) return false;
+        if ( list2 == null ) return false;
+        return list1.containsAll(list2) && list2.containsAll(list1);
     }
 
     /** HashCode - allow nulls */
-    public static final int hashCodeObject(Object obj) { return 
hashCodeObject(obj, -4) ; }
+    public static final int hashCodeObject(Object obj) { return 
hashCodeObject(obj, -4); }
 
     /** HashCode - allow nulls */
     public static final int hashCodeObject(Object obj, int nullHashCode) {
         if ( obj == null )
-            return nullHashCode ;
-        return obj.hashCode() ;
+            return nullHashCode;
+        return obj.hashCode();
+    }
+
+    public static boolean isEmpty(CharSequence cs) {
+        // Hide implementation
+        return StringUtils.isEmpty(cs);
+    }
+
+    /** Non-locale lowercase {@link Locale#ROOT} */
+    public static String lowercase(String string) {
+        // Hide implementation
+        return string.toLowerCase(Locale.ROOT);
+    }
+
+    /** Non-locale uppercase {@link Locale#ROOT} */
+    public static String uppercase(String string) {
+        // Hide implementation
+        return string.toUpperCase(Locale.ROOT);

Review Comment:
   No need to handle codepoints instead of strings?



##########
jena-core/src/main/java/org/apache/jena/graph/impl/LiteralLabel.java:
##########
@@ -451,61 +479,100 @@ public boolean sameValueAs( LiteralLabel other ) {
      * @return
      */
     private static boolean sameValueAs(LiteralLabel lit1, LiteralLabel lit2) {
-        //return  lit1.sameValueAs(lit2) ;
+        //return  lit1.sameValueAs(lit2);
         if ( lit1 == null )
-            throw new NullPointerException() ;
+            throw new NullPointerException();
         if ( lit2 == null )
-            throw new NullPointerException() ;
-        // Strings.
+            throw new NullPointerException();
+        // -- Strings.
         if ( isStringValue(lit1) && isStringValue(lit2) )
-            return lit1.getLexicalForm().equals(lit2.getLexicalForm()) ;
-
-        if ( isStringValue(lit1) ) return false ;
-        if ( isStringValue(lit2) ) return false ;
+            return lit1.getLexicalForm().equals(lit2.getLexicalForm());
+        else {
+            if ( isStringValue(lit1) ) return false;
+            if ( isStringValue(lit2) ) return false;
+        }
 
-        // Language tag strings
+        // -- Language tag strings
         if ( isLangString(lit1) && isLangString(lit2) ) {
-            String lex1 = lit1.getLexicalForm() ;
-            String lex2 = lit2.getLexicalForm() ;
-            return lex1.equals(lex2) && 
lit1.language().equalsIgnoreCase(lit2.language()) ;
+            String lex1 = lit1.getLexicalForm();
+            String lex2 = lit2.getLexicalForm();
+            //return lex1.equals(lex2) && 
lit1.language().equalsIgnoreCase(lit2.language());
+            // Normalized language tags.
+            return lex1.equals(lex2)
+                    && lit1.language().equalsIgnoreCase(lit2.language());
+        } else {
+            if ( isLangString(lit1) ) return false;
+            if ( isLangString(lit2) ) return false;
         }
-        if ( isLangString(lit1) ) return false ;
-        if ( isLangString(lit2) ) return false ;
 
-        // Both not strings, not lang strings.
+        // -- Language tag strings with initial text direction
+        if ( isLangStringDir(lit1) && isLangStringDir(lit2) ) {
+            String lex1 = lit1.getLexicalForm();
+            String lex2 = lit2.getLexicalForm();
+            return lex1.equals(lex2)
+                    && lit1.language().equalsIgnoreCase(lit2.language())
+                    && 
lit1.initialTextDirection().equals(lit2.initialTextDirection());
+        } else {
+            if ( isLangStringDir(lit1) ) return false;
+            if ( isLangStringDir(lit2) ) return false;
+        }
+
+        // -- datatypes.
+        // Both not strings, not lang strings and not dirlang strings.
         // Datatype set.
         if ( lit1.isWellFormedRaw() && lit2.isWellFormedRaw() )
             // Both well-formed.
-            return lit1.getDatatype().isEqual(lit1, lit2) ;
+            return lit1.getDatatype().isEqual(lit1, lit2);
         if ( ! lit1.isWellFormedRaw() && ! lit2.isWellFormedRaw() )
-            return lit1.equals(lit2) ;
+            return lit1.equals(lit2);
         // One is well formed, the other is not.
-        return false ;
+        return false;
     }
 
     /** Return true if the literal label is a string value (RDF 1.0 and RDF 
1.1) */
     private static boolean isStringValue(LiteralLabel lit) {
         if ( lit.getDatatype() == null )
             // RDF 1.0
-            return ! isLangString(lit) ;
+            return ! isLangString(lit);
         if ( lit.getDatatype().equals(XSDDatatype.XSDstring)  )
             return true;
-        return false ;
+        return false;
     }
 
-    /** Return true if the literal label is a language string. */
+    /**
+     * Return true if the literal label is a well-formed language string 
(rdf:langString).
+     * Language strings do not have an initial text direction.
+     * This test excludes "abc"^^rdf:langString (not well-formed).
+     */
     private static boolean isLangString(LiteralLabel lit) {
         // Duplicate of Util.isLangString except for the additional 
consistency check.
-        String lang = lit.language() ;
-        if ( lang == null )
-            return false ;
-        // Check.
-        if ( lang.equals("") )
-            return false ;
-        // This is an additional check.
+        if ( isEmpty(lit.language()) )
+            return false;
+        if ( lit.initialTextDirection() != null )
+            // Has an initial text direction so it is n't
+            return false;
+        // Internal check.
         if ( ! Objects.equals(lit.getDatatype(), RDF.dtLangString) )
-            throw new JenaException("Literal with language string which is not 
rdf:langString: "+lit) ;
-        return true ;
+            throw new JenaException("Literal with language string which is not 
rdf:langString: "+lit);
+        return true;
+    }
+
+    /**
+     * Return true if the literal label is a well-formed language string with 
text direction.
+     * This excludes "abc"^^rdf:dirLangString.
+     */
+    private static boolean isLangStringDir(LiteralLabel lit) {
+        // Assume well formed.
+        String lang = lit.language();
+        // Allow "abc"@--rtl
+//        if ( isEmpty(lit.language()) )
+//            return false;

Review Comment:
   Interesting! :point_up: 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Case-insensitive language tags [jena]

Reply via email to