[jackrabbit-oak] branch trunk updated: OAK-10384: Fix stripping of large indexed ordered properties (#1071)

amitj Sun, 10 Sep 2023 21:47:44 -0700

This is an automated email from the ASF dual-hosted git repository.

amitj pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git



The following commit(s) were added to refs/heads/trunk by this push:
     new 1e55c01b22 OAK-10384: Fix stripping of large indexed ordered properties (#1071)
1e55c01b22 is described below

commit 1e55c01b22396239653549b3684bd9d71c606307
Author: Amit Jain <[email protected]>
AuthorDate: Mon Sep 11 10:17:33 2023 +0530

    OAK-10384: Fix stripping of large indexed ordered properties (#1071)
    
    - Truncate BytesRef value and handle surrogates correctly (Code from Thomas Mueller)
---
 .../plugins/index/lucene/LuceneDocumentMaker.java  |  62 ++++++++++-
 .../lucene/LuceneLargeStringPropertyTest.java      | 116 +++++++++++++++++++--
 2 files changed, 163 insertions(+), 15 deletions(-)

diff --git 
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
 
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
index d41461c62e..06167fb03b 100644
--- 
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
+++ 
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
@@ -20,6 +20,7 @@
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -287,11 +288,10 @@ public class LuceneDocumentMaker extends 
FulltextDocumentMaker<Document> {
                         new 
BytesRef(property.getValue(Type.BOOLEAN).toString()));
             } else if (tag == Type.STRING.tag()) {
                 String stringValue = property.getValue(Type.STRING);
-                if (stringValue.length() > STRING_PROPERTY_MAX_LENGTH){
-                    log.warn("Truncating property {} having length {} at 
path:[{}] as it is > {}", name, stringValue.length(), this.path, 
STRING_PROPERTY_MAX_LENGTH);
-                    stringValue = stringValue.substring(0, 
STRING_PROPERTY_MAX_LENGTH);
-                }
-                f = new SortedDocValuesField(name, new BytesRef(stringValue));
+                // Truncate the value as lucene limits the length of a 
SortedDocValueField string to 
+                // STRING_PROPERTY_MAX_LENGTH(32766 bytes) and throws 
exception if over the limit
+                f = new SortedDocValuesField(name, getTruncatedBytesRef(name, 
stringValue, this.path,
+                        STRING_PROPERTY_MAX_LENGTH));
             }
 
             if (f != null && includePropertyValue(property, 0, pd)) {
@@ -316,6 +316,58 @@ public class LuceneDocumentMaker extends 
FulltextDocumentMaker<Document> {
         return fieldAdded;
     }
 
+    /**
+     * Encodes {@code value} as UTF-8 and returns it as a {@code BytesRef} of at most {@code maxLength} bytes,
+     * cutting only on a character boundary so that the result is always valid UTF-8.
+     *
+     * <p>The limit applies to the encoded bytes, not to the {@code String} length: Java strings are UTF-16,
+     * and the UTF-8 form of a string can take up to three bytes per {@code char}.
+     *
+     * <p>A multi-byte UTF-8 sequence has the form {@code 11xxxxxx 10xxxxxx [10xxxxxx [10xxxxxx]]}: one head
+     * byte followed by one to three continuation bytes, which are recognized with the binary mask
+     * {@code 11000000}. If the first byte beyond the limit is a continuation byte, the character it belongs
+     * to does not fit and is dropped as a whole; a character that ends exactly at the limit is kept.
+     *
+     * @param prop      the name of the property (only used for logging)
+     * @param value     the string property value to convert into a {@code BytesRef} object
+     * @param path      the path of the node (only used for logging)
+     * @param maxLength the maximum length in bytes of the returned {@code BytesRef}; below 1 the result is empty
+     * @return the {@code BytesRef} of {@code value}, shortened to a valid UTF-8 prefix if it was too long
+     */
+    protected static BytesRef getTruncatedBytesRef(String prop, String value, String path, int maxLength) {
+        BytesRef ref = new BytesRef(value);
+        if (ref.length <= maxLength) {
+            return ref;
+        }
+
+        log.trace("Property {} at path:[{}] has value {}", prop, path, value);
+        log.info("Truncating property {} at path:[{}] as length after encoding {} is > {} ",
+            prop, path, ref.length, maxLength);
+
+        // a limit below 1 leaves no room for any byte; clamp it so the index below stays in range
+        int limit = Math.max(maxLength, 0);
+        // 'end' is exclusive: bytes [0, end) are kept and ref.bytes[ref.offset + end] is the first byte cut off;
+        // the index is valid because ref.length > limit at this point
+        int end = limit;
+        // the cut is inside a multi-byte sequence: step back onto its head byte (at most 3 steps)
+        while (end > 0 && (ref.bytes[ref.offset + end] & 0b11000000) == 0b10000000) {
+            end--;
+        }
+        // rebuild the BytesRef from the decoded prefix
+        String truncated = new String(ref.bytes, ref.offset, end, StandardCharsets.UTF_8);
+        ref = new BytesRef(truncated);
+        log.trace("Truncated property {} at path:[{}] to {}", prop, path, truncated);
+
+        while (ref.length > limit) {
+            log.error("Truncation did not work: still {} bytes", ref.length);
+            // "emergency" procedure that should never happen; it may split a surrogate pair,
+            // but it never asks for a negative length and always ends at the empty string
+            truncated = truncated.substring(0, Math.max(0, truncated.length() - 10));
+            ref = new BytesRef(truncated);
+        }
+        return ref;
+    }
+
     private FacetsConfig getFacetsConfig(){
         return facetsConfigProvider.getFacetsConfig();
     }
diff --git 
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
 
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
index b49e26dded..9dda899abf 100644
--- 
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
+++ 
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
@@ -46,6 +46,7 @@ import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.FilterDirectory;
+import org.apache.lucene.util.BytesRef;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Rule;
@@ -56,6 +57,7 @@ import org.slf4j.event.Level;
 import java.io.File;
 import java.io.IOException;
 import java.text.MessageFormat;
+import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -66,6 +68,7 @@ import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFIN
 import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
 import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
 import static 
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
+import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneDocumentMaker.getTruncatedBytesRef;
 import static 
org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_NODE;
 import static org.junit.Assert.assertTrue;
 
@@ -212,19 +215,112 @@ public class LuceneLargeStringPropertyTest extends 
AbstractQueryTest {
         test.addChild("a").setProperty("propa", aVal);
         test.addChild("b").setProperty("propa", bVal);
         root.commit();
+        assertTruncation("propa", aVal, "/test/a", customizer);
+        // order of result should be first b and then a i.e. sorted on propa
+        assertQuery("select [jcr:path] from [nt:base] where contains(@propa, 
'abcd') order by propa", asList("/test/b", "/test/a"));
+    }
 
-        boolean truncationLogPresent = false;
-        String failureLog = MessageFormat.format("Truncating property :dv{0} 
having length {1,number,#} at path:[{2}] as it is > {3,number,#}",
-                "propa", aVal.length(), "/test/a", 
LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH);
-        for (String log : customizer.getLogs()) {
-            if (log.equals(failureLog)) {
-                truncationLogPresent = true;
-                break;
-            }
-        }
-        assertTrue(truncationLogPresent);
+    /**
+     * Indexes two oversized {@code propa} values containing non-ASCII characters and checks the sort order.
+     *
+     * <p>This test creates an ordered, analyzed index on the {@code propa} property and then adds two nodes
+     * with large values for this property. The first node's value starts with accented (multi-byte)
+     * characters followed by {@code STRING_PROPERTY_MAX_LENGTH} randomly generated characters.
+     * The second node's value ends with the supplementary character {@code "\uD800\uDF48"}, which is a
+     * surrogate pair (2 {@code char}s) in Java and takes up 4 bytes in UTF-8.
+     *
+     * <p>After committing, the truncation helpers are called for both nodes and a query ordered by
+     * {@code propa} must return {@code /test/b} before {@code /test/a}. NOTE(review): the helpers only
+     * return a boolean, which is ignored here, so the truncation log is not actually verified.
+     *
+     * @throws Exception if any error occurs during the test
+     */
+    @Test
+    public void truncateLargeUnicodeString() throws Exception {
+        Tree idx = createIndex("test1", of("propa"));
+        Tree tr = idx.addChild(PROP_NODE).addChild("propa");
+        tr.setProperty("ordered", true, Type.BOOLEAN); // the ordered case: without truncation, indexing reports that it can't index the node
+        tr.setProperty("analyzed", true, Type.BOOLEAN);
+        idx.addChild(PROP_NODE).addChild("propa"); // NOTE(review): repeats the addChild chain used for 'tr' above; looks redundant - confirm
+        root.commit();
+
+        Tree test = root.getTree("/").addChild("test");
+        int length = LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH;
+        String generatedString = RandomStringUtils.random(length, true, true);
+
+        // Large string with a multi-byte prefix followed by 'length' generated characters, so it exceeds the max length
+        String aVal ="abcd Mình nói tiếng Việt" + generatedString.substring(0, length);
+
+        // Large string which ends with the supplementary character `𐍈`, written as "\uD800\uDF48".
+        // This character takes 4 bytes in UTF-8 but is a surrogate pair (2 chars) in Java. The truncation
+        // is expected to cut the string `..xyz𐍈` to `..xyz` rather than split the character.
+        String bVal = "abcd " + generatedString.substring(0, length - 6) + "\uD800\uDF48";
+
+        test.addChild("a").setProperty("propa", aVal);
+        test.addChild("b").setProperty("propa", bVal);
+        root.commit();
+
+        assertTruncation("propa", aVal, "/test/a", customizer);
+        assertExtendedTruncation("propa", aVal, "/test/a", customizer);
+        assertTruncation("propa", bVal, "/test/b", customizer);
+        assertExtendedTruncation("propa", bVal, "/test/b", customizer);
         // order of result should be first b and then a i.e. sorted on propa
         assertQuery("select [jcr:path] from [nt:base] where contains(@propa, 'abcd') order by propa", asList("/test/b", "/test/a"));
     }
 
+    /** Truncates 100 seeded pseudo-random strings to 5 bytes; each result must be a non-empty prefix of its input. */
+    @Test
+    public void randomStringTruncation() {
+        Random random = new Random(1);
+        for (int round = 0; round < 100; round++) {
+            String input = randomUnicodeString(random, 5);
+            BytesRef truncated = getTruncatedBytesRef("x", input, "/x", 5);
+            assertTrue(truncated.length > 0 && truncated.length <= 5);
+            assertTrue(input.startsWith(truncated.utf8ToString())); // decodes to a prefix, so no character was split
+        }
+    }
+
+    /**
+     * Builds a string from {@code len} random symbols whose UTF-8 encodings are 1 to 4 bytes long;
+     * see https://en.wikipedia.org/wiki/UTF-8 for the encoding lengths.
+     *
+     * @param r   random source; exactly one {@code nextInt(6)} value is drawn per symbol
+     * @param len number of symbols to append (the 4-byte symbol contributes two {@code char}s)
+     * @return the generated string
+     */
+    private String randomUnicodeString(Random r, int len) {
+        StringBuilder result = new StringBuilder();
+        int remaining = len;
+        while (remaining-- > 0) {
+            int draw = r.nextInt(6);
+            if (draw == 2) {
+                result.append('£');              // 2 UTF-8 bytes
+            } else if (draw == 3) {
+                result.append('€');              // 3 UTF-8 bytes
+            } else if (draw == 4) {
+                result.append("\uD800\uDF48");   // 4 UTF-8 bytes
+            } else {
+                result.append('$');              // 0, 1 and 5: 1 UTF-8 byte (ASCII), the most frequent case
+            }
+        }
+        return result.toString();
+    }
+    
+    /** Returns whether the captured logs contain the "having length" truncation message for the given property and path; it asserts nothing itself. */
+    private static boolean assertTruncation(String prop, String val, String path, LogCustomizer customizer) {
+        // NOTE(review): callers in this patch ignore the result, and this text appears to match the log.warn the patch removes - confirm intent
+        Object[] args = {prop, val.length(), path, LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH};
+        String expected = MessageFormat.format("Truncating property :dv{0} having length {1,number,#} at path:[{2}] as it is > {3,number,#}", args);
+        return customizer.getLogs().contains(expected);
+    }
+
+    /** Returns whether the captured logs contain the "Further truncating" message for the given property and path; it asserts nothing itself. */
+    private static boolean assertExtendedTruncation(String prop, String val, String path, LogCustomizer customizer) {
+        // NOTE(review): callers in this patch ignore the result and no log statement shown in it emits this text - confirm intent
+        int limit = LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH;
+        // UTF-8 size of the first 'limit' chars of val, i.e. the size left after a char-based cut
+        int encodedLength = new BytesRef(val.substring(0, limit)).length;
+        String expected = MessageFormat.format("Further truncating property :dv{0} at path:[{1}] as length after encoding {2,number,#} > {3,number,#}", prop, path, encodedLength, limit);
+        return customizer.getLogs().contains(expected);
+    }
 }

[jackrabbit-oak] branch trunk updated: OAK-10384: Fix stripping of large indexed ordered properties (#1071)

Reply via email to