This is an automated email from the ASF dual-hosted git repository.
amitj pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
The following commit(s) were added to refs/heads/trunk by this push:
new 1e55c01b22 OAK-10384: Fix stripping of large indexed ordered
properties (#1071)
1e55c01b22 is described below
commit 1e55c01b22396239653549b3684bd9d71c606307
Author: Amit Jain <[email protected]>
AuthorDate: Mon Sep 11 10:17:33 2023 +0530
OAK-10384: Fix stripping of large indexed ordered properties (#1071)
- Truncate BytesRef value and handle surrogates correctly (Code from Thomas
Mueller)
---
.../plugins/index/lucene/LuceneDocumentMaker.java | 62 ++++++++++-
.../lucene/LuceneLargeStringPropertyTest.java | 116 +++++++++++++++++++--
2 files changed, 163 insertions(+), 15 deletions(-)
diff --git
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
index d41461c62e..06167fb03b 100644
---
a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
+++
b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java
@@ -20,6 +20,7 @@
package org.apache.jackrabbit.oak.plugins.index.lucene;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -287,11 +288,10 @@ public class LuceneDocumentMaker extends
FulltextDocumentMaker<Document> {
new
BytesRef(property.getValue(Type.BOOLEAN).toString()));
} else if (tag == Type.STRING.tag()) {
String stringValue = property.getValue(Type.STRING);
- if (stringValue.length() > STRING_PROPERTY_MAX_LENGTH){
- log.warn("Truncating property {} having length {} at
path:[{}] as it is > {}", name, stringValue.length(), this.path,
STRING_PROPERTY_MAX_LENGTH);
- stringValue = stringValue.substring(0,
STRING_PROPERTY_MAX_LENGTH);
- }
- f = new SortedDocValuesField(name, new BytesRef(stringValue));
+ // Truncate the value as lucene limits the length of a
SortedDocValueField string to
+ // STRING_PROPERTY_MAX_LENGTH(32766 bytes) and throws
exception if over the limit
+ f = new SortedDocValuesField(name, getTruncatedBytesRef(name,
stringValue, this.path,
+ STRING_PROPERTY_MAX_LENGTH));
}
if (f != null && includePropertyValue(property, 0, pd)) {
@@ -316,6 +316,58 @@ public class LuceneDocumentMaker extends
FulltextDocumentMaker<Document> {
return fieldAdded;
}
+ /**
+ * Returns a {@code BytesRef} object constructed from the given {@code
String} value and also truncates the length
+ * of the {@code BytesRef} object to the specified {@code maxLength},
ensuring that the multi-byte sequences are
+ * properly truncated.
+ *
+ * <p>The {@code BytesRef} object is created from the provided {@code
String} value using UTF-8 encoding. As a result, its length
+ * can exceed that of the {@code String} value, since Java strings use
UTF-16 encoding. This necessitates appropriate truncation.
+ *
+ * <p>Multi-byte sequences will be of the form {@code 11xxxxxx 10xxxxxx
10xxxxxx 10xxxxxx}.
+ * The method first truncates continuation bytes, which start with {@code
10} in binary. It then truncates the head byte, which
+ * starts with {@code 11}. Both truncation operations use a binary mask of
{@code 11000000}.
+ *
+ * @param prop the name of the property
+ * @param value the string property value to convert into a {@code
BytesRef} object
+ * @param path the path of the node
+ * @param maxLength the maximum length for the {@code BytesRef} object
+ * @return the truncated {@code BytesRef} object
+ */
+ protected static BytesRef getTruncatedBytesRef(String prop, String value,
String path, int maxLength) {
+ BytesRef ref = new BytesRef(value);
+ if (ref.length <= maxLength) {
+ return ref;
+ }
+
+ log.trace("Property {} at path:[{}] has value {}", prop, path, value);
+ log.info("Truncating property {} at path:[{}] as length after encoding
{} is > {} ",
+ prop, path, ref.length, maxLength);
+
+ int end = maxLength - 1;
+ // skip over tails of utf-8 multi-byte sequences (up to 3 bytes)
+ while ((ref.bytes[end] & 0b11000000) == 0b10000000) {
+ end--;
+ }
+ // remove one head of a utf-8 multi-byte sequence (at most 1)
+ if ((ref.bytes[end] & 0b11000000) == 0b11000000) {
+ end--;
+ }
+ byte[] truncatedBytes = Arrays.copyOf(ref.bytes, end + 1);
+ String truncated = new String(truncatedBytes, StandardCharsets.UTF_8);
+ ref = new BytesRef(truncated);
+ log.trace("Truncated property {} at path:[{}] to {}", prop, path,
ref.utf8ToString());
+
+ while (ref.length > maxLength) {
+ log.error("Truncation did not work: still {} bytes", ref.length);
+ // this may not properly work with unicode surrogates:
+ // it is an "emergency" procedure and should never happen
+ truncated = truncated.substring(0, truncated.length() - 10);
+ ref = new BytesRef(truncated);
+ }
+ return ref;
+ }
+
private FacetsConfig getFacetsConfig(){
return facetsConfigProvider.getFacetsConfig();
}
diff --git
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
index b49e26dded..9dda899abf 100644
---
a/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
+++
b/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneLargeStringPropertyTest.java
@@ -46,6 +46,7 @@ import org.apache.jackrabbit.oak.spi.state.NodeStore;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FilterDirectory;
+import org.apache.lucene.util.BytesRef;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
@@ -56,6 +57,7 @@ import org.slf4j.event.Level;
import java.io.File;
import java.io.IOException;
import java.text.MessageFormat;
+import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -66,6 +68,7 @@ import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFIN
import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.INDEX_DEFINITIONS_NODE_TYPE;
import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
import static
org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
+import static
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneDocumentMaker.getTruncatedBytesRef;
import static
org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_NODE;
import static org.junit.Assert.assertTrue;
@@ -212,19 +215,112 @@ public class LuceneLargeStringPropertyTest extends
AbstractQueryTest {
test.addChild("a").setProperty("propa", aVal);
test.addChild("b").setProperty("propa", bVal);
root.commit();
+ assertTruncation("propa", aVal, "/test/a", customizer);
+ // order of result should be first b and then a i.e. sorted on propa
+ assertQuery("select [jcr:path] from [nt:base] where contains(@propa,
'abcd') order by propa", asList("/test/b", "/test/a"));
+ }
- boolean truncationLogPresent = false;
- String failureLog = MessageFormat.format("Truncating property :dv{0}
having length {1,number,#} at path:[{2}] as it is > {3,number,#}",
- "propa", aVal.length(), "/test/a",
LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH);
- for (String log : customizer.getLogs()) {
- if (log.equals(failureLog)) {
- truncationLogPresent = true;
- break;
- }
- }
- assertTrue(truncationLogPresent);
+ /**
+ * Tests the truncation of large Unicode strings during indexing.
+ *
+ * <p>This test creates an index on the {@code propa} property and then
adds two nodes with large
+ * values for this property. The first node's {@code propa} property
contains a large string
+ * with some unicode characters at the start.
+ * The second node's {@code propa} property contains a large string that
ends
+ * with the Unicode character {@code "\uD800\uDF48"} in Java and takes up
4 bytes in UTF-8.
+ *
+ * <p>After committing the changes, the test asserts that the truncation
was performed correctly
+ * for both nodes. Also verifies that a query ordering the nodes by the
{@code propa} property
+ * returns the nodes in the correct order.
+ *
+ * @throws Exception if any error occurs during the test
+ */
+ @Test
+ public void truncateLargeUnicodeString() throws Exception {
+ Tree idx = createIndex("test1", of("propa"));
+ Tree tr = idx.addChild(PROP_NODE).addChild("propa");
+ tr.setProperty("ordered", true, Type.BOOLEAN); // in case of ordered
throws error that it can't index node
+ tr.setProperty("analyzed", true, Type.BOOLEAN);
+ idx.addChild(PROP_NODE).addChild("propa");
+ root.commit();
+
+ Tree test = root.getTree("/").addChild("test");
+ int length = LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH;
+ String generatedString = RandomStringUtils.random(length, true, true);
+
+ // Large String with unicode characters which makes the length longer
than the max length
+ String aVal ="abcd Mình nói tiếng Việt" + generatedString.substring(0,
length);
+
+ // Large String which ends with the unicode char `𐍈` represented by
"\uD800\uDF48".
+ //This char is represented by 4 bytes in UTF-8 but only with 2 bytes
in Java. The truncation will
+ // truncate the string `..xyz𐍈` to `..xyz`.
+ String bVal = "abcd " + generatedString.substring(0, length - 6) +
"\uD800\uDF48";
+
+ test.addChild("a").setProperty("propa", aVal);
+ test.addChild("b").setProperty("propa", bVal);
+ root.commit();
+
+ assertTruncation("propa", aVal, "/test/a", customizer);
+ assertExtendedTruncation("propa", aVal, "/test/a", customizer);
+ assertTruncation("propa", bVal, "/test/b", customizer);
+ assertExtendedTruncation("propa", bVal, "/test/b", customizer);
// order of result should be first b and then a i.e. sorted on propa
assertQuery("select [jcr:path] from [nt:base] where contains(@propa,
'abcd') order by propa", asList("/test/b", "/test/a"));
}
+ @Test
+ public void randomStringTruncation() {
+ Random r = new Random(1);
+ for (int i = 0; i < 100; i++) {
+ String x = randomUnicodeString(r, 5);
+ BytesRef ref = getTruncatedBytesRef("x", x, "/x", 5);
+ assertTrue(ref.length > 0 && ref.length <= 5);
+ //assert valid string
+ assertTrue(x.startsWith(ref.utf8ToString()));
+ }
+ }
+
+ private String randomUnicodeString(Random r, int len) {
+ StringBuilder buff = new StringBuilder();
+ for(int i=0; i<len; i++) {
+ // see https://en.wikipedia.org/wiki/UTF-8
+ switch (r.nextInt(6)) {
+ case 2:
+ // 2 UTF-8 bytes
+ buff.append('£');
+ break;
+ case 3:
+ // 3 UTF-8 bytes
+ buff.append('€');
+ break;
+ case 4:
+ // 4 UTF-8 bytes
+ buff.append("\uD800\uDF48");
+ break;
+ default:
+ // most cases:
+ // 1 UTF-8 byte (ASCII)
+ buff.append('$');
+ }
+ }
+ return buff.toString();
+ }
+
+ private static boolean assertTruncation(String prop, String val, String
path, LogCustomizer customizer) {
+ String errorMsg = "Truncating property :dv{0} having length
{1,number,#} at path:[{2}] as it is > {3,number,#}";
+ String failureLog = MessageFormat.format(errorMsg,
+ prop, val.length(), path,
LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH);
+
+ return customizer.getLogs().contains(failureLog);
+ }
+
+ private static boolean assertExtendedTruncation(String prop, String val,
String path, LogCustomizer customizer) {
+ String errorMsg = "Further truncating property :dv{0} at path:[{1}] as
length after encoding {2,number,#} > "
+ + "{3,number,#}";
+ BytesRef bytesRef = new BytesRef(val.substring(0,
LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH));
+ String failureLog = MessageFormat.format(errorMsg,
+ prop, path, bytesRef.length,
LuceneDocumentMaker.STRING_PROPERTY_MAX_LENGTH);
+
+ return customizer.getLogs().contains(failureLog);
+ }
}