This is an automated email from the ASF dual-hosted git repository. thomasm pushed a commit to branch OAK-12133 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 9ba6d0b8acb31d53e3247986152f33b8bfc46c91 Author: Thomas Mueller <[email protected]> AuthorDate: Wed Mar 11 11:22:04 2026 +0100 OAK-12133 With the segment store, binary properties can not be aggregated --- .../plugins/index/lucene/LuceneDocumentMaker.java | 9 +- .../run/query/BinaryStorageWithBlobStoreTest.java | 159 ++++++++++++++ .../IndexRegexPropertyWithBinaryExcludedTest.java | 228 +++++++++++++++++++++ .../index/elastic/index/ElasticDocumentMaker.java | 7 +- .../oak/segment/SegmentPropertyState.java | 35 +++- 5 files changed, 434 insertions(+), 4 deletions(-) diff --git a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java index 6192b54ae1..97756f04cb 100644 --- a/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java +++ b/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneDocumentMaker.java @@ -131,11 +131,16 @@ public class LuceneDocumentMaker extends FulltextDocumentMaker<Document> { f = new DoubleField(pname, property.getValue(Type.DOUBLE, i), Field.Store.NO); } else if (tag == Type.BOOLEAN.tag()) { f = new StringField(pname, property.getValue(Type.BOOLEAN, i).toString(), Field.Store.NO); + } else if (tag == Type.BINARY.tag()) { + // ignore - never call getValue(Type.STRING) on a binary (see OAK-12133) + f = null; } else { f = new StringField(pname, property.getValue(Type.STRING, i), Field.Store.NO); } - doc.add(f); + if (f != null) { + doc.add(f); + } } @Override @@ -294,7 +299,7 @@ public class LuceneDocumentMaker extends FulltextDocumentMaker<Document> { new BytesRef(property.getValue(Type.BOOLEAN).toString())); } else if (tag == Type.STRING.tag()) { String stringValue = property.getValue(Type.STRING); - // Truncate the value as lucene limits the length of a SortedDocValueField string to + // Truncate the value as lucene limits the length of a SortedDocValueField string to // STRING_PROPERTY_MAX_LENGTH(32766 bytes) and throws exception if over the limit f = new SortedDocValuesField(name, getTruncatedBytesRef(name, stringValue, this.path, STRING_PROPERTY_MAX_LENGTH)); diff --git a/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/BinaryStorageWithBlobStoreTest.java b/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/BinaryStorageWithBlobStoreTest.java new file mode 100644 index 0000000000..6550df1ab6 --- /dev/null +++ b/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/BinaryStorageWithBlobStoreTest.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.oak.run.query; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; + +import javax.jcr.Node; +import javax.jcr.Property; +import javax.jcr.Repository; +import javax.jcr.Session; +import javax.jcr.SimpleCredentials; + +import org.apache.jackrabbit.api.JackrabbitRepository; +import org.apache.jackrabbit.oak.Oak; +import org.apache.jackrabbit.oak.jcr.Jcr; +import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore; +import org.apache.jackrabbit.oak.segment.SegmentNodeStoreBuilders; +import org.apache.jackrabbit.oak.segment.file.FileStore; +import org.apache.jackrabbit.oak.segment.file.FileStoreBuilder; +import org.apache.jackrabbit.oak.segment.file.InvalidFileStoreVersionException; +import org.apache.jackrabbit.oak.spi.blob.BlobStore; +import org.apache.jackrabbit.oak.spi.security.user.UserConstants; +import org.apache.jackrabbit.core.data.FileDataStore; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +/** + * Test to verify that binary properties throw IllegalStateException with a clear error message + * when read as STRING - this happens when they're stored externally in BlobStore. + * Verifies the exception message includes the property name and helpful guidance (OAK-12133). + */ +public class BinaryStorageWithBlobStoreTest { + + @Rule + public TemporaryFolder folder = new TemporaryFolder(new File("target")); + + private Repository repository; + private FileStore fileStore; + private Session session; + private BlobStore blobStore; + + @Before + public void setup() throws Exception { + // Create external BlobStore + FileDataStore fds = new FileDataStore(); + fds.setMinRecordLength(0); // Store ALL binaries externally, even tiny ones + fds.init(folder.newFolder("blobstore").getAbsolutePath()); + blobStore = new DataStoreBlobStore(fds); + + fileStore = createFileStore(); + repository = createRepository(fileStore, blobStore); + session = repository.login(new SimpleCredentials(UserConstants.DEFAULT_ADMIN_ID, + UserConstants.DEFAULT_ADMIN_ID.toCharArray())); + } + + @After + public void tearDown() { + if (session != null) { + session.logout(); + } + if (repository instanceof JackrabbitRepository) { + ((JackrabbitRepository) repository).shutdown(); + } + if (fileStore != null) { + fileStore.close(); + } + } + + private FileStore createFileStore() throws IOException, InvalidFileStoreVersionException { + return FileStoreBuilder.fileStoreBuilder(folder.getRoot()) + .withBlobStore(blobStore) + .withBinariesInlineThreshold(10) // Force binaries > 10 bytes to be external! + .build(); + } + + private Repository createRepository(FileStore fileStore, BlobStore blobStore) { + return new Jcr(new Oak(SegmentNodeStoreBuilders.builder(fileStore).build())) + .createRepository(); + } + + @Test + public void testSmallBinaryAsStringWithExternalBlobStore() throws Exception { + Node test = session.getRootNode().addNode("test", "nt:unstructured"); + + byte[] data = new byte[45]; + for (int i = 0; i < data.length; i++) { + data[i] = (byte) i; + } + + test.setProperty("binary1", session.getValueFactory().createBinary( + new ByteArrayInputStream(data))); + + session.save(); + + Property prop = test.getProperty("binary1"); + + try { + prop.getString(); + fail("Expected IllegalStateException when reading binary property as string"); + } catch (IllegalStateException e) { + String message = e.getMessage(); + assertTrue("Exception message should mention the property name 'binary1', but was: " + message, + message.contains("binary1")); + assertTrue("Exception message should mention 'Attempting to read binary property', but was: " + message, + message.contains("Attempting to read binary property")); + assertTrue("Exception message should mention 'can fail if the binary is stored externally', but was: " + message, + message.contains("can fail if the binary is stored externally")); + } + } + + @Test + public void testBinaryAsStringWithExternalBlobStore() throws Exception { + Node test = session.getRootNode().addNode("test2", "nt:unstructured"); + + byte[] data = new byte[17 * 1024]; + + test.setProperty("binary2", session.getValueFactory().createBinary( + new ByteArrayInputStream(data))); + + session.save(); + + Property prop = test.getProperty("binary2"); + + try { + prop.getString(); + fail("Expected IllegalStateException when reading binary property as string"); + } catch (IllegalStateException e) { + String message = e.getMessage(); + assertTrue("Exception message should mention the property name 'binary2', but was: " + message, + message.contains("binary2")); + assertTrue("Exception message should mention 'Attempting to read binary property', but was: " + message, + message.contains("Attempting to read binary property")); + assertTrue("Exception message should mention 'can fail if the binary is stored externally', but was: " + message, + message.contains("can fail if the binary is stored externally")); + } + } +} diff --git a/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/IndexRegexPropertyWithBinaryExcludedTest.java b/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/IndexRegexPropertyWithBinaryExcludedTest.java new file mode 100644 index 0000000000..984743a36f --- /dev/null +++ b/oak-run/src/test/java/org/apache/jackrabbit/oak/run/query/IndexRegexPropertyWithBinaryExcludedTest.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jackrabbit.oak.run.query; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import javax.jcr.Node; +import javax.jcr.PropertyType; +import javax.jcr.Repository; +import javax.jcr.RepositoryException; +import javax.jcr.Session; +import javax.jcr.SimpleCredentials; +import javax.jcr.query.Query; +import javax.jcr.query.QueryManager; +import javax.jcr.query.QueryResult; +import javax.jcr.query.Row; +import javax.jcr.query.RowIterator; + +import org.apache.jackrabbit.api.JackrabbitRepository; +import org.apache.jackrabbit.oak.Oak; +import org.apache.jackrabbit.oak.jcr.Jcr; +import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider; +import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider; +import org.apache.jackrabbit.oak.segment.SegmentNodeStoreBuilders; +import org.apache.jackrabbit.oak.segment.file.FileStore; +import org.apache.jackrabbit.oak.segment.file.FileStoreBuilder; +import org.apache.jackrabbit.oak.segment.file.InvalidFileStoreVersionException; +import org.apache.jackrabbit.oak.spi.blob.BlobStore; +import org.apache.jackrabbit.oak.spi.security.user.UserConstants; +import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore; +import org.apache.jackrabbit.core.data.FileDataStore; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +/** + * Tests indexing of regex property patterns with binary type exclusion. + * Uses 17KB binaries to ensure they're stored externally in BlobStore + * (above the default binariesInlineThreshold of ~16KB). + */ +public class IndexRegexPropertyWithBinaryExcludedTest { + + @Rule + public TemporaryFolder folder = new TemporaryFolder(new File("target")); + + private Repository repository; + private FileStore fileStore; + private Session session; + private QueryManager queryManager; + private BlobStore blobStore; + + @Before + public void setup() throws Exception { + FileDataStore fds = new FileDataStore(); + fds.setMinRecordLength(4096); + fds.init(folder.newFolder("blobstore").getAbsolutePath()); + blobStore = new DataStoreBlobStore(fds); + + fileStore = createFileStore(); + repository = createRepository(fileStore); + session = repository.login(new SimpleCredentials(UserConstants.DEFAULT_ADMIN_ID, + UserConstants.DEFAULT_ADMIN_ID.toCharArray())); + queryManager = session.getWorkspace().getQueryManager(); + } + + @After + public void tearDown() { + if (session != null) { + session.logout(); + } + if (repository instanceof JackrabbitRepository) { + ((JackrabbitRepository) repository).shutdown(); + } + if (fileStore != null) { + fileStore.close(); + } + } + + private FileStore createFileStore() throws IOException, InvalidFileStoreVersionException { + return FileStoreBuilder.fileStoreBuilder(folder.getRoot()) + .withBlobStore(blobStore) + .build(); + } + + private Repository createRepository(FileStore fileStore) { + LuceneIndexProvider provider = new LuceneIndexProvider(); + LuceneIndexEditorProvider editorProvider = new LuceneIndexEditorProvider(); + + return new Jcr(new Oak(SegmentNodeStoreBuilders.builder(fileStore).build())) + .with((org.apache.jackrabbit.oak.spi.query.QueryIndexProvider) provider) + .with((org.apache.jackrabbit.oak.spi.commit.Observer) provider) + .with(editorProvider) + .withAsyncIndexing("async", 1) + .createRepository(); + } + + @Test + public void indexRegexPropertyWithBinaryExcluded() throws Exception { + // Create index definition + Node oakIndex = session.getRootNode().getNode("oak:index"); + Node index = oakIndex.addNode("testRegexBinary", "oak:QueryIndexDefinition"); + index.setProperty("type", "lucene"); + index.setProperty("async", new String[]{"async", "nrt"}, PropertyType.STRING); + index.setProperty("compatVersion", 2); + index.setProperty("tags", new String[]{"abc"}, PropertyType.STRING); + index.setProperty("selectionPolicy", "tag"); + index.setProperty("includedPaths", "/content"); + index.setProperty("queryPaths", "/content"); + + // indexRules + Node indexRules = index.addNode("indexRules", "nt:unstructured"); + Node ntBase = indexRules.addNode("nt:base", "nt:unstructured"); + ntBase.setProperty("includePropertyTypes", + new String[]{"String", "Name"}, PropertyType.STRING); + Node properties = ntBase.addNode("properties", "nt:unstructured"); + + // regex property for child/.* + Node childProp = properties.addNode("childProp", "nt:unstructured"); + childProp.setProperty("name", ".*"); + childProp.setProperty("propertyIndex", true); + childProp.setProperty("isRegexp", true); + childProp.setProperty("nodeScopeIndex", true); + childProp.setProperty("analyzed", true); + childProp.setProperty("useInExcerpt", true); + childProp.setProperty("useInSpellcheck", true); + + session.save(); + + // Test content + Node content = session.getRootNode().addNode("content", "nt:unstructured"); + + Node one = content.addNode("one", "nt:unstructured"); + one.setProperty("title", "test content"); + Node oneChild = one.addNode("child", "nt:unstructured"); + oneChild.setProperty("data", "searchable text"); + // A 17KB binary to ensure it's stored externally in BlobStore + oneChild.setProperty("binary", session.getValueFactory().createBinary( + new ByteArrayInputStream(createLargeBinaryData(17 * 1024)))); + + Node two = content.addNode("two", "nt:unstructured"); + two.setProperty("title", "another test"); + Node twoChild = two.addNode("child", "nt:unstructured"); + twoChild.setProperty("data", "searchable text"); + twoChild.setProperty("binary", session.getValueFactory().createBinary( + new ByteArrayInputStream(createLargeBinaryData(17 * 1024)))); + + session.save(); + + // Query + String xpath = "/jcr:root/content//*[jcr:contains(., 'searchable')] option(index tag abc, traversal fail)"; + + // Wait for async indexing with retry + List<String> results = null; + AssertionError lastError = null; + for (int attempt = 0; attempt < 300; attempt++) { + try { + Query query = queryManager.createQuery("explain " + xpath, "xpath"); + RowIterator rowIt = query.execute().getRows(); + while (rowIt.hasNext()) { + assertTrue(rowIt.nextRow().toString().contains("lucene:testRegexBinary")); + } + query = queryManager.createQuery(xpath, "xpath"); + QueryResult result = query.execute(); + results = getResultPaths(result); + + // Verify we found both child nodes + assertEquals("Expected 2 results", 2, results.size()); + assertTrue("Expected /content/one/child in results", results.contains("/content/one/child")); + assertTrue("Expected /content/two/child in results", results.contains("/content/two/child")); + + // Success - exit retry loop + return; + } catch (AssertionError e) { + lastError = e; + Thread.sleep(100); + } + } + + if (lastError != null) { + throw new AssertionError("Query did not return expected results after 30 attempts. Last results: " + results, lastError); + } + } + + /** + * Create a large binary data array of specified size. + * Uses a simple pattern to fill the data. + */ + private byte[] createLargeBinaryData(int sizeInBytes) { + byte[] data = new byte[sizeInBytes]; + for (int i = 0; i < sizeInBytes; i++) { + data[i] = (byte) (i % 256); + } + return data; + } + + private List<String> getResultPaths(QueryResult result) throws RepositoryException { + List<String> paths = new ArrayList<>(); + RowIterator rows = result.getRows(); + while (rows.hasNext()) { + Row row = rows.nextRow(); + paths.add(row.getPath()); + } + return paths; + } +} diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java index dd856dfd07..3929fe655e 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java @@ -190,11 +190,16 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker<ElasticDocument> f = property.getValue(Type.DOUBLE, i); } else if (tag == Type.BOOLEAN.tag()) { f = property.getValue(Type.BOOLEAN, i).toString(); + } else if (tag == Type.BINARY.tag()) { + // ignore - never call getValue(Type.STRING) on a binary (see OAK-12133) + f = null; } else { f = property.getValue(Type.STRING, i); } - doc.addProperty(fieldName, f); + if (f != null) { + doc.addProperty(fieldName, f); + } } catch (Exception e) { if (!LOG_SILENCER.silence(LOG_KEY_COULD_NOT_CONVERT_PROPERTY)) { LOG.warn( diff --git a/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/SegmentPropertyState.java b/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/SegmentPropertyState.java index 68b5e18f81..d5bfffda1c 100644 --- a/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/SegmentPropertyState.java +++ b/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/SegmentPropertyState.java @@ -49,10 +49,13 @@ import javax.jcr.PropertyType; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; import org.apache.jackrabbit.oak.commons.conditions.Validate; +import org.apache.jackrabbit.oak.commons.log.LogSilencer; import org.apache.jackrabbit.oak.plugins.memory.AbstractPropertyState; import org.apache.jackrabbit.oak.plugins.value.Conversions; import org.apache.jackrabbit.oak.plugins.value.Conversions.Converter; import org.jetbrains.annotations.NotNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * A property, which can read a value or list record from a segment. It @@ -62,6 +65,10 @@ import org.jetbrains.annotations.NotNull; * of type "LIST" (for arrays). */ public class SegmentPropertyState extends Record implements PropertyState { + private static final Logger LOG = LoggerFactory.getLogger(SegmentPropertyState.class); + + private static final LogSilencer LOG_SILENCER = new LogSilencer(10_000, 64); + @NotNull private final SegmentReader reader; @@ -190,7 +197,33 @@ public class SegmentPropertyState extends Record implements PropertyState { return (T) reader.readBlob(id); // load binaries lazily } - String value = reader.readString(id); + // OAK-12133: Detect when trying to read a binary property as a string + Type<?> actualType = getType(); + if (actualType.isArray()) { + actualType = actualType.getBaseType(); + } + String message = null; + if (actualType == BINARY && type != BINARY) { + message = String.format( + "Attempting to read binary property '%s' as %s. " + + "This can fail if the binary is stored externally. " + + "Binary properties should be checked before getValue() is called.", + name, type); + if (!LOG_SILENCER.silence(name)) { + LOG.warn(message, new Exception("Stack trace")); + } + } + + String value; + try { + value = reader.readString(id); + } catch (IllegalStateException e) { + if (message != null && e.getMessage() != null && + e.getMessage().contains("possibly trying to read a BLOB using getString")) { + throw new IllegalStateException(message, e); + } + throw e; + } if (type == STRING || type == URI || type == DATE || type == NAME || type == PATH || type == REFERENCE || type == WEAKREFERENCE) {
