thomasmueller commented on code in PR #1716:
URL: https://github.com/apache/jackrabbit-oak/pull/1716#discussion_r1760824583
##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentCodec.java:
##########
@@ -49,44 +56,109 @@
* <li>Allows estimating the size of the document while reading it, which
will have a negligible overhead (as compared
* with doing an additional traverse of the object structure to compute the
size).</li>
* </ul>
- *
+ * <p>
* This class must be thread-safe, Mongo uses a single coded implementation
across multiple threads.
- *
*/
public class NodeDocumentCodec implements Codec<NodeDocument> {
+ private final static Logger LOG =
LoggerFactory.getLogger(NodeDocumentCodec.class);
+
+ public static final String
OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH =
"oak.indexer.pipelined.nodeDocument.filter.filteredPath";
+ public static final String
OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP =
"oak.indexer.pipelined.nodeDocument.filter.suffixesToSkip";
+ private final String filteredPath =
ConfigHelper.getSystemPropertyAsString(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH,
"");
+ private final List<String> suffixesToSkip =
ConfigHelper.getSystemPropertyAsStringList(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP,
"",';');
+
// The estimated size is stored in the NodeDocument itself
public final static String SIZE_FIELD = "_ESTIMATED_SIZE_";
+
+ private static class NodeDocumentDecoderContext {
+ long docsDecoded = 0;
+ long dataDownloaded = 0;
+ int estimatedSizeOfCurrentObject = 0;
+ }
+
+ private final NodeDocument emptyNodeDocument;
+
private final MongoDocumentStore store;
private final Collection<NodeDocument> collection;
private final BsonTypeCodecMap bsonTypeCodecMap;
private final DecoderContext decoderContext =
DecoderContext.builder().build();
-
private final Codec<String> stringCoded;
private final Codec<Long> longCoded;
private final Codec<Boolean> booleanCoded;
+ private final NodeDocumentFilter fieldFilter = new
NodeDocumentFilter(filteredPath, suffixesToSkip);
+
+ // Statistics
+ private final AtomicLong totalDocsDecoded = new AtomicLong(0);
+ private final AtomicLong totalDataDownloaded = new AtomicLong(0);
+ private final ThreadLocal<NodeDocumentDecoderContext> perThreadContext =
ThreadLocal.withInitial(NodeDocumentDecoderContext::new);
+
public NodeDocumentCodec(MongoDocumentStore store,
Collection<NodeDocument> collection, CodecRegistry defaultRegistry) {
this.store = store;
this.collection = collection;
this.bsonTypeCodecMap = new BsonTypeCodecMap(new BsonTypeClassMap(),
defaultRegistry);
+ this.emptyNodeDocument = collection.newDocument(store);
// Retrieve references to the most commonly used codecs, to avoid the
map lookup in the common case
this.stringCoded = (Codec<String>)
bsonTypeCodecMap.get(BsonType.STRING);
this.longCoded = (Codec<Long>) bsonTypeCodecMap.get(BsonType.INT64);
this.booleanCoded = (Codec<Boolean>)
bsonTypeCodecMap.get(BsonType.BOOLEAN);
}
+ /**
+ * Skipping over values in the BSON file is faster than reading them.
Skipping is done by advancing a pointer in
+ * an internal buffer, while reading requires converting them to a Java
data type (typically String).
+ */
+ private void skipUntilEndOfDocument(BsonReader reader) {
+ while (true) {
+ BsonType bsonType = reader.readBsonType();
+ if (bsonType == BsonType.END_OF_DOCUMENT) {
+ break;
+ }
+ reader.skipName();
+ reader.skipValue();
+ }
Review Comment:
Sorry, even simpler:
```suggestion
while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
reader.skipName();
reader.skipValue();
}
```
##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java:
##########
@@ -48,4 +53,18 @@ public static boolean getSystemPropertyAsBoolean(String
name, boolean defaultVal
LOG.info("Config {}={}", name, value);
return value;
}
+
+ /**
+ * white space at the start/end of the string or at the start/end of the
parts delimited by separator are trimmed
Review Comment:
Yes this Javadoc helps here.
##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentCodec.java:
##########
@@ -49,44 +56,109 @@
* <li>Allows estimating the size of the document while reading it, which
will have a negligible overhead (as compared
* with doing an additional traverse of the object structure to compute the
size).</li>
* </ul>
- *
+ * <p>
* This class must be thread-safe, Mongo uses a single coded implementation
across multiple threads.
- *
*/
public class NodeDocumentCodec implements Codec<NodeDocument> {
+ private final static Logger LOG =
LoggerFactory.getLogger(NodeDocumentCodec.class);
+
+ public static final String
OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH =
"oak.indexer.pipelined.nodeDocument.filter.filteredPath";
+ public static final String
OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP =
"oak.indexer.pipelined.nodeDocument.filter.suffixesToSkip";
+ private final String filteredPath =
ConfigHelper.getSystemPropertyAsString(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH,
"");
+ private final List<String> suffixesToSkip =
ConfigHelper.getSystemPropertyAsStringList(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP,
"",';');
+
// The estimated size is stored in the NodeDocument itself
public final static String SIZE_FIELD = "_ESTIMATED_SIZE_";
+
+ private static class NodeDocumentDecoderContext {
+ long docsDecoded = 0;
+ long dataDownloaded = 0;
+ int estimatedSizeOfCurrentObject = 0;
+ }
+
+ private final NodeDocument emptyNodeDocument;
+
private final MongoDocumentStore store;
private final Collection<NodeDocument> collection;
private final BsonTypeCodecMap bsonTypeCodecMap;
private final DecoderContext decoderContext =
DecoderContext.builder().build();
-
private final Codec<String> stringCoded;
private final Codec<Long> longCoded;
private final Codec<Boolean> booleanCoded;
+ private final NodeDocumentFilter fieldFilter = new
NodeDocumentFilter(filteredPath, suffixesToSkip);
+
+ // Statistics
+ private final AtomicLong totalDocsDecoded = new AtomicLong(0);
+ private final AtomicLong totalDataDownloaded = new AtomicLong(0);
+ private final ThreadLocal<NodeDocumentDecoderContext> perThreadContext =
ThreadLocal.withInitial(NodeDocumentDecoderContext::new);
Review Comment:
I'm not a very big fan of using ThreadLocal because of the problems they can
cause. One is out-of-memory, another is that some web servers, and possibly
also OSGi, can try to reset them... I think here it is OK. But I wouldn't want
to use ThreadLocal in other parts of Oak. Just FYI
##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java:
##########
@@ -48,4 +53,18 @@ public static boolean getSystemPropertyAsBoolean(String
name, boolean defaultVal
LOG.info("Config {}={}", name, value);
return value;
}
+
+ /**
+ * white space at the start/end of the string or at the start/end of the
parts delimited by separator are trimmed
+ */
+ public static List<String> getSystemPropertyAsStringList(String name,
String defaultValue, char separator) {
+ String result = System.getProperty(name, defaultValue);
+ List<String> parts = splitString(result, separator);
+ LOG.info("Config {}={}", name, parts);
+ return parts;
+ }
+
+ private static List<String> splitString(String str, char separator) {
Review Comment:
Hm, I would have hoped to make it public as well, add a Javadoc, and
specially a unit test... But OK, it's a short method, and the risk is low...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]