This is an automated email from the ASF dual-hosted git repository.
slawrence pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/daffodil.git
The following commit(s) were added to refs/heads/main by this push:
new 67eef7e04 mmap files when possible to improve CLI parse performance
67eef7e04 is described below
commit 67eef7e042e2da50cdd4e663c05eddc34219a580
Author: Steve Lawrence <[email protected]>
AuthorDate: Wed Aug 21 12:14:08 2024 -0400
mmap files when possible to improve CLI parse performance
Daffodil currently supports two different input sources: a
BucketingInputSource backed by an InputStream and ByteBufferInputSource
backed by a ByteBuffer. The CLI currently always uses the
BucketingInputSource because the ByteBufferInputSource does not support
stdin or files larger than 2GB. Although the gap is closing due to other
optimizations, the BucketingInputSource still has overhead compared to
ByteBufferInputSource due to added complexity.
This changes the CLI logic to use a ByteBufferInputSource where possible
(parsing files <= 2GB) using mmap and a MappedByteBuffer to efficiently
create a ByteBuffer.
Basic testing shows about a 5% performance increase over the
BucketingInputSource for a large file with many small reads.
Also add Java/Scala API documentation explaining performance
characteristics of the different input source constructors and example
code for using mmap vs FileInputStream.
DAFFODIL-2921
---
.../main/scala/org/apache/daffodil/cli/Main.scala | 22 +++++++++++++++----
.../japi/io/InputSourceDataInputStream.scala | 25 +++++++++++++++++++++-
.../sapi/io/InputSourceDataInputStream.scala | 25 ++++++++++++++++++++++
3 files changed, 67 insertions(+), 5 deletions(-)
diff --git a/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
b/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
index 604570365..8aec2dcb7 100644
--- a/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
+++ b/daffodil-cli/src/main/scala/org/apache/daffodil/cli/Main.scala
@@ -25,7 +25,10 @@ import java.io.PrintStream
import java.net.URI
import java.nio.ByteBuffer
import java.nio.channels.Channels
+import java.nio.channels.FileChannel
+import java.nio.file.Files
import java.nio.file.Paths
+import java.nio.file.StandardOpenOption
import java.util.Scanner
import java.util.concurrent.Executors
import javax.xml.parsers.SAXParserFactory
@@ -1165,13 +1168,24 @@ class Main(
case Some(processor) => {
Assert.invariant(!processor.isError)
val input = parseOpts.infile.toOption match {
- case Some("-") | None => STDIN
+ case Some("-") | None => InputSourceDataInputStream(STDIN)
case Some(file) => {
- val f = new File(file)
- new FileInputStream(f)
+ // for files <= 2GB, use a mapped byte buffer to avoid the
overhead related to
+ // the BucketingInputSource. Larger files cannot be mapped so
we cannot avoid it
+ val path = Paths.get(file)
+ val size = Files.size(path)
+ if (size <= Int.MaxValue) {
+ val fc = FileChannel.open(path, StandardOpenOption.READ)
+ val bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, size)
+ fc.close() // we no longer need the channel now that we've
mapped it
+ InputSourceDataInputStream(bb)
+ } else {
+ val is = Files.newInputStream(path, StandardOpenOption.READ)
+ InputSourceDataInputStream(is)
+ }
}
}
- using(InputSourceDataInputStream(input)) { inStream =>
+ using(input) { inStream =>
val output = parseOpts.output.toOption match {
case Some("-") | None => STDOUT
case Some(file) => new FileOutputStream(file)
diff --git
a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/io/InputSourceDataInputStream.scala
b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/io/InputSourceDataInputStream.scala
index 475365912..7ae369037 100644
---
a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/io/InputSourceDataInputStream.scala
+++
b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/io/InputSourceDataInputStream.scala
@@ -26,7 +26,30 @@ import org.apache.daffodil.io.{ InputSourceDataInputStream
=> SInputSourceDataIn
* Provides Daffodil with byte data from an InputStream, ByteBuffer, or byte
* Array.
*
- * @param dis the underlying Scala InputSourceDataInputStream
+ * Note that the InputStream variant has potential overhead due to streaming
capabilities and
+ * support for files greater than 2GB. In some cases, better performance might
come from using
+ * the byte array or ByteBuffer variants instead. For example, if your data is
already in a byte
+ * array, one should use the Array[Byte] or ByteBuffer variants instead of
wrapping it in a
+ * ByteArrayInputStream. As another example, instead of using a
FileInputStream like this:
+ *
+ * {{{
+ * Path path = Paths.get(file);
+ * InputStream fis = Files.newInputStream(path);
+ * InputSourceDataInputStream input = new InputSourceDataInputStream(fis);
+ * }}}
+ *
+ * You might consider mapping the file to a MappedByteBuffer like below,
keeping in mind that
+ * MappedByteBuffers have size limitations and potentially different
performance characteristics
+ * depending on the file size and system--it may not always be faster than
above.
+ *
+ * {{{
+ * Path path = Paths.get(file);
+ * long size = Files.size(path);
+ * FileChannel fc = FileChannel.open(path, StandardOpenOption.READ);
+ * ByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, size);
+ * fc.close();
+ * InputSourceDataInputStream input = new InputSourceDataInputStream(bb);
+ * }}}
*/
class InputSourceDataInputStream private[japi] (
private[japi] val dis: SInputSourceDataInputStream
diff --git
a/daffodil-sapi/src/main/scala/org/apache/daffodil/sapi/io/InputSourceDataInputStream.scala
b/daffodil-sapi/src/main/scala/org/apache/daffodil/sapi/io/InputSourceDataInputStream.scala
index 7047592f5..55e67fda1 100644
---
a/daffodil-sapi/src/main/scala/org/apache/daffodil/sapi/io/InputSourceDataInputStream.scala
+++
b/daffodil-sapi/src/main/scala/org/apache/daffodil/sapi/io/InputSourceDataInputStream.scala
@@ -26,6 +26,31 @@ import org.apache.daffodil.io.{ InputSourceDataInputStream
=> SInputSourceDataIn
* Provides Daffodil with byte data from an InputStream, ByteBuffer, or byte
* Array.
*
+ * Note that the InputStream variant has potential overhead due to streaming
capabilities and
+ * support for files greater than 2GB. In some cases, better performance might
come from using
+ * the byte array or ByteBuffer variants instead. For example, if your data is
already in a byte
+ * array, one should use the Array[Byte] or ByteBuffer variants instead of
wrapping it in a
+ * ByteArrayInputStream. As another example, instead of using a
FileInputStream like this:
+ *
+ * {{{
+ * val path = Paths.get(file)
+ * val fis = Files.newInputStream(path)
+ * val input = InputSourceDataInputStream(fis)
+ * }}}
+ *
+ * You might consider mapping the file to a MappedByteBuffer like below,
keeping in mind that
+ * MappedByteBuffers have size limitations and potentially different
performance characteristics
+ * depending on the file size and system--it may not always be faster than
above.
+ *
+ * {{{
+ * val path = Paths.get(file)
+ * val size = Files.size(path)
+ * val fc = FileChannel.open(path, StandardOpenOption.READ)
+ * val bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, size)
+ * fc.close()
+ * val input = InputSourceDataInputStream(bb)
+ * }}}
+ *
* @param dis the underlying Scala InputSourceDataInputStream
*/
class InputSourceDataInputStream private[sapi] (