This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 10439902b6dd [SPARK-46954][SQL] XML: Wrap InputStreamReader with
BufferedReader
10439902b6dd is described below
commit 10439902b6ddc2d5826ed16f855a8429d9a15466
Author: Sandip Agarwala <[email protected]>
AuthorDate: Tue Feb 6 16:11:23 2024 +0900
[SPARK-46954][SQL] XML: Wrap InputStreamReader with BufferedReader
### What changes were proposed in this pull request?
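Wrap the `InputStreamReader` created in `XmlTokenizer` with a `BufferedReader`, and collapse the multi-line `org.apache.spark.sql.catalyst.util` import into a single import statement.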
### Why are the changes needed?
Reading the input through a `BufferedReader` more than doubles XML parsing performance (measured with manual perf testing).
### Does this PR introduce _any_ user-facing change?
Yes, performance improvement.
### How was this patch tested?
Existing unit tests and manual perf testing
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #45041 from sandip-db/xml_buffered_reader.
Authored-by: Sandip Agarwala <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/xml/StaxXmlParser.scala | 20 ++++----------------
1 file changed, 4 insertions(+), 16 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 74413bb8cbb2..66ec636d1a65 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -16,7 +16,7 @@
*/
package org.apache.spark.sql.catalyst.xml
-import java.io.{CharConversionException, FileNotFoundException, InputStream,
InputStreamReader, IOException, StringReader}
+import java.io.{BufferedReader, CharConversionException,
FileNotFoundException, InputStream, InputStreamReader, IOException,
StringReader}
import java.nio.charset.{Charset, MalformedInputException}
import java.text.NumberFormat
import java.util.Locale
@@ -37,20 +37,7 @@ import org.apache.spark.SparkUpgradeException
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.ExprUtils
-import org.apache.spark.sql.catalyst.util.{
- ArrayBasedMapData,
- BadRecordException,
- DateFormatter,
- DropMalformedMode,
- FailureSafeParser,
- GenericArrayData,
- MapData,
- ParseMode,
- PartialResultArrayException,
- PartialResultException,
- PermissiveMode,
- TimestampFormatter
-}
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData,
BadRecordException, DateFormatter, DropMalformedMode, FailureSafeParser,
GenericArrayData, MapData, ParseMode, PartialResultArrayException,
PartialResultException, PermissiveMode, TimestampFormatter}
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
import org.apache.spark.sql.catalyst.xml.StaxXmlParser.convertStream
import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -623,7 +610,8 @@ class StaxXmlParser(
class XmlTokenizer(
inputStream: InputStream,
options: XmlOptions) extends Logging {
- private var reader = new InputStreamReader(inputStream,
Charset.forName(options.charset))
+ private var reader = new BufferedReader(
+ new InputStreamReader(inputStream, Charset.forName(options.charset)))
private var currentStartTag: String = _
private var buffer = new StringBuilder()
private val startTag = s"<${options.rowTag}>"
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]