srowen commented on a change in pull request #31809:
URL: https://github.com/apache/spark/pull/31809#discussion_r592421763



##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -80,22 +80,36 @@ private[avro] case class AvroDataToCatalyst(
   }
 
   @transient private lazy val nullResultRow: Any = dataType match {
-      case st: StructType =>
-        val resultRow = new SpecificInternalRow(st.map(_.dataType))
-        for(i <- 0 until st.length) {
-          resultRow.setNullAt(i)
-        }
-        resultRow
+    case st: StructType =>

Review comment:
       While this whitespace change is OK, it's not necessary, and I'd just 
avoid it

##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -31,9 +31,9 @@ import org.apache.spark.sql.catalyst.util.{FailFastMode, 
ParseMode, PermissiveMo
 import org.apache.spark.sql.types._
 
 private[avro] case class AvroDataToCatalyst(
-    child: Expression,
-    jsonFormatSchema: String,
-    options: Map[String, String])
+                                             child: Expression,

Review comment:
       Don't change these indents

##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -80,22 +80,36 @@ private[avro] case class AvroDataToCatalyst(
   }
 
   @transient private lazy val nullResultRow: Any = dataType match {
-      case st: StructType =>
-        val resultRow = new SpecificInternalRow(st.map(_.dataType))
-        for(i <- 0 until st.length) {
-          resultRow.setNullAt(i)
-        }
-        resultRow
+    case st: StructType =>
+      val resultRow = new SpecificInternalRow(st.map(_.dataType))
+      for (i <- 0 until st.length) {
+        resultRow.setNullAt(i)
+      }
+      resultRow
 
-      case _ =>
-        null
-    }
+    case _ =>
+      null
+  }
 
+  private def getBinaryOffset(binary: Array[Byte]): Int = {
+    val MAGIC_ONE = 0xC3.toByte
+    val MAGIC_TWO = 0x01.toByte
+
+    binary match {

Review comment:
       I'd also add comments explaining what this magic-byte check is doing and why

##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -80,22 +80,36 @@ private[avro] case class AvroDataToCatalyst(
   }
 
   @transient private lazy val nullResultRow: Any = dataType match {
-      case st: StructType =>
-        val resultRow = new SpecificInternalRow(st.map(_.dataType))
-        for(i <- 0 until st.length) {
-          resultRow.setNullAt(i)
-        }
-        resultRow
+    case st: StructType =>
+      val resultRow = new SpecificInternalRow(st.map(_.dataType))
+      for (i <- 0 until st.length) {
+        resultRow.setNullAt(i)
+      }
+      resultRow
 
-      case _ =>
-        null
-    }
+    case _ =>
+      null
+  }
 
+  private def getBinaryOffset(binary: Array[Byte]): Int = {
+    val MAGIC_ONE = 0xC3.toByte
+    val MAGIC_TWO = 0x01.toByte
+
+    binary match {
+      case Array(MAGIC_ONE, MAGIC_TWO, _*) => 10
+      case _ => 0
+    }
+  }
 
   override def nullSafeEval(input: Any): Any = {
     val binary = input.asInstanceOf[Array[Byte]]
+    val offset = getBinaryOffset(binary)
     try {
-      decoder = DecoderFactory.get().binaryDecoder(binary, 0, binary.length, 
decoder)
+      decoder = DecoderFactory.get().binaryDecoder(
+        binary,
+        offset,
+        binary.length-offset, decoder

Review comment:
       Nit: add spaces around the `-` operator, i.e. `binary.length - offset`

##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -80,22 +80,36 @@ private[avro] case class AvroDataToCatalyst(
   }
 
   @transient private lazy val nullResultRow: Any = dataType match {
-      case st: StructType =>
-        val resultRow = new SpecificInternalRow(st.map(_.dataType))
-        for(i <- 0 until st.length) {
-          resultRow.setNullAt(i)
-        }
-        resultRow
+    case st: StructType =>
+      val resultRow = new SpecificInternalRow(st.map(_.dataType))
+      for (i <- 0 until st.length) {
+        resultRow.setNullAt(i)
+      }
+      resultRow
 
-      case _ =>
-        null
-    }
+    case _ =>
+      null
+  }
 
+  private def getBinaryOffset(binary: Array[Byte]): Int = {
+    val MAGIC_ONE = 0xC3.toByte
+    val MAGIC_TWO = 0x01.toByte
+
+    binary match {

Review comment:
       It might be more readable to just have an if statement here checking the 
first two bytes

##########
File path: 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala
##########
@@ -80,22 +80,36 @@ private[avro] case class AvroDataToCatalyst(
   }
 
   @transient private lazy val nullResultRow: Any = dataType match {
-      case st: StructType =>
-        val resultRow = new SpecificInternalRow(st.map(_.dataType))
-        for(i <- 0 until st.length) {
-          resultRow.setNullAt(i)
-        }
-        resultRow
+    case st: StructType =>
+      val resultRow = new SpecificInternalRow(st.map(_.dataType))
+      for (i <- 0 until st.length) {
+        resultRow.setNullAt(i)
+      }
+      resultRow
 
-      case _ =>
-        null
-    }
+    case _ =>
+      null
+  }
 
+  private def getBinaryOffset(binary: Array[Byte]): Int = {
+    val MAGIC_ONE = 0xC3.toByte

Review comment:
       Just inline these constants; naming them as local vals isn't useful here




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to