dichlorodiphen commented on code in PR #53828:
URL: https://github.com/apache/spark/pull/53828#discussion_r2738385527


##########
connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala:
##########
@@ -232,38 +263,49 @@ private[sql] object ProtobufUtils extends Logging {
         throw QueryCompilationErrors.descriptorParseError(ex)
     }
     val fileDescriptorProtoIndex = createDescriptorProtoMap(fileDescriptorSet)
+
+    // Mutated across invocations of buildFileDescriptor.
+    val builtDescriptors = mutable.Map[String, Descriptors.FileDescriptor]()
     val fileDescriptorList: List[Descriptors.FileDescriptor] =
-      fileDescriptorSet.getFileList.asScala.map( fileDescriptorProto =>
-        buildFileDescriptor(fileDescriptorProto, fileDescriptorProtoIndex)
-      ).toList
+      fileDescriptorSet.getFileList.asScala.map { fileDescriptorProto =>
+        buildFileDescriptor(fileDescriptorProto, fileDescriptorProtoIndex, 
builtDescriptors)
+      }.distinctBy(_.getFullName).toList
     fileDescriptorList
   }
 
   /**
-   * Recursively constructs file descriptors for all dependencies for given
-   * FileDescriptorProto and return.
+   * Recursively constructs file descriptors for all dependencies for given 
FileDescriptorProto
+   * and return.
    */
   private def buildFileDescriptor(
-    fileDescriptorProto: FileDescriptorProto,
-    fileDescriptorProtoMap: Map[String, FileDescriptorProto]): 
Descriptors.FileDescriptor = {
-    val fileDescriptorList = 
fileDescriptorProto.getDependencyList().asScala.map { dependency =>
-      fileDescriptorProtoMap.get(dependency) match {
-        case Some(dependencyProto) =>
-          if (dependencyProto.getName == "google/protobuf/any.proto"
-            && dependencyProto.getPackage == "google.protobuf") {
-            // For Any, use the descriptor already included as part of the 
Java dependency.
-            // Without this, JsonFormat used for converting Any fields fails 
when
-            // an Any field in input is set to `Any.getDefaultInstance()`.
-            com.google.protobuf.AnyProto.getDescriptor
-            // Should we do the same for timestamp.proto and empty.proto?
-          } else {
-            buildFileDescriptor(dependencyProto, fileDescriptorProtoMap)
-          }
-        case None =>
-          throw 
QueryCompilationErrors.protobufDescriptorDependencyError(dependency)
-      }
-    }
-    Descriptors.FileDescriptor.buildFrom(fileDescriptorProto, 
fileDescriptorList.toArray)
+      fileDescriptorProto: FileDescriptorProto,
+      fileDescriptorProtoMap: Map[String, FileDescriptorProto],
+      builtDescriptors: mutable.Map[String, Descriptors.FileDescriptor])
+      : Descriptors.FileDescriptor = {
+    // Storing references to constructed descriptors is crucial because 
descriptors are compared

Review Comment:
   It actually doesn't lead to any issues in the existing implementation 
because we only use the full list of constructed descriptors in two places: 1) 
selecting the descriptor of interest by name and 2) building a TypeRegistry, 
which is only used for printing. The issue in the new implementation is that we 
use the descriptor list to construct both the extension registry and index from 
names to extension descriptors, so we run into validation failures when we call 
`DynamicMessage::hasField` in `ProtobufDeserializer`.
   
   TLDR: We didn't use the full list of descriptors before, so we couldn't 
encounter this error.
   



##########
connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala:
##########
@@ -42,20 +42,23 @@ object SchemaConverters extends Logging {
    */
   private[protobuf] def toSqlType(
       descriptor: Descriptor,
+      fullNamesToExtensions: Map[String, Seq[FieldDescriptor]] = Map.empty,

Review Comment:
   Missed this inconsistency, thanks for catching. I'd prefer to rename this to 
`fullNameToExtensions`, just to emphasize that we are using fully qualified 
names.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to