sandip-db commented on code in PR #43722:
URL: https://github.com/apache/spark/pull/43722#discussion_r1388794901
##########
sql/api/src/main/scala/org/apache/spark/sql/types/StructType.scala:
##########
@@ -588,7 +588,7 @@ object StructType extends AbstractDataType {
.map { case rightField @ StructField(rightName, rightType,
rightNullable, _) =>
try {
leftField.copy(
- dataType = merge(leftType, rightType),
+ dataType = merge(leftType, rightType, caseSensitive),
Review Comment:
Can this be a separate PR? It would be easy to backport.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -198,30 +207,66 @@ private[sql] class XmlInferSchema(options: XmlOptions)
extends Serializable with
*/
private def inferObject(
parser: XMLEventReader,
+ caseSensitive: Boolean,
rootAttributes: Array[Attribute] = Array.empty): DataType = {
val builder = ArrayBuffer[StructField]()
val nameToDataType = collection.mutable.Map.empty[String,
ArrayBuffer[DataType]]
+ // Initialize a map to hold field names with case sensitivity based on
configuration.
+ // The map is only used in case *insensitive* mode
+ var fieldNames = if (caseSensitive) {
Review Comment:
Instead of using another `Map`, use a `TreeMap` or define a new class
`CaseInsensitiveString` as shown
[here](https://stackoverflow.com/questions/8236945/case-insensitive-string-as-hashmap-key).
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala:
##########
@@ -274,7 +277,11 @@ class StaxXmlParser(
val convertedValuesMap = collection.mutable.Map.empty[String, Any]
val valuesMap =
StaxXmlParserUtils.convertAttributesToValuesMap(attributes, options)
valuesMap.foreach { case (f, v) =>
- val nameToIndex = schema.map(_.name).zipWithIndex.toMap
+ val nameToIndex = if (caseSensitive) {
+ schema.map(_.name).zipWithIndex.toMap
+ } else {
+ CaseInsensitiveMap(schema.map(_.name).zipWithIndex.toMap)
+ }
Review Comment:
Deduplicate this code by defining a function `getFieldNameToIndex(schema:
StructType)`.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -231,21 +276,24 @@ private[sql] class XmlInferSchema(options: XmlOptions)
extends Serializable with
nestedBuilder += StructField(options.valueTag, dt, nullable =
true)
valuesMap.foreach {
case (f, v) =>
- nestedBuilder += StructField(f, inferFrom(v), nullable =
true)
+ nestedBuilder +=
+ StructField(getCaseSensitiveName(f), inferFrom(v),
nullable = true)
}
StructType(nestedBuilder.sortBy(_.name).toArray)
case dt: DataType => dt
}
// Add the field and datatypes so that we can check if this is
ArrayType.
val field = StaxXmlParserUtils.getName(e.asStartElement.getName,
options)
- val dataTypes = nameToDataType.getOrElse(field,
ArrayBuffer.empty[DataType])
+ val dataTypes =
+ nameToDataType.getOrElse(getCaseSensitiveName(field),
ArrayBuffer.empty[DataType])
dataTypes += inferredType
Review Comment:
Instead of using an `ArrayBuffer`, we should be able to call `compatibleType`
right here. That can be a separate PR.
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala:
##########
@@ -92,7 +97,7 @@ private[sql] class XmlInferSchema(options: XmlOptions)
extends Serializable with
* 2. Merge types by choosing the lowest type necessary to cover equal keys
* 3. Replace any remaining null fields with string, the top type
*/
- def infer(xml: RDD[String]): StructType = {
+ def infer(xml: RDD[String], caseSensitive: Boolean): StructType = {
Review Comment:
Remove the `caseSensitive` argument from all functions and add it as an
argument to class `XmlInferSchema` instead.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]