jonvex commented on code in PR #11130:
URL: https://github.com/apache/hudi/pull/11130#discussion_r1586941487
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieConversionUtils.scala:
##########
@@ -30,9 +31,7 @@ object HoodieConversionUtils {
* a mutable one)
*/
def mapAsScalaImmutableMap[K, V](map: ju.Map[K, V]): Map[K, V] = {
- // NOTE: We have to use deprecated [[JavaConversions]] to stay compatible
w/ Scala 2.11
Review Comment:
Is this not compatible with Scala 2.11 anymore?
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/HoodieDatasetBulkInsertHelper.scala:
##########
@@ -241,17 +241,16 @@ object HoodieDatasetBulkInsertHelper
}
}
- private def getPartitionPathFields(config: HoodieWriteConfig): Seq[String] =
{
+ private def getPartitionPathFields(config: HoodieWriteConfig):
mutable.Seq[String] = {
Review Comment:
what is the purpose of this change?
##########
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ValidateMetadataTableFilesProcedure.scala:
##########
@@ -115,10 +115,10 @@ class ValidateMetadataTableFilesProcedure() extends
BaseProcedure with Procedure
rows.add(Row(partition, file, doesFsFileExists,
doesMetadataFileExists, fsFileLength, metadataFileLength))
}
}
- if (metadataPathInfoList.length != pathInfoList.length) {
- logError(" FS and metadata files count not matching for " + partition
+ ". FS files count " + pathInfoList.length + ", metadata base files count " +
metadataPathInfoList.length)
+ if (metadataPathInfoList.size() != pathInfoList.size()) {
Review Comment:
Are `size` and `length` the same here? Usually one is the capacity and the other is
the usage.
##########
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/spark/sql/hudi/command/procedures/ExportInstantsProcedure.scala:
##########
@@ -176,12 +176,12 @@ class ExportInstantsProcedure extends BaseProcedure with
ProcedureBuilder with L
@throws[Exception]
private def copyNonArchivedInstants(metaClient: HoodieTableMetaClient,
instants: util.List[HoodieInstant], limit: Int, localFolder: String): Int = {
- import scala.collection.JavaConversions._
+ import scala.collection.JavaConverters._
var copyCount = 0
- if (instants.nonEmpty) {
+ if (!instants.isEmpty) {
Review Comment:
They got rid of nonEmpty!!!?
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/analysis/TestHoodiePruneFileSourcePartitions.scala:
##########
@@ -107,12 +107,12 @@ class TestHoodiePruneFileSourcePartitions extends
HoodieClientTestBase with Scal
case "eager" =>
// NOTE: In case of partitioned table 3 files will be created,
while in case of non-partitioned just 1
if (partitioned) {
- assertEquals(1275, f.stats.sizeInBytes.longValue() / 1024)
- assertEquals(1275, lr.stats.sizeInBytes.longValue() / 1024)
+ assertEquals(1275, f.stats.sizeInBytes.longValue / 1024)
Review Comment:
Do they now require omitting the empty parentheses `()` on these calls?
##########
hudi-spark-datasource/hudi-spark3.5.x/src/test/java/org/apache/hudi/spark3/internal/TestReflectUtil.java:
##########
@@ -42,7 +44,7 @@ public void testDataSourceWriterExtraCommitMetadata() throws
Exception {
InsertIntoStatement newStatment = ReflectUtil.createInsertInto(
statement.table(),
statement.partitionSpec(),
- scala.collection.immutable.List.empty(),
+ ((scala.collection.immutable.Seq<String>)
scala.collection.immutable.Seq$.MODULE$.empty()).toSeq(),
Review Comment:
?. I guess disabled because it doesn't work?
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/HoodieInternalRowUtils.scala:
##########
@@ -18,11 +18,12 @@
package org.apache.spark.sql
-import org.apache.avro.Schema
Review Comment:
Not sure what is going on with this file?
##########
.github/workflows/bot.yml:
##########
@@ -454,17 +486,21 @@ jobs:
env:
FLINK_PROFILE: ${{ matrix.flinkProfile }}
SPARK_PROFILE: ${{ matrix.sparkProfile }}
- SCALA_PROFILE: 'scala-2.12'
+ SCALA_PROFILE: ${{ matrix.scalaProfile }}
run: |
- mvn clean package -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE"
-DdeployArtifacts=true -DskipTests=true $MVN_ARGS
+ if [ "$SCALA_PROFILE" == "scala-2.13" ]; then
+ mvn clean package -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE"
-DdeployArtifacts=true -DskipTests=true $MVN_ARGS -pl
packaging/hudi-hadoop-mr-bundle,packaging/hudi-kafka-connect-bundle,packaging/hudi-spark-bundle,packaging/hudi-utilities-bundle,packaging/hudi-utilities-slim-bundle,packaging/hudi-metaserver-server-bundle
-am
+ else
+ mvn clean package -T 2 -D"$SCALA_PROFILE" -D"$SPARK_PROFILE"
-DdeployArtifacts=true -DskipTests=true $MVN_ARGS
Review Comment:
I think maybe this line was supposed to be deleted?
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/AvroConversionUtils.scala:
##########
@@ -18,20 +18,20 @@
package org.apache.hudi
-import org.apache.avro.Schema.Type
-import org.apache.avro.generic.GenericRecord
-import org.apache.avro.{JsonProperties, Schema}
import org.apache.hudi.HoodieSparkUtils.sparkAdapter
import org.apache.hudi.avro.AvroSchemaUtils
import org.apache.hudi.exception.SchemaCompatibilityException
import org.apache.hudi.internal.schema.HoodieSchemaException
+
+import org.apache.avro.Schema.Type
+import org.apache.avro.generic.GenericRecord
+import org.apache.avro.{JsonProperties, Schema}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType}
import org.apache.spark.sql.{Dataset, Row, SparkSession}
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
Review Comment:
Is there a way we can move to just the hudi javascala converters?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/ColumnStatsIndexSupport.scala:
##########
@@ -308,7 +309,7 @@ class ColumnStatsIndexSupport(spark: SparkSession,
}
}
- Row(coalescedRowValuesSeq:_*)
+ Row(coalescedRowValuesSeq.toSeq: _*)
Review Comment:
Is this name still correct if we need to call .toSeq on it?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala:
##########
@@ -263,10 +262,10 @@ object DefaultSource {
Option(schema)
}
- val useNewParquetFileFormat =
parameters.getOrDefault(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(),
+ val useNewParquetFileFormat =
parameters.asJava.getOrDefault(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(),
HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue().toString).toBoolean
&&
!metaClient.isMetadataTable && (globPaths == null ||
globPaths.isEmpty) &&
- !parameters.getOrDefault(SCHEMA_EVOLUTION_ENABLED.key(),
SCHEMA_EVOLUTION_ENABLED.defaultValue().toString).toBoolean &&
+ !parameters.asJava.getOrDefault(SCHEMA_EVOLUTION_ENABLED.key(),
SCHEMA_EVOLUTION_ENABLED.defaultValue().toString).toBoolean &&
Review Comment:
`parameters` is a `Map[String, String]`, so why do we need to convert it to a Java map?
Did they really get rid of that in Scala 2.13?!?
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala:
##########
@@ -226,7 +226,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase
with ScalaAssertionSup
.save(basePath)
partitionPaths = FSUtils.getAllPartitionPaths(new
HoodieSparkEngineContext(jsc), HoodieMetadataConfig.newBuilder().build(),
basePath)
- assertEquals(partitionPaths.length, 1)
+ assertEquals(partitionPaths.size(), 1)
Review Comment:
same as other comment
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala:
##########
@@ -959,7 +959,7 @@ class TestCOWDataSource extends HoodieSparkClientTestBase
with ScalaAssertionSup
assertEquals(insert1Cnt, hoodieROViewDF1.count())
val commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(storage,
basePath)
- val records2 = recordsToStrings(inserts2Dup ++ inserts2New).toList
+ val records2 = recordsToStrings((inserts2Dup.asScala ++
inserts2New.asScala).asJava).asScala.toList
Review Comment:
😬
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/dml/TestHoodieTableValuedFunction.scala:
##########
@@ -689,6 +690,6 @@ class TestHoodieTableValuedFunction extends
HoodieSparkSqlTestBase {
}
}
}
- spark.sessionState.conf.unsetConf(SPARK_SQL_INSERT_INTO_OPERATION.key)
+ spark.sessionState.conf.unsetConf(SPARK_SQL_INSERT_INTO_OPERATION.key)*/
Review Comment:
Are we going to uncomment?
##########
hudi-spark-datasource/hudi-spark3-common/src/main/java/org/apache/hudi/spark3/internal/ReflectUtil.java:
##########
@@ -23,7 +23,7 @@
import org.apache.spark.sql.catalyst.util.DateFormatter;
import scala.Option;
-import scala.collection.Seq;
+import scala.collection.immutable.Seq;
import scala.collection.immutable.Map;
Review Comment:
combine into scala.collection.immutable.{Seq, Map} ?
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/procedure/TestHdfsParquetImportProcedure.scala:
##########
@@ -112,7 +112,7 @@ class TestHdfsParquetImportProcedure extends
HoodieSparkProcedureTestBase {
@throws[ParseException]
@throws[IOException]
def createInsertRecords(srcFolder: Path): util.List[GenericRecord] = {
- import scala.collection.JavaConversions._
+ import scala.collection.JavaConverters._
Review Comment:
Should we just do the import at the top of the file instead of in each individual method?
##########
packaging/bundle-validation/base/Dockerfile:
##########
@@ -51,9 +52,16 @@ RUN wget
https://archive.apache.org/dist/flink/flink-$FLINK_VERSION/flink-$FLINK
&& rm $WORKDIR/flink-$FLINK_VERSION-bin-scala_2.12.tgz
ENV FLINK_HOME=$WORKDIR/flink-$FLINK_VERSION
-RUN wget
https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
-P "$WORKDIR" \
- && tar -xf
$WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz -C $WORKDIR/ \
- && rm $WORKDIR/spark-$SPARK_VERSION-bin-hadoop$SPARK_HADOOP_VERSION.tgz
+RUN if [ "$SCALA_VERSION" = "2.13" ]; then \
Review Comment:
Spark version is set to 3.1.3 so do we need this?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala:
##########
@@ -263,10 +262,10 @@ object DefaultSource {
Option(schema)
}
- val useNewParquetFileFormat =
parameters.getOrDefault(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(),
+ val useNewParquetFileFormat =
parameters.asJava.getOrDefault(HoodieReaderConfig.FILE_GROUP_READER_ENABLED.key(),
HoodieReaderConfig.FILE_GROUP_READER_ENABLED.defaultValue().toString).toBoolean
&&
!metaClient.isMetadataTable && (globPaths == null ||
globPaths.isEmpty) &&
- !parameters.getOrDefault(SCHEMA_EVOLUTION_ENABLED.key(),
SCHEMA_EVOLUTION_ENABLED.defaultValue().toString).toBoolean &&
+ !parameters.asJava.getOrDefault(SCHEMA_EVOLUTION_ENABLED.key(),
SCHEMA_EVOLUTION_ENABLED.defaultValue().toString).toBoolean &&
Review Comment:
I see you used getOrElse elsewhere right?
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/util/JavaScalaConverters.scala:
##########
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.util
+
+import scala.collection.JavaConverters._
+
+/**
+ * Utils that do conversion between Java and Scala collections.
+ */
+object JavaScalaConverters {
Review Comment:
If we only are going to have these 4 conversions then the naming is fine.
But if we add more, we should make "JavaToScalaConverters" and
"ScalaToJavaConverters" so the methods can just be "convertList",
"convertIterator", etc
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]