[ https://issues.apache.org/jira/browse/HUDI-9791 ]
Lin Liu deleted comment on HUDI-9791:
-------------------------------
was (Author: JIRAUSER301185):
/
{code:java}
➜ ~ spark-3.2Welcome to ____ __ / __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.2.1 /_/
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 1.8.0_392-internalBranch
HEADCompiled by user hgao on 2022-01-20T19:26:14ZRevision
4f25b3f71238a00508a356591553f2dfa89f8290Url https://github.com/apache/sparkType
--help for more information.➜ ~ export SPARK_VERSION=3.2➜ ~ spark-3.2Welcome
to ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/
'_/ /___/ .__/\_,_/_/ /_/\_\ version 3.2.1 /_/
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 1.8.0_392-internalBranch
HEADCompiled by user hgao on 2022-01-20T19:26:14ZRevision
4f25b3f71238a00508a356591553f2dfa89f8290Url https://github.com/apache/sparkType
--help for more information.➜ ~ spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'::
loading settings :: url =
jar:file:/Users/linliu/libraries/spark-3.2.1-bin-hadoop3/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xmlIvy
Default Cache set to: /Users/linliu/.ivy2/cacheThe jars for the packages
stored in: /Users/linliu/.ivy2/jarsorg.apache.hudi#hudi-spark3.2-bundle_2.12
added as a dependency:: resolving dependencies ::
org.apache.spark#spark-submit-parent-4b96494a-df95-4a52-a2ac-322d2d02f5b2;1.0
confs: [default] found org.apache.hudi#hudi-spark3.2-bundle_2.12;0.14.1
in local-m2-cache:: resolution report :: resolve 178ms :: artifacts dl 3ms
:: modules in use: org.apache.hudi#hudi-spark3.2-bundle_2.12;0.14.1 from
local-m2-cache in [default]
--------------------------------------------------------------------- |
| modules || artifacts | | conf
| number| search|dwnlded|evicted|| number|dwnlded|
--------------------------------------------------------------------- |
default | 1 | 0 | 0 | 0 || 1 | 0 |
---------------------------------------------------------------------::
retrieving ::
org.apache.spark#spark-submit-parent-4b96494a-df95-4a52-a2ac-322d2d02f5b2
confs: [default] 0 artifacts copied, 1 already retrieved
(0kB/10ms)25/09/05 14:17:16 WARN NativeCodeLoader: Unable to load native-hadoop
library for your platform... using builtin-java classes where applicableUsing
Spark's default log4j profile:
org/apache/spark/log4j-defaults.propertiesSetting default log level to
"WARN".To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
setLogLevel(newLevel).25/09/05 14:17:24 WARN Utils: Service 'SparkUI' could not
bind on port 4040. Attempting port 4041.Spark context Web UI available at
http://lins-mbp.attlocal.net:4041Spark context available as 'sc' (master =
local[*], app id = local-1757107044815).Spark session available as
'spark'.Welcome to ____ __ / __/__ ___ _____/ /__ _\
\/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.2.1 /_/
Using Scala version 2.12.15 (OpenJDK 64-Bit Server VM, Java
1.8.0_392-internal)Type in expressions to have them evaluated.Type :help for
more information.
scala> import scala.collection.JavaConversions._import
scala.collection.JavaConversions._
scala> import org.apache.spark.sql.SaveMode._import
org.apache.spark.sql.SaveMode._
scala> import org.apache.hudi.DataSourceReadOptions._import
org.apache.hudi.DataSourceReadOptions._
scala> import org.apache.hudi.DataSourceWriteOptions._import
org.apache.hudi.DataSourceWriteOptions._
scala> import org.apache.hudi.common.table.HoodieTableConfig._import
org.apache.hudi.common.table.HoodieTableConfig._
scala> import org.apache.hudi.config.HoodieWriteConfig._import
org.apache.hudi.config.HoodieWriteConfig._
scala> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._import
org.apache.hudi.keygen.constant.KeyGeneratorOptions._
scala> import org.apache.hudi.common.model.HoodieRecordimport
org.apache.hudi.common.model.HoodieRecord
scala> import spark.implicits._import spark.implicits._
scala> val tableName = "trips_table"tableName: String = trips_table
scala> val basePath = "file:///tmp/trips_table"basePath: String =
file:///tmp/trips_table
scala> val columns = Seq("ts","uuid","rider","driver","fare","city")columns:
Seq[String] = List(ts, uuid, rider, driver, fare, city)
scala> val data = |
Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),<console>:2:
error: illegal character '\u00a0'
Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),
^
scala>
(1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
,"san_francisco"),<console>:1: error: illegal character '\u00a0'
(1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
,"san_francisco"), ^<console>:1: error: illegal character '\u00a0'
(1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
,"san_francisco"), ^
scala>
(1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
,"san_francisco"),<console>:1: error: illegal character '\u00a0'
(1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
,"san_francisco"), ^<console>:1: error: illegal character '\u00a0'
(1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
,"san_francisco"), ^
scala>
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
),<console>:1: error: illegal character '\u00a0'
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
), ^<console>:1: error: illegal character '\u00a0'
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
), ^<console>:1: error: illegal character '\u00a0'
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
),
^<console>:1: error: illegal character '\u00a0'
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
),
^
scala>
(1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));<console>:1:
error: illegal character '\u00a0'
(1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));
^<console>:1: error: illegal character '\u00a0'
(1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));
^
scala> var inserts = spark.createDataFrame(data).toDF(columns:_*)<console>:48:
error: not found: value data var inserts =
spark.createDataFrame(data).toDF(columns:_*)
^
scala> inserts.write.format("hudi"). |
option("hoodie.datasource.write.partitionpath.field", "city").<console>:2:
error: illegal character '\u00a0'
option("hoodie.datasource.write.partitionpath.field", "city"). ^
scala> option("hoodie.table.name", tableName).<console>:1: error: illegal
character '\u00a0' option("hoodie.table.name", tableName). ^
scala> option("hoodie.metadata.index.column.stats.enable",
"true").<console>:1: error: illegal character '\u00a0'
option("hoodie.metadata.index.column.stats.enable", "true"). ^
scala> option("hoodie.datasource.write.table.type",
"MERGE_ON_READ").<console>:1: error: illegal character '\u00a0'
option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). ^
scala> mode(Overwrite).<console>:1: error: illegal character '\u00a0'
mode(Overwrite). ^
scala> save(basePath)<console>:1: error: illegal character '\u00a0'
save(basePath) ^
scala> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider"
=== "rider-D").withColumn("fare", col("fare") * 10)25/09/05 14:17:54 WARN
DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir
of hudi-defaults.conf25/09/05 14:17:54 WARN DFSPropertiesConfiguration:
Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to
load props fileupdatesDf: org.apache.spark.sql.DataFrame =
[_hoodie_commit_time: string, _hoodie_commit_seqno: string ... 9 more fields]
scala>
scala> updatesDf.write.format("hudi"). |
option("hoodie.datasource.write.operation", "upsert").<console>:2: error:
illegal character '\u00a0' option("hoodie.datasource.write.operation",
"upsert"). ^
scala> option("hoodie.datasource.write.partitionpath.field",
"city").<console>:1: error: illegal character '\u00a0'
option("hoodie.datasource.write.partitionpath.field", "city"). ^
scala> option("hoodie.table.name", tableName).<console>:1: error: illegal
character '\u00a0' option("hoodie.table.name", tableName). ^
scala> option("hoodie.metadata.index.column.stats.enable",
"true").<console>:1: error: illegal character '\u00a0'
option("hoodie.metadata.index.column.stats.enable", "true"). ^
scala> option("hoodie.datasource.write.table.type",
"MERGE_ON_READ").<console>:1: error: illegal character '\u00a0'
option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). ^
scala> mode(Append).<console>:1: error: illegal character '\u00a0'
mode(Append). ^
scala> save(basePath)<console>:1: error: illegal character '\u00a0'
save(basePath) ^
scala> import scala.collection.JavaConversions._import
scala.collection.JavaConversions._
scala> import org.apache.spark.sql.SaveMode._import
org.apache.spark.sql.SaveMode._
scala> import org.apache.hudi.DataSourceReadOptions._import
org.apache.hudi.DataSourceReadOptions._
scala> import org.apache.hudi.DataSourceWriteOptions._import
org.apache.hudi.DataSourceWriteOptions._
scala> import org.apache.hudi.common.table.HoodieTableConfig._import
org.apache.hudi.common.table.HoodieTableConfig._
scala> import org.apache.hudi.config.HoodieWriteConfig._import
org.apache.hudi.config.HoodieWriteConfig._
scala> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._import
org.apache.hudi.keygen.constant.KeyGeneratorOptions._
scala> import org.apache.hudi.common.model.HoodieRecordimport
org.apache.hudi.common.model.HoodieRecord
scala> import spark.implicits._import spark.implicits._
scala> val tableName = "trips_table"tableName: String = trips_table
scala> val basePath = "file:///tmp/trips_table"basePath: String =
file:///tmp/trips_table
scala> val columns = Seq("ts","uuid","rider","driver","fare","city")columns:
Seq[String] = List(ts, uuid, rider, driver, fare, city)
scala> val data = |
Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),
|
(1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
,"san_francisco"), |
(1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
,"san_francisco"), |
(1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
), |
(1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));data:
Seq[(Long, String, String, String, Double, String)] =
List((1695159649087,334e26e9-8355-45cc-97c6-c31daf0df330,rider-A,driver-K,19.1,san_francisco),
(1695091554788,e96c4396-3fad-413a-a942-4cb36106d721,rider-C,driver-M,27.7,san_francisco),
(1695046462179,9909a8b1-2d15-4d3d-8ec9-efc48c536a00,rider-D,driver-L,33.9,san_francisco),
(1695516137016,e3cf430c-889d-4015-bc98-59bdce1e530c,rider-F,driver-P,34.15,sao_paulo),
(1695115999911,c8abbe79-8d89-47ea-b4ce-4d224bae5bfa,rider-J,driver-T,17.85,chennai))
scala> var inserts = spark.createDataFrame(data).toDF(columns:_*)inserts:
org.apache.spark.sql.DataFrame = [ts: bigint, uuid: string ... 4 more fields]
scala> inserts.write.format("hudi"). |
option("hoodie.datasource.write.partitionpath.field", "city"). |
option("hoodie.table.name", tableName). |
option("hoodie.metadata.index.column.stats.enable", "true"). |
option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). |
mode(Overwrite). | save(basePath)25/09/05 14:18:44 WARN
HoodieSparkSqlWriterInternal: Choosing BULK_INSERT as the operation type since
auto record key generation is applicable25/09/05 14:18:44 WARN
HoodieSparkSqlWriterInternal: hoodie table at file:/tmp/trips_table already
exists. Deleting existing data & overwriting with new data.25/09/05 14:18:45
WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist25/09/05
14:18:45 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not
exist25/09/05 14:18:47 WARN ObjectStore: Version information not found in
metastore. hive.metastore.schema.verification is not enabled so recording the
schema version 2.3.025/09/05 14:18:47 WARN ObjectStore:
setMetaStoreSchemaVersion called but recording version is disabled: version =
2.3.0, comment = Set by MetaStore [email protected]/09/05 14:18:48 WARN
AutoRecordKeyGenerationUtils$: Precombine field ts will be ignored with auto
record key generation enabled25/09/05 14:18:52 WARN MetricsConfig: Cannot
locate configuration: tried
hadoop-metrics2-hbase.properties,hadoop-metrics2.properties# WARNING: Unable to
get Instrumentation. Dynamic Attach failed. You may add this JAR as -javaagent
manually, or supply -Djdk.attach.allowAttachSelf# WARNING: Unable to get
Instrumentation. Dynamic Attach failed. You may add this JAR as -javaagent
manually, or supply -Djdk.attach.allowAttachSelf# WARNING: Unable to attach
Serviceability Agent. Unable to attach even with module exceptions:
[org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.]#
WARNING: Unable to attach Serviceability Agent. Unable to attach even with
module exceptions: [org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException:
Sense failed., org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense
failed., org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense
failed.]# WARNING: Unable to attach Serviceability Agent. Unable to attach even
with module exceptions:
[org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.]
scala> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider"
=== "rider-D").withColumn("fare", col("fare") * 10)updatesDf:
org.apache.spark.sql.DataFrame = [_hoodie_commit_time: string,
_hoodie_commit_seqno: string ... 9 more fields]
scala>
scala> updatesDf.write.format("hudi"). |
option("hoodie.datasource.write.operation", "upsert"). |
option("hoodie.datasource.write.partitionpath.field", "city"). |
option("hoodie.table.name", tableName). |
option("hoodie.metadata.index.column.stats.enable", "true"). |
option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). |
mode(Append). | save(basePath)25/09/05 14:19:04 WARN HoodieWriterUtils$:
Changing operation type to UPSERT PREPPED for pk less table upserts25/09/05
14:19:04 WARN AutoRecordKeyGenerationUtils$: Precombine field ts will be
ignored with auto record key generation enabled25/09/05 14:19:08 WARN
HoodieSparkSqlWriterInternal: Closing write client
scala> exit<console>:73: error: not found: value exit exit ^
scala> %➜ ~ spark-3.5Welcome to ____ __ / __/__ ___
_____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version
3.5.0 /_/
Using Scala version 2.12.18, OpenJDK 64-Bit Server VM, 1.8.0_392-internalBranch
HEADCompiled by user ubuntu on 2023-09-09T01:53:20ZRevision
ce5ddad990373636e94071e7cef2f31021add07bUrl https://github.com/apache/sparkType
--help for more information.➜ ~ export SPARK_VERSION=3.5 # or 3.4,
3.3spark-shell --packages
org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:1.1.0-SNAPSHOT \--conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' \--conf
'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
\--conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
\--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'::
loading settings :: url =
jar:file:/Users/linliu/libraries/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xmlIvy
Default Cache set to: /Users/linliu/.ivy2/cacheThe jars for the packages
stored in: /Users/linliu/.ivy2/jarsorg.apache.hudi#hudi-spark3.5-bundle_2.12
added as a dependency:: resolving dependencies ::
org.apache.spark#spark-submit-parent-98cac2cf-863b-4747-99fd-31654bd46720;1.0
confs: [default] found
org.apache.hudi#hudi-spark3.5-bundle_2.12;1.1.0-SNAPSHOT in local-m2-cache
found org.apache.hive#hive-storage-api;2.8.1 in local-m2-cache found
org.slf4j#slf4j-api;1.7.36 in local-m2-cache:: resolution report :: resolve
290ms :: artifacts dl 15ms :: modules in use:
org.apache.hive#hive-storage-api;2.8.1 from local-m2-cache in [default]
org.apache.hudi#hudi-spark3.5-bundle_2.12;1.1.0-SNAPSHOT from local-m2-cache in
[default] org.slf4j#slf4j-api;1.7.36 from local-m2-cache in [default]
--------------------------------------------------------------------- |
| modules || artifacts | | conf
| number| search|dwnlded|evicted|| number|dwnlded|
--------------------------------------------------------------------- |
default | 3 | 0 | 0 | 0 || 3 | 0 |
---------------------------------------------------------------------::
retrieving ::
org.apache.spark#spark-submit-parent-98cac2cf-863b-4747-99fd-31654bd46720
confs: [default] 0 artifacts copied, 3 already retrieved
(0kB/7ms)25/09/05 14:19:37 WARN NativeCodeLoader: Unable to load native-hadoop
library for your platform... using builtin-java classes where applicableSetting
default log level to "WARN".To adjust logging level use
sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).25/09/05
14:19:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting
port 4041.Spark context Web UI available at
http://lins-mbp.attlocal.net:4041Spark context available as 'sc' (master =
local[*], app id = local-1757107186666).Spark session available as
'spark'.Welcome to ____ __ / __/__ ___ _____/ /__ _\
\/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.5.0 /_/
Using Scala version 2.12.18 (OpenJDK 64-Bit Server VM, Java
1.8.0_392-internal)Type in expressions to have them evaluated.Type :help for
more information.
scala> import scala.collection.JavaConversions._import
scala.collection.JavaConversions._
scala> import org.apache.spark.sql.SaveMode._import
org.apache.spark.sql.SaveMode._
scala> import org.apache.hudi.DataSourceReadOptions._import
org.apache.hudi.DataSourceReadOptions._
scala> import org.apache.hudi.DataSourceWriteOptions._import
org.apache.hudi.DataSourceWriteOptions._
scala> import org.apache.hudi.common.table.HoodieTableConfig._import
org.apache.hudi.common.table.HoodieTableConfig._
scala> import org.apache.hudi.config.HoodieWriteConfig._import
org.apache.hudi.config.HoodieWriteConfig._
scala> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._import
org.apache.hudi.keygen.constant.KeyGeneratorOptions._
scala> import org.apache.hudi.common.model.HoodieRecordimport
org.apache.hudi.common.model.HoodieRecord
scala> import spark.implicits._import spark.implicits._
scala> val tableName = "trips_table"tableName: String = trips_table
scala> val basePath = "file:///tmp/trips_table"basePath: String =
file:///tmp/trips_table
scala> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider"
=== "rider-D").withColumn("fare", col("fare") * 10)25/09/05 14:20:12 WARN
DFSPropertiesConfiguration: Properties file
file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props
file25/09/05 14:20:12 WARN DFSPropertiesConfiguration: Cannot find
HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf25/09/05 14:20:13
WARN ConfigUtils: The configuration key
'hoodie.compaction.record.merger.strategy' has been deprecated and may be
removed in the future. Please use the new key 'hoodie.record.merge.strategy.id'
instead.updatesDf: org.apache.spark.sql.DataFrame = [_hoodie_commit_time:
string, _hoodie_commit_seqno: string ... 9 more fields]
scala>
scala>
updatesDf.write.format("hudi").option("hoodie.datasource.write.operation",
"upsert").option("hoodie.datasource.write.partitionpath.field",
"city").option("hoodie.table.name",
tableName).option("hoodie.metadata.index.column.stats.enable",
"true").option("hoodie.write.table.version",
"6").option("hoodie.datasource.write.table.type",
"MERGE_ON_READ").mode(Append).save(basePath)# WARNING: Unable to attach
Serviceability Agent. Unable to attach even with module exceptions:
[org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense failed.,
org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense
failed.]25/09/05 14:20:16 WARN ConfigUtils: The configuration key
'hoodie.compaction.record.merger.strategy' has been deprecated and may be
removed in the future. Please use the new key 'hoodie.record.merge.strategy.id'
instead.25/09/05 14:20:16 WARN ConfigUtils: The configuration key
'hoodie.compaction.record.merger.strategy' has been deprecated and may be
removed in the future. Please use the new key 'hoodie.record.merge.strategy.id'
instead.25/09/05 14:20:16 WARN HoodieWriterUtils$: Changing operation type to
UPSERT PREPPED for pk less table upserts25/09/05 14:20:16 WARN
AutoRecordKeyGenerationUtils$: Precombine field ts will be ignored with auto
record key generation enabled25/09/05 14:20:16 WARN HoodieWriteConfig:
HoodieTableVersion.SIX is not yet fully supported by the writer. Please expect
some unexpected behavior, until its fully implemented.25/09/05 14:20:22 WARN
HoodieWriteConfig: HoodieTableVersion.SIX is not yet fully supported by the
writer. Please expect some unexpected behavior, until its fully
implemented.25/09/05 14:20:23 WARN ConfigUtils: The configuration key
'hoodie.compaction.record.merger.strategy' has been deprecated and may be
removed in the future. Please use the new key 'hoodie.record.merge.strategy.id'
instead.25/09/05 14:20:27 WARN HoodieWriteConfig: HoodieTableVersion.SIX is not
yet fully supported by the writer. Please expect some unexpected behavior,
until its fully implemented.25/09/05 14:20:28 WARN MetricsConfig: Cannot locate
configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties
scala> %➜ ~ spark-3.2Welcome to ____ __ / __/__ ___
_____/ /__ _\ \/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version
3.2.1 /_/
Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 1.8.0_392-internalBranch
HEADCompiled by user hgao on 2022-01-20T19:26:14ZRevision
4f25b3f71238a00508a356591553f2dfa89f8290Url https://github.com/apache/sparkType
--help for more information.➜ ~ export SPARK_VERSION=3.2➜ ~ spark-shell
--packages org.apache.hudi:hudi-spark$SPARK_VERSION-bundle_2.12:0.14.1 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--conf 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar'::
loading settings :: url =
jar:file:/Users/linliu/libraries/spark-3.2.1-bin-hadoop3/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xmlIvy
Default Cache set to: /Users/linliu/.ivy2/cacheThe jars for the packages
stored in: /Users/linliu/.ivy2/jarsorg.apache.hudi#hudi-spark3.2-bundle_2.12
added as a dependency:: resolving dependencies ::
org.apache.spark#spark-submit-parent-79f360af-ce55-47c6-a9d1-9bcbe74c0b03;1.0
confs: [default] found org.apache.hudi#hudi-spark3.2-bundle_2.12;0.14.1
in local-m2-cache:: resolution report :: resolve 180ms :: artifacts dl 4ms
:: modules in use: org.apache.hudi#hudi-spark3.2-bundle_2.12;0.14.1 from
local-m2-cache in [default]
--------------------------------------------------------------------- |
| modules || artifacts | | conf
| number| search|dwnlded|evicted|| number|dwnlded|
--------------------------------------------------------------------- |
default | 1 | 0 | 0 | 0 || 1 | 0 |
---------------------------------------------------------------------::
retrieving ::
org.apache.spark#spark-submit-parent-79f360af-ce55-47c6-a9d1-9bcbe74c0b03
confs: [default] 0 artifacts copied, 1 already retrieved
(0kB/6ms)25/09/05 14:21:01 WARN NativeCodeLoader: Unable to load native-hadoop
library for your platform... using builtin-java classes where applicableUsing
Spark's default log4j profile:
org/apache/spark/log4j-defaults.propertiesSetting default log level to
"WARN".To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use
setLogLevel(newLevel).25/09/05 14:21:09 WARN Utils: Service 'SparkUI' could not
bind on port 4040. Attempting port 4041.Spark context Web UI available at
http://lins-mbp.attlocal.net:4041Spark context available as 'sc' (master =
local[*], app id = local-1757107269339).Spark session available as
'spark'.Welcome to ____ __ / __/__ ___ _____/ /__ _\
\/ _ \/ _ `/ __/ '_/ /___/ .__/\_,_/_/ /_/\_\ version 3.2.1 /_/
Using Scala version 2.12.15 (OpenJDK 64-Bit Server VM, Java
1.8.0_392-internal)Type in expressions to have them evaluated.Type :help for
more information.
scala> spark.read.format("hudi").option("hoodie.metadata.enable",
"true").option("hoodie.enable.data.skipping",
"true").option("hoodie.metadata.index.column.stats.enable",
"true").load("/tmp/trips_table").filter("fare > 100").show(100,false)25/09/05
14:21:18 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set
it as the dir of hudi-defaults.conf25/09/05 14:21:18 WARN
DFSPropertiesConfiguration: Properties file
file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file#
WARNING: Unable to attach Serviceability Agent. Unable to attach even with
module exceptions: [org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException:
Sense failed., org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense
failed., org.apache.hudi.org.openjdk.jol.vm.sa.SASupportException: Sense
failed.]25/09/05 14:21:20 WARN MetricsConfig: Cannot locate configuration:
tried
hadoop-metrics2-hbase.properties,hadoop-metrics2.properties+-------------------+---------------------+---------------------+----------------------+--------------------------------------+-------------+------------------------------------+-------+--------+------+-------------+|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key |_hoodie_partition_path|_hoodie_file_name
|ts |uuid |rider |driver |fare
|city
|+-------------------+---------------------+---------------------+----------------------+--------------------------------------+-------------+------------------------------------+-------+--------+------+-------------+|20250905142017073
|20250905142017073_0_1|20250905141844928_2_0|san_francisco
|fbe58388-7307-4159-90fa-f2e0b63f111d-0|1695046462179|9909a8b1-2d15-4d3d-8ec9-efc48c536a00|rider-D|driver-L|3390.0|san_francisco|+-------------------+---------------------+---------------------+----------------------+--------------------------------------+-------------+------------------------------------+-------+--------+------+-------------+
{code}
> MDT breaks with hfile reader changes when writing with master using table
> version 6
> -----------------------------------------------------------------------------------
>
> Key: HUDI-9791
> URL: https://issues.apache.org/jira/browse/HUDI-9791
> Project: Apache Hudi
> Issue Type: Bug
> Components: metadata
> Reporter: Jonathan Vexler
> Assignee: Lin Liu
> Priority: Blocker
> Fix For: 1.1.0
>
> Attachments: hfile.error
>
>
> Exception: [^hfile.error]
> Summary:
> # Write an insert and an update to a MOR table with Spark 3.1 and Hudi 0.14.1
> # Write an update with Spark 3.5 using current master
> `adda6950e0aaa7353add88ee2fc0499d7135ee33` with write table version 6
> # Read the table with Spark 3.1 and Hudi 0.14.1 and get an exception
> The hoodie.properties file still says the table version is 6.
> Here is my runscript:
> {code:java}
> set_spark 3.1
> hudi_spark_shell -p -v 0.14.1
> import scala.collection.JavaConversions._
> import org.apache.spark.sql.SaveMode._
> import org.apache.hudi.DataSourceReadOptions._
> import org.apache.hudi.DataSourceWriteOptions._
> import org.apache.hudi.common.table.HoodieTableConfig._
> import org.apache.hudi.config.HoodieWriteConfig._
> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._
> import org.apache.hudi.common.model.HoodieRecord
> import spark.implicits._
> val tableName = "trips_table"
> val basePath = "file:///tmp/trips_table"
> val columns = Seq("ts","uuid","rider","driver","fare","city")
> val data =
>
> Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),
>
> (1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
> ,"san_francisco"),
>
> (1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
> ,"san_francisco"),
>
> (1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
> ),
>
> (1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));
> var inserts = spark.createDataFrame(data).toDF(columns:_*)
> inserts.write.format("hudi").
> option("hoodie.datasource.write.partitionpath.field", "city").
> option("hoodie.table.name", tableName).
> option("hoodie.metadata.index.column.stats.enable", "true").
> option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
> mode(Overwrite).
> save(basePath)
> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" ===
> "rider-D").withColumn("fare", col("fare") * 10)
> updatesDf.write.format("hudi").
> option("hoodie.datasource.write.operation", "upsert").
> option("hoodie.datasource.write.partitionpath.field", "city").
> option("hoodie.table.name", tableName).
> option("hoodie.metadata.index.column.stats.enable", "true").
> option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
> mode(Append).
> save(basePath)
> //exit
> set_spark 3.5
> hudi_spark_shell -j
> import scala.collection.JavaConversions._
> import org.apache.spark.sql.SaveMode._
> import org.apache.hudi.DataSourceReadOptions._
> import org.apache.hudi.DataSourceWriteOptions._
> import org.apache.hudi.common.table.HoodieTableConfig._
> import org.apache.hudi.config.HoodieWriteConfig._
> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._
> import org.apache.hudi.common.model.HoodieRecord
> import spark.implicits._
> val tableName = "trips_table"
> val basePath = "file:///tmp/trips_table"
> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" ===
> "rider-D").withColumn("fare", col("fare") * 10)
> updatesDf.write.format("hudi").option("hoodie.datasource.write.operation",
> "upsert").option("hoodie.datasource.write.partitionpath.field",
> "city").option("hoodie.table.name",
> tableName).option("hoodie.metadata.index.column.stats.enable",
> "true").option("hoodie.write.table.version",
> "6").option("hoodie.datasource.write.table.type",
> "MERGE_ON_READ").mode(Append).save(basePath)
> //exit
> set_spark 3.1
> hudi_spark_shell -p -v 0.14.1
> spark.read.format("hudi").option("hoodie.metadata.enable",
> "true").option("hoodie.enable.data.skipping",
> "true").option("hoodie.metadata.index.column.stats.enable",
> "true").load("/tmp/trips_table").filter("fare > 100").show(100,false) {code}
> Command for running 0.14.1 with Spark 3.1 using mvn package:
> {code:java}
> /Users/jon/Documents/sparkroot/spark-3.1.3-bin-hadoop3.2/bin/spark-shell
> --packages org.apache.hudi:hudi-spark3.1-bundle_2.12:0.14.1 --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' --conf
> 'spark.sql.catalogImplementation=in-memory' {code}
> Command for running with current master on Spark 3.5:
> {code:java}
> /Users/jon/Documents/sparkroot/spark-3.5.2-bin-hadoop3/bin/spark-shell --jars
> /Users/jon/git/hudi-versions/current/spark3.5/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.1.0-SNAPSHOT.jar
> --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' --conf
> 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
> --conf 'spark.sql.catalogImplementation=in-memory' {code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)