dongjoon-hyun commented on a change in pull request #32090:
URL: https://github.com/apache/spark/pull/32090#discussion_r609844974
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala
##########
@@ -840,6 +840,90 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS
testMigration(fromTsType = "INT96", toTsType = "TIMESTAMP_MICROS")
testMigration(fromTsType = "TIMESTAMP_MICROS", toTsType = "INT96")
}
+
+  test("SPARK-34212 Parquet should read decimals correctly") {
+    def readParquet(schema: String, path: File): DataFrame = {
+      spark.read.schema(schema).parquet(path.toString)
+    }
+
+    withTempPath { path =>
+      // a is int-decimal (4 bytes), b is long-decimal (8 bytes), c is binary-decimal (16 bytes)
+      val df = sql("SELECT 1.0 a, CAST(1.23 AS DECIMAL(17, 2)) b, CAST(1.23 AS DECIMAL(36, 2)) c")
+      df.write.parquet(path.toString)
+
+      withAllParquetReaders {
+        // We can read the decimal parquet field with a larger precision, if scale is the same.
+        val schema = "a DECIMAL(9, 1), b DECIMAL(18, 2), c DECIMAL(38, 2)"
+        checkAnswer(readParquet(schema, path), df)
+      }
+
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
+        val schema1 = "a DECIMAL(3, 2), b DECIMAL(18, 3), c DECIMAL(37, 3)"
+        checkAnswer(readParquet(schema1, path), df)
+        val schema2 = "a DECIMAL(3, 0), b DECIMAL(18, 1), c DECIMAL(37, 1)"
+        checkAnswer(readParquet(schema2, path), Row(1, 1.2, 1.2))
+      }
+
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+        Seq("a DECIMAL(3, 2)", "b DECIMAL(18, 1)", "c DECIMAL(37, 1)").foreach { schema =>
+          val e = intercept[SparkException] {
+            readParquet(schema, path).collect()
+          }.getCause.getCause
+          assert(e.isInstanceOf[SchemaColumnConvertNotSupportedException])
+        }
+      }
+    }
+
+    // tests for parquet types without decimal metadata.
+    withTempPath { path =>
+      val df = sql(s"SELECT 1 a, 123456 b, ${Int.MaxValue.toLong * 10} c, CAST('1.2' AS BINARY) d")
+      df.write.parquet(path.toString)
+
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
+        checkAnswer(readParquet("a DECIMAL(3, 2)", path), sql("SELECT 1.00"))
+        checkAnswer(readParquet("b DECIMAL(3, 2)", path), Row(null))
+        checkAnswer(readParquet("b DECIMAL(11, 1)", path), sql("SELECT 123456.0"))
+        checkAnswer(readParquet("c DECIMAL(11, 1)", path), Row(null))
+        checkAnswer(readParquet("c DECIMAL(13, 0)", path), df.select("c"))
+        val e = intercept[SparkException] {
+          readParquet("d DECIMAL(3, 2)", path).collect()
+        }.getCause
+        assert(e.getMessage.contains("Please read this column/field as Spark BINARY type"))
+      }
+
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true") {
+        Seq("a DECIMAL(3, 2)", "c DECIMAL(18, 1)", "d DECIMAL(37, 1)").foreach { schema =>
+          val e = intercept[SparkException] {
+            readParquet(schema, path).collect()
+          }.getCause.getCause
+          assert(e.isInstanceOf[SchemaColumnConvertNotSupportedException])
+        }
+      }
+    }
+
+    // scale is 0
+    withTempPath { path =>
Review comment:
If you don't mind, please make a separate test case, @wangyum. The existing test case is already a little too long.
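For example, the scale-0 block could be pulled out into its own test, along these lines (just a rough sketch to illustrate the split, not the actual change: the test name is a placeholder, and readParquet would need to be shared or redefined since it is currently local to the existing test):

    test("SPARK-34212 Parquet should read decimals correctly: scale is 0") {
      def readParquet(schema: String, path: File): DataFrame = {
        spark.read.schema(schema).parquet(path.toString)
      }

      withTempPath { path =>
        // Illustrative data only; the real assertions would come from the scale-0
        // block being added in this PR.
        sql("SELECT CAST(123 AS DECIMAL(9, 0)) a").write.parquet(path.toString)

        withAllParquetReaders {
          checkAnswer(readParquet("a DECIMAL(9, 0)", path), Row(123))
        }
      }
    }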