cloud-fan commented on a change in pull request #31281:
URL: https://github.com/apache/spark/pull/31281#discussion_r563478821
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala
##########
@@ -30,14 +35,34 @@ object PartitioningUtils {
*/
def normalizePartitionSpec[T](
partitionSpec: Map[String, T],
- partColNames: Seq[String],
+ partCols: StructType,
tblName: String,
resolver: Resolver): Map[String, T] = {
val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) =>
- val normalizedKey = partColNames.find(resolver(_, key)).getOrElse {
throw new AnalysisException(s"$key is not a valid partition column in table $tblName.")
+ val normalizedFiled = CharVarcharUtils.getRawSchema(partCols)
Review comment:
nit: we don't need to call `getRawSchema` in the loop body. We can do
`val partSchema = ...` at the beginning.
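For illustration, a minimal sketch of the suggested hoisting (the revised hunk quoted below does exactly this); names follow the diff, and the value-normalization part is elided:
```
def normalizePartitionSpec[T](
    partitionSpec: Map[String, T],
    partCols: StructType,
    tblName: String,
    resolver: Resolver): Map[String, T] = {
  // Hoisted out of the loop: compute the raw (char/varchar-preserving) schema once.
  val partSchema = CharVarcharUtils.getRawSchema(partCols)
  partitionSpec.map { case (key, value) =>
    val field = partSchema.find(f => resolver(f.name, key)).getOrElse {
      throw new AnalysisException(s"$key is not a valid partition column in table $tblName.")
    }
    field.name -> value // value normalization elided here
  }
}
```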
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala
##########
@@ -30,14 +35,33 @@ object PartitioningUtils {
*/
def normalizePartitionSpec[T](
partitionSpec: Map[String, T],
- partColNames: Seq[String],
+ partCols: StructType,
tblName: String,
resolver: Resolver): Map[String, T] = {
+ val rawSchema = CharVarcharUtils.getRawSchema(partCols)
val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) =>
- val normalizedKey = partColNames.find(resolver(_, key)).getOrElse {
+ val normalizedFiled = rawSchema.find(f => resolver(f.name, key)).getOrElse {
throw new AnalysisException(s"$key is not a valid partition column in table $tblName.")
}
- normalizedKey -> value
+
+ val normalizedVal = normalizedFiled.dataType match {
+ case CharType(len) if value != null && value != DEFAULT_PARTITION_NAME =>
+ val v = value match {
+ case Some(v) => Some(charTypeWriteSideCheck(v.toString, len))
+ case None => None
+ case other => charTypeWriteSideCheck(other.toString, len)
Review comment:
I think we will never reach this branch. We can either fail here, or
return `other` as it is.
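Sketched with the names from the diff, the two options look like:
```
val v = value match {
  case Some(v) => Some(charTypeWriteSideCheck(v.toString, len))
  case None => None
  // Option 1: fail fast, since this branch should be unreachable.
  case other =>
    throw new IllegalStateException(s"Unexpected partition value: $other")
  // Option 2: or simply pass the value through unchanged:
  // case other => other
}
```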
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala
##########
@@ -30,14 +35,33 @@ object PartitioningUtils {
*/
def normalizePartitionSpec[T](
partitionSpec: Map[String, T],
- partColNames: Seq[String],
+ partCols: StructType,
tblName: String,
resolver: Resolver): Map[String, T] = {
+ val rawSchema = CharVarcharUtils.getRawSchema(partCols)
val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) =>
- val normalizedKey = partColNames.find(resolver(_, key)).getOrElse {
+ val normalizedFiled = rawSchema.find(f => resolver(f.name, key)).getOrElse {
throw new AnalysisException(s"$key is not a valid partition column in table $tblName.")
}
- normalizedKey -> value
+
+ val normalizedVal = normalizedFiled.dataType match {
+ case CharType(len) if value != null && value != DEFAULT_PARTITION_NAME =>
+ val v = value match {
+ case Some(v) => Some(charTypeWriteSideCheck(v.toString, len))
+ case None => None
+ case other => charTypeWriteSideCheck(other.toString, len)
+ }
+ v.asInstanceOf[T]
+ case VarcharType(len) if value != null && value != DEFAULT_PARTITION_NAME =>
+ val v = value match {
+ case Some(v) => Some(varcharTypeWriteSideCheck(v.toString, len))
+ case None => None
+ case other => varcharTypeWriteSideCheck(other.toString, len)
Review comment:
ditto
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
##########
@@ -465,13 +465,13 @@ case class AlterTableAddPartitionCommand(
override def run(sparkSession: SparkSession): Seq[Row] = {
val catalog = sparkSession.sessionState.catalog
- val table = catalog.getTableMetadata(tableName)
+ val table = catalog.getTableRawMetadata(tableName)
Review comment:
do we need to make this change?
`PartitioningUtils.normalizePartitionSpec` will restore the raw schema of
partition columns.
##########
File path: sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
##########
@@ -521,19 +521,19 @@ case class AlterTableRenamePartitionCommand(
override def run(sparkSession: SparkSession): Seq[Row] = {
val catalog = sparkSession.sessionState.catalog
- val table = catalog.getTableMetadata(tableName)
+ val table = catalog.getTableRawMetadata(tableName)
Review comment:
ditto
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
}
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ val e1 = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e1.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
Review comment:
nit: we should test that we can always drop partition with `c='a'`.
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
Review comment:
I think we already tested it in the length check write tests.
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
}
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ val e1 = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
Review comment:
ditto
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
}
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ val e1 = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e1.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+
+ val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
+ assert(e2.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ val e3 = intercept[RuntimeException](sql("ALTER TABLE t ADD PARTITION(c='abcdef')"))
+ assert(e3.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
- sql("ALTER TABLE t DROP PARTITION(c='a')")
+ test("varchar type values length check: partitioned columns dynamic") {
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ checkColType(spark.table("t").schema(1), VarcharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
Review comment:
ditto
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
}
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ val e1 = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e1.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+
+ val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
+ assert(e2.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ val e3 = intercept[RuntimeException](sql("ALTER TABLE t ADD PARTITION(c='abcdef')"))
+ assert(e3.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
- sql("ALTER TABLE t DROP PARTITION(c='a')")
+ test("varchar type values length check: partitioned columns dynamic") {
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ checkColType(spark.table("t").schema(1), VarcharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
Review comment:
how about ADD PARTITION? can we have one test case for ADD/RENAME/DROP
PARTITION with too-long partition values?
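One possible shape for such a combined test, reusing this suite's helpers (a sketch; a later revision of the diff adds a very similar test):
```
test("oversize char/varchar partition values: ADD/RENAME/DROP") {
  withTable("t") {
    sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
    // ADD and DROP go through the same partition-spec normalization path.
    Seq("ADD", "DROP").foreach { op =>
      val e = intercept[RuntimeException](sql(s"ALTER TABLE t $op PARTITION(c='abcdef')"))
      assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
    }
    // RENAME must be checked on both the source and the target spec.
    val e = intercept[RuntimeException] {
      sql("ALTER TABLE t PARTITION (c='1') RENAME TO PARTITION (c='abcdef')")
    }
    assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
  }
}
```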
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,109 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
+
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
}
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ val e1 = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e1.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+
+ val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
+ assert(e2.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ val e3 = intercept[RuntimeException](sql("ALTER TABLE t ADD PARTITION(c='abcdef')"))
+ assert(e3.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
- sql("ALTER TABLE t DROP PARTITION(c='a')")
+ test("varchar type values length check: partitioned columns dynamic") {
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ checkColType(spark.table("t").schema(1), VarcharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[SparkException](sql("INSERT OVERWRITE t VALUES ('1', 'abcdef')"))
+ assert(e.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
+ assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
}
+ test("varchar type values length check: partitioned columns of other types") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ Seq(1, 10, 100, 1000, 10000).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', $v)")
+ sql(s"ALTER TABLE t DROP PARTITION(c=$v)")
+ checkAnswer(spark.table("t"), Nil)
+ }
+
+ val e1 = intercept[SparkException](sql(s"INSERT OVERWRITE t VALUES ('1', 100000)"))
+ assert(e1.getCause.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+
+ val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='100000')"))
Review comment:
this is not "other types"
##########
File path: sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
##########
@@ -988,7 +989,15 @@ private[hive] class HiveClientImpl(
private[hive] object HiveClientImpl extends Logging {
/** Converts the native StructField to Hive's FieldSchema. */
def toHiveColumn(c: StructField): FieldSchema = {
- val typeString = HiveVoidType.replaceVoidType(c.dataType).catalogString
+ val typeString = if (c.metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)) {
Review comment:
where do we hide the char/varchar types in the schema before reaching here?
##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala
##########
@@ -50,6 +51,35 @@ class HiveCharVarcharTestSuite extends CharVarcharTestSuite with TestHiveSinglet
assert(rest.contains("CHAR(5)"))
}
}
+
+ // TODO(SPARK-34203): Move this too super class when the ticket gets fixed
Review comment:
`too` -> `to`
##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala
##########
@@ -50,6 +51,35 @@ class HiveCharVarcharTestSuite extends CharVarcharTestSuite with TestHiveSinglet
assert(rest.contains("CHAR(5)"))
}
}
+
+ // TODO(SPARK-34203): Move this too super class when the ticket gets fixed
Review comment:
BTW, the JIRA ticket is about null partition values, while this test doesn't
cover null partitions at all.
##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/HiveCharVarcharTestSuite.scala
##########
@@ -50,6 +51,35 @@ class HiveCharVarcharTestSuite extends CharVarcharTestSuite with TestHiveSinglet
assert(rest.contains("CHAR(5)"))
}
}
+
+ // TODO(SPARK-34203): Move this too super class when the ticket gets fixed
+ test("char type values should be padded or trimmed: static partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t PARTITION (c ='$v') VALUES ('1')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+ }
+
+ test("SPARK-34192: Know issue of hive for tailing spaces") {
Review comment:
Can we put the code snippet in the JIRA as a way to reproduce the bug,
and remove this test?
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/util/PartitioningUtils.scala
##########
@@ -30,14 +35,33 @@ object PartitioningUtils {
*/
def normalizePartitionSpec[T](
partitionSpec: Map[String, T],
- partColNames: Seq[String],
+ partCols: StructType,
tblName: String,
resolver: Resolver): Map[String, T] = {
+ val rawSchema = CharVarcharUtils.getRawSchema(partCols)
val normalizedPartSpec = partitionSpec.toSeq.map { case (key, value) =>
- val normalizedKey = partColNames.find(resolver(_, key)).getOrElse {
+ val normalizedFiled = rawSchema.find(f => resolver(f.name, key)).getOrElse {
throw new AnalysisException(s"$key is not a valid partition column in table $tblName.")
}
- normalizedKey -> value
+
+ val normalizedVal = normalizedFiled.dataType match {
+ case CharType(len) if value != null && value != DEFAULT_PARTITION_NAME =>
+ val v = value match {
+ case Some(v) => Some(charTypeWriteSideCheck(v.toString, len))
Review comment:
nit:
```
case Some(str: String) => ...
case str: String => ...
case other => other
```
Then we don't need the `v.asInstanceOf[T]` at the end.
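Expanded into context, the suggestion reads roughly as below (a sketch; note the match result still widens to `Any`, so a cast may remain necessary when the result has to be a `T`):
```
val normalizedVal = normalizedFiled.dataType match {
  case CharType(len) if value != null && value != DEFAULT_PARTITION_NAME =>
    val v = value match {
      case Some(str: String) => Some(charTypeWriteSideCheck(str, len)) // Some-wrapped string
      case str: String => charTypeWriteSideCheck(str, len)             // bare string
      case other => other                                              // pass through untouched
    }
    v.asInstanceOf[T] // may still be needed to rebuild Map[String, T]
  case _ => value
}
```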
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
Review comment:
To be complete, we need to make sure we can drop the partition with `a`,
`a ` (one trailing space), ..., or `a    ` (four trailing spaces). That's too
many tests, and most users would just write `a` as they know the padding
semantics. I think we only need to test `ALTER TABLE t DROP PARTITION(c='a')` here.
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
Review comment:
To be complete, we need to make sure we can drop the partition with `a`,
`a ` (one trailing space), ..., or `a    ` (with 4 spaces). That's too many
tests, and most users would just write `a` as they know the padding semantics.
I think we only need to test `ALTER TABLE t DROP PARTITION(c='a')` here.
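A sketch of the reduced loop (a later revision of the test does exactly this): insert each padded variant but always drop through the canonical spec `c='a'`:
```
(0 to 5).map(n => "a" + " " * n).foreach { v =>
  sql(s"INSERT INTO t VALUES ('1', '$v')")
  // Spec normalization pads 'a' to CHAR(5), so it matches every inserted variant.
  sql("ALTER TABLE t DROP PARTITION(c='a')")
  checkAnswer(spark.table("t"), Nil)
}
```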
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
+
+ withTable("t") {
Review comment:
If we change
https://github.com/apache/spark/pull/31281/files#r563523300, we can remove this.
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ sql(s"ALTER TABLE t DROP PARTITION(c='a')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: static partitioned columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t PARTITION (c ='$v') VALUES ('1')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
Review comment:
ditto, we only need to test `PARTITION(c='a')`
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ sql(s"ALTER TABLE t DROP PARTITION(c='a')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: static partitioned columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t PARTITION (c ='$v') VALUES ('1')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+ }
+
+ test("oversize char/varchar values for alter table partition operations") {
+ Seq("CHAR(5)", "VARCHAR(5)").foreach { typ =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+ Seq("ADD", "DROP").foreach { op =>
+ val e = intercept[RuntimeException](sql(s"ALTER TABLE t $op PARTITION(c='abcdef')"))
+ assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+ val e1 = intercept[RuntimeException] {
+ sql(s"ALTER TABLE t PARTITION (c='abcdef') RENAME TO PARTITION (c='2')")
+ }
+ assert(e1.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ val e2 = intercept[RuntimeException] {
+ sql(s"ALTER TABLE t PARTITION (c='1') RENAME TO PARTITION (c='abcdef')")
+ }
+ assert(e2.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+ }
+ }
+
+ test("varchar type values length check: partitioned columns dynamic") {
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ checkColType(spark.table("t").schema(1), VarcharType(5))
+ }
+ }
- sql("ALTER TABLE t DROP PARTITION(c='a')")
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+
+ withTable("t") {
Review comment:
we can remove this, as it's already covered by other tests
##########
File path: sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
##########
@@ -37,31 +37,134 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
assert(CharVarcharUtils.getRawType(f.metadata) == Some(dt))
}
- test("char type values should be padded: top-level columns") {
+ test("char type values should be padded or trimmed: top-level columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+
+ sql("INSERT OVERWRITE t VALUES ('1', null)")
+ checkAnswer(spark.table("t"), Row("1", null))
+ }
+ }
+
+ test("char type values should be padded or trimmed: partitioned columns") {
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT OVERWRITE t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ sql(s"ALTER TABLE t DROP PARTITION(c='a')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
}
- test("char type values should be padded: partitioned columns") {
+ test("char type values should be padded or trimmed: static partitioned columns") {
withTable("t") {
sql(s"CREATE TABLE t(i STRING, c CHAR(5)) USING $format PARTITIONED BY (c)")
- sql("INSERT INTO t VALUES ('1', 'a')")
- checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
- checkColType(spark.table("t").schema(1), CharType(5))
+ (0 to 5).map(n => "a" + " " * n).foreach { v =>
+ sql(s"INSERT INTO t PARTITION (c ='$v') VALUES ('1')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * 4))
+ checkColType(spark.table("t").schema(1), CharType(5))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+ }
+
+ test("oversize char/varchar values for alter table partition operations") {
+ Seq("CHAR(5)", "VARCHAR(5)").foreach { typ =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+ Seq("ADD", "DROP").foreach { op =>
+ val e = intercept[RuntimeException](sql(s"ALTER TABLE t $op PARTITION(c='abcdef')"))
+ assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+ val e1 = intercept[RuntimeException] {
+ sql(s"ALTER TABLE t PARTITION (c='abcdef') RENAME TO PARTITION (c='2')")
+ }
+ assert(e1.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ val e2 = intercept[RuntimeException] {
+ sql(s"ALTER TABLE t PARTITION (c='1') RENAME TO PARTITION (c='abcdef')")
+ }
+ assert(e2.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
+ }
+ }
+ }
+
+ test("varchar type values length check: partitioned columns dynamic") {
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ checkColType(spark.table("t").schema(1), VarcharType(5))
+ }
+ }
- sql("ALTER TABLE t DROP PARTITION(c='a')")
+ (0 to 5).foreach { n =>
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val v = "a" + " " * n
+ sql(s"INSERT INTO t VALUES ('1', '$v')")
+ checkAnswer(spark.table("t"), Row("1", "a" + " " * math.min(n, 4)))
+ sql(s"ALTER TABLE t DROP PARTITION(c='$v')")
+ checkAnswer(spark.table("t"), Nil)
+ }
+ }
+
+ withTable("t") {
+ sql(s"CREATE TABLE t(i STRING, c VARCHAR(5)) USING $format PARTITIONED BY (c)")
+ val e = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c='abcdef')"))
+ assert(e.getMessage.contains("Exceeds char/varchar type length limitation: 5"))
sql("INSERT OVERWRITE t VALUES ('1', null)")
checkAnswer(spark.table("t"), Row("1", null))
}
}
+ test("varchar type values length check: partitioned columns of other types") {
Review comment:
can we test both char and varchar?
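A sketch of covering both types, following the `Seq("CHAR(5)", "VARCHAR(5)")` pattern already used in the alter-table test above:
```
Seq("CHAR(5)", "VARCHAR(5)").foreach { typ =>
  withTable("t") {
    sql(s"CREATE TABLE t(i STRING, c $typ) USING $format PARTITIONED BY (c)")
    // Non-string partition values are stringified and length-checked the same way.
    Seq(1, 10, 100, 1000, 10000).foreach { v =>
      sql(s"INSERT OVERWRITE t VALUES ('1', $v)")
      sql(s"ALTER TABLE t DROP PARTITION(c=$v)")
      checkAnswer(spark.table("t"), Nil)
    }
  }
}
```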
##########
File path: sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
##########
@@ -988,7 +989,15 @@ private[hive] class HiveClientImpl(
private[hive] object HiveClientImpl extends Logging {
/** Converts the native StructField to Hive's FieldSchema. */
def toHiveColumn(c: StructField): FieldSchema = {
- val typeString = HiveVoidType.replaceVoidType(c.dataType).catalogString
+ val typeString = if (c.metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)) {
+ // For Hive Serde, we still need to to restore the raw type for char and varchar type.
+ // When reading data in parquet, orc, or avro file format with string type for char,
+ // the tailing spaces may lost if we are not going to pad it.
+ c.metadata.getString(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)
Review comment:
how about adding a `getRawTypeString` to `CharVarcharUtils`, instead of
exposing `CHAR_VARCHAR_TYPE_STRING_METADATA_KEY`?
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala
##########
@@ -31,6 +31,8 @@ object CharVarcharUtils extends Logging {
private val CHAR_VARCHAR_TYPE_STRING_METADATA_KEY = "__CHAR_VARCHAR_TYPE_STRING"
+ def getRawTypeString: String = CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
Review comment:
it should be `def getRawTypeString(metadata: Metadata): Option[String] = ...`
##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharVarcharUtils.scala
##########
@@ -31,6 +31,8 @@ object CharVarcharUtils extends Logging {
private val CHAR_VARCHAR_TYPE_STRING_METADATA_KEY = "__CHAR_VARCHAR_TYPE_STRING"
+ def getRawTypeString: String = CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
Review comment:
and `getRawType` can call this method.
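A minimal sketch of the suggested pair, assuming `getRawType` keeps parsing the stored type string via `CatalystSqlParser.parseRawDataType` as it does today:
```
def getRawTypeString(metadata: Metadata): Option[String] = {
  if (metadata.contains(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY)) {
    Some(metadata.getString(CHAR_VARCHAR_TYPE_STRING_METADATA_KEY))
  } else {
    None
  }
}

// getRawType delegates, so the metadata key itself stays private.
def getRawType(metadata: Metadata): Option[DataType] = {
  getRawTypeString(metadata).map(CatalystSqlParser.parseRawDataType)
}
```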