[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-25 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r563867053



##
File path: core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala
##
@@ -98,4 +98,21 @@ class VersionUtilsSuite extends SparkFunSuite {
   }
 }
   }
+
+  test("SPARK-33212: retrieve major/minor/patch version parts") {
+assert(VersionUtils.majorMinorPatchVersion("3.2.2").contains((3, 2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2.4").contains((3, 2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2-SNAPSHOT").contains((3, 
2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2.4XXX").contains((3, 2, 
2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2").contains((3, 2, 0)))
+assert(VersionUtils.majorMinorPatchVersion("3").contains((3, 0, 0)))
+
+// illegal cases
+assert(VersionUtils.majorMinorPatchVersion("ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3X").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2-SNAPSHOT").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3-ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2.4XYZ").isEmpty)

Review comment:
   You can consider simplifying this a little like:
   ```
   Seq("ABC", "3X", ...).foreach { versionStr =>
 assert(VersionUtils.majorMinorPatchVersion(versionStr).isEmpty, s"version 
$versionStr")
   }
   ```

##
File path: 
sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
##
@@ -68,4 +69,22 @@ class HadoopVersionInfoSuite extends SparkFunSuite {
   Utils.deleteRecursively(ivyPath)
 }
   }
+
+  test("SPARK-33212: test supportHadoopShadedClient()") {
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.3"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2.1"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2-XYZ"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2.4-SNAPSHOT"))

Review comment:
   Maybe consider shortening like:
   ```
   Seq("3.2.2", "3.2.3", ...).foreach { versionStr =>
 assert(IsolatedClientLoader.supportHadoopShadedClient(versionStr), 
s"version $versionStr")
   }
   ```
   (and same for the negative cases)

##
File path: core/src/main/scala/org/apache/spark/util/VersionUtils.scala
##
@@ -63,4 +64,37 @@ private[spark] object VersionUtils {
   s" version string, but it could not find the major and minor version 
numbers.")
 }
   }
+
+  /**
+   * Retrieves the major, minor and patch parts from the input `version`. 
Returns `None` if the
+   * input is not of a valid format.

Review comment:
   I think we should mention that the minor/patch versions are filled in as 0 
if they're not found. This is different from the behavior of other methods in 
this class (e.g. `majorMinor` will give an error if the minor version is not present).

##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +88,40 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+VersionUtils.majorMinorPatchVersion(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }

[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-25 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r563867053



##
File path: core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala
##
@@ -98,4 +98,21 @@ class VersionUtilsSuite extends SparkFunSuite {
   }
 }
   }
+
+  test("SPARK-33212: retrieve major/minor/patch version parts") {
+assert(VersionUtils.majorMinorPatchVersion("3.2.2").contains((3, 2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2.4").contains((3, 2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2-SNAPSHOT").contains((3, 
2, 2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2.2.4XXX").contains((3, 2, 
2)))
+assert(VersionUtils.majorMinorPatchVersion("3.2").contains((3, 2, 0)))
+assert(VersionUtils.majorMinorPatchVersion("3").contains((3, 0, 0)))
+
+// illegal cases
+assert(VersionUtils.majorMinorPatchVersion("ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3X").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2-SNAPSHOT").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3-ABC").isEmpty)
+assert(VersionUtils.majorMinorPatchVersion("3.2.4XYZ").isEmpty)

Review comment:
   You can consider simplifying this a little like:
   ```
   Seq("ABC", "3X", ...).foreach { versionStr =>
 assert(VersionUtils.majorMinorPatchVersion(versionStr).isEmpty, s"version 
$versionStr")
   }
   ```

##
File path: 
sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
##
@@ -68,4 +69,22 @@ class HadoopVersionInfoSuite extends SparkFunSuite {
   Utils.deleteRecursively(ivyPath)
 }
   }
+
+  test("SPARK-33212: test supportHadoopShadedClient()") {
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.3"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2.1"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2-XYZ"))
+assert(IsolatedClientLoader.supportHadoopShadedClient("3.2.2.4-SNAPSHOT"))

Review comment:
   Maybe consider shortening like:
   ```
   Seq("3.2.2", "3.2.3", ...).foreach { versionStr =>
 assert(IsolatedClientLoader.supportHadoopShadedClient(versionStr), 
s"version $versionStr")
   }
   ```
   (and same for the negative cases)

##
File path: core/src/main/scala/org/apache/spark/util/VersionUtils.scala
##
@@ -63,4 +64,37 @@ private[spark] object VersionUtils {
   s" version string, but it could not find the major and minor version 
numbers.")
 }
   }
+
+  /**
+   * Retrieves the major, minor and patch parts from the input `version`. 
Returns `None` if the
+   * input is not of a valid format.

Review comment:
   I think we should mention that the minor/patch versions are filled in as 0 
if they're not found. This is different from the behavior of other methods in 
this class (e.g. `majorMinor` will give an error if the minor version is not present).

##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +88,40 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+VersionUtils.majorMinorPatchVersion(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }

[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-22 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r562924187



##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+getVersionParts(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }
+
+  def supportHadoopShadedClient(hadoopVersion: String): Boolean = {
+getVersionParts(hadoopVersion).exists {
+  case (3, 2, v) if v >= 2 => true
+  case _ => false

Review comment:
   Excellent idea on adding a compatibility test for the built-in Hadoop 
version!





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-22 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r562910327



##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+getVersionParts(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }
+
+  def supportHadoopShadedClient(hadoopVersion: String): Boolean = {
+getVersionParts(hadoopVersion).exists {
+  case (3, 2, v) if v >= 2 => true
+  case _ => false

Review comment:
   Interesting... I just worry about forgetting to update this if/when we 
bump the Hadoop version in the future and causing a regression. Has the 
`hadoop-aws` fix been targeted for Hadoop 3.3.1? If so, can we 
reasonably assume that 3.2.2+, 3.3.1+, and 3.4.0+ will have it?
   
   It seems you're more tied into what's happening in the Hadoop world than I 
am these days so I'll take your word in either direction. If we decide _not_ to 
future-proof it, can we create a follow-up JIRA to revisit it once some future 
release is out at which time we would be confident in putting a wildcard?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-22 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r562910327



##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+getVersionParts(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }
+
+  def supportHadoopShadedClient(hadoopVersion: String): Boolean = {
+getVersionParts(hadoopVersion).exists {
+  case (3, 2, v) if v >= 2 => true
+  case _ => false

Review comment:
   Interesting... I just worry about forgetting to update this if/when we 
bump the Hadoop version in the future and causing a regression. Has the 
`hadoop-aws` fix made it into Hadoop 3.3.1? If so, can we reasonably assume 
that 3.2.2+, 3.3.1+, and 3.4.0+ will have it?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-22 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r562903674



##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+getVersionParts(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 0, _) => Some(hive.v2_0)
+  case (2, 1, _) => Some(hive.v2_1)
+  case (2, 2, _) => Some(hive.v2_2)
+  case (2, 3, _) => Some(hive.v2_3)
+  case (3, 0, _) => Some(hive.v3_0)
+  case (3, 1, _) => Some(hive.v3_1)
+  case _ => None
+}.getOrElse {
   throw new UnsupportedOperationException(s"Unsupported Hive Metastore 
version ($version). " +
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
+}
+  }
+
+  def supportHadoopShadedClient(hadoopVersion: String): Boolean = {
+getVersionParts(hadoopVersion).exists {
+  case (3, 2, v) if v >= 2 => true
+  case _ => false

Review comment:
   maybe
   ```
   case (maj, _, _) if maj > 3 => true
   case (3, min, _) if min > 2 => true
   case (3, 2, patch) if patch >=2 => true
   ```
   Seems like we can reasonably assume that future versions of Hadoop will 
support the shaded client?

##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" =>
-  hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {

Review comment:
   Seems a good improvement to me, that table is pretty unsightly as-is.

##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -88,31 +89,71 @@ private[hive] object IsolatedClientLoader extends Logging {
   barrierPrefixes = barrierPrefixes)
   }
 
-  def hiveVersion(version: String): HiveVersion = version match {
-case "12" | "0.12" | "0.12.0" => hive.v12
-case "13" | "0.13" | "0.13.0" | "0.13.1" => hive.v13
-case "14" | "0.14" | "0.14.0" => hive.v14
-case "1.0" | "1.0.0" | "1.0.1" => hive.v1_0
-case "1.1" | "1.1.0" | "1.1.1" => hive.v1_1
-case "1.2" | "1.2.0" | "1.2.1" | "1.2.2" => hive.v1_2
-case "2.0" | "2.0.0" | "2.0.1" => hive.v2_0
-case "2.1" | "2.1.0" | "2.1.1" => hive.v2_1
-case "2.2" | "2.2.0" => hive.v2_2
-case "2.3" | "2.3.0" | "2.3.1" | "2.3.2" | "2.3.3" | "2.3.4" | "2.3.5" | 
"2.3.6" | "2.3.7" |
- "2.3.8" => hive.v2_3
-case "3.0" | "3.0.0" => hive.v3_0
-case "3.1" | "3.1.0" | "3.1.1" | "3.1.2" => hive.v3_1
-case version =>
+  def hiveVersion(version: String): HiveVersion = {
+getVersionParts(version).flatMap {
+  case (12, _, _) | (0, 12, _) => Some(hive.v12)
+  case (13, _, _) | (0, 13, _) => Some(hive.v13)
+  case (14, _, _) | (0, 14, _) => Some(hive.v14)
+  case (1, 0, _) => Some(hive.v1_0)
+  case (1, 1, _) => Some(hive.v1_1)
+  case (1, 2, _) => Some(hive.v1_2)
+  case (2, 

[GitHub] [spark] xkrogen commented on a change in pull request #31203: [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check

2021-01-15 Thread GitBox


xkrogen commented on a change in pull request #31203:
URL: https://github.com/apache/spark/pull/31203#discussion_r558714139



##
File path: 
sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
##
@@ -107,12 +107,17 @@ private[hive] object IsolatedClientLoader extends Logging 
{
 s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
   }
 
+  def supportHadoopShadedClient(hadoopVersion: String): Boolean = 
hadoopVersion match {
+case "3.2.2" => true

Review comment:
   Maintaining a list of versions is probably going to become a pain, is 
there a semantic version comparison function hanging around somewhere we can 
use to verify `isMoreRecentVersion(hadoopVersion, "3.2.2")` ?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org