sunchao commented on a change in pull request #34715:
URL: https://github.com/apache/spark/pull/34715#discussion_r760735005
##########
File path: python/pyspark/install.py
##########
@@ -83,7 +86,23 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(SUPPORTED_HADOOP_VERSIONS))
         )
-    return spark_version, hadoop_version, hive_version
+    return spark_version, convert_old_hadoop_version(spark_version, hadoop_version), hive_version
+
+
+def convert_old_hadoop_version(spark_version, hadoop_version):
+    # check if Spark version < 3.2, if so, convert hadoop3 to hadoop3.2 and hadoop2 to hadoop2.7
+    version_dict = {
+        "hadoop3": "hadoop3.2",
+        "hadoop2": "hadoop2.7",
+        "without": "without",
+        "without-hadoop": "without-hadoop",
+    }
+    spark_version_parts = re.search("^spark-([0-9]+)\\.([0-9]+)\\.[0-9]+$", spark_version)
+    spark_major_version = int(spark_version_parts.group(1))
+    spark_minor_version = int(spark_version_parts.group(2))
+    if spark_major_version < 3 or (spark_major_version >= 3 and spark_minor_version <= 2):
Review comment:
nit: I think it should be `spark_major_version == 3`? It shouldn't apply to a Spark version like `4.2`.
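For illustration, a minimal sketch of the guard this suggests (the `is_old_hadoop_name` helper is hypothetical, not from the PR; the parsing mirrors the diff above):

```python
import re

def is_old_hadoop_name(spark_version):
    # Parse "spark-X.Y.Z" into major/minor, as the diff above does.
    m = re.search(r"^spark-([0-9]+)\.([0-9]+)\.[0-9]+$", spark_version)
    major, minor = int(m.group(1)), int(m.group(2))
    # Only Spark 2.x and 3.0-3.2 shipped the old "hadoop2.7"/"hadoop3.2"
    # names, so the second clause should pin `major == 3`: with `>= 3`,
    # a future release like 4.2 would wrongly match as well.
    return major < 3 or (major == 3 and minor <= 2)

# e.g. is_old_hadoop_name("spark-3.2.0") -> True
#      is_old_hadoop_name("spark-4.2.0") -> False
```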
##########
File path:
sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala
##########
@@ -98,7 +98,9 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils {
       mirrors.distinct :+ "https://archive.apache.org/dist" :+ PROCESS_TABLES.releaseMirror
     logInfo(s"Trying to download Spark $version from $sites")
     for (site <- sites) {
-      val filename = if (version.startsWith("3")) {
+      val filename = if (version.startsWith("3.3")) {
Review comment:
maybe we should make this more general: we don't want to update this every time we bump the Spark minor version, like 3.4.
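One general option, sketched in Python for brevity (the file above is Scala, and the `uses_new_artifact_name` helper is hypothetical, not from the PR): compare the parsed (major, minor) pair against a single cutoff instead of string-matching a specific release.

```python
def uses_new_artifact_name(version):
    # Compare (major, minor) numerically rather than matching the literal
    # prefix "3.3", so 3.4, 4.0, etc. keep working without edits here.
    major, minor = (int(p) for p in version.split(".")[:2])
    return (major, minor) >= (3, 3)
```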
##########
File path:
resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala
##########
@@ -217,7 +217,7 @@ class ClientSuite extends SparkFunSuite with Matchers {
   }
   test("specify a more specific type for the application") {
-    // TODO (SPARK-31733) Make this test case pass with Hadoop-3.2
+    // TODO (SPARK-31733) Make this test case pass with Hadoop-3
Review comment:
nit: maybe `Hadoop-3` -> `hadoop-3`
##########
File path: dev/test-dependencies.sh
##########
@@ -34,8 +34,8 @@ HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkubernetes -Pyarn -Phive \
     -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud"
 MVN="build/mvn"
 HADOOP_HIVE_PROFILES=(
-    hadoop-2.7-hive-2.3
-    hadoop-3.2-hive-2.3
+    hadoop-2-hive-2.3
Review comment:
unrelated, but the `hive-2.3` part seems redundant, since we don't support an alternative Hive profile right now.
##########
File path: pom.xml
##########
@@ -3334,7 +3334,7 @@
-->
Review comment:
there are a few more places that mention `Hadoop-3.2`; better to replace them too.
##########
File path: python/docs/source/getting_started/install.rst
##########
@@ -55,27 +55,27 @@ For PySpark with/without a specific Hadoop version, you can install it by using
 .. code-block:: bash
-    PYSPARK_HADOOP_VERSION=2.7 pip install pyspark
+    PYSPARK_HADOOP_VERSION=2 pip install pyspark
-The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
+The default distribution uses Hadoop 3 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
Review comment:
not sure if we should be more accurate and say `The default distribution uses Hadoop 3.3 ..`
##########
File path: dev/run-tests-jenkins.py
##########
@@ -170,10 +170,10 @@ def main():
     if "test-maven" in ghprb_pull_title:
         os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven"
     # Switch the Hadoop profile based on the PR title:
-    if "test-hadoop2.7" in ghprb_pull_title:
-        os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.7"
-    if "test-hadoop3.2" in ghprb_pull_title:
-        os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop3.2"
+    if "test-hadoop2" in ghprb_pull_title:
+        os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2"
Review comment:
do we need to update the infra code for Jenkins CI somewhere?
##########
File path: docs/building-spark.md
##########
@@ -79,9 +79,9 @@ Example:
./build/mvn -Pyarn -Dhadoop.version=3.3.0 -DskipTests clean package
-If you want to build with Hadoop 2.x, enable hadoop-2.7 profile:
+If you want to build with Hadoop 2.x, enable hadoop-2 profile:
Review comment:
maybe use monospace font for `hadoop-2`.
##########
File path: python/pyspark/install.py
##########
@@ -83,7 +86,23 @@ def checked_versions(spark_version, hadoop_version, hive_version):
             "one of [%s]" % (hive_version, ", ".join(SUPPORTED_HADOOP_VERSIONS))
         )
-    return spark_version, hadoop_version, hive_version
+    return spark_version, convert_old_hadoop_version(spark_version, hadoop_version), hive_version
+
+
+def convert_old_hadoop_version(spark_version, hadoop_version):
+    # check if Spark version < 3.2, if so, convert hadoop3 to hadoop3.2 and hadoop2 to hadoop2.7
Review comment:
nit: `<=`
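For example, the corrected comment could read (one possible rewording, not from the PR):

```python
# If the Spark version is <= 3.2 (i.e. 2.x, 3.0, 3.1, 3.2), map the new
# "hadoop3"/"hadoop2" names back to the old "hadoop3.2"/"hadoop2.7" names.
```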
##########
File path: resource-managers/kubernetes/integration-tests/README.md
##########
@@ -17,9 +17,9 @@ To run tests with Java 11 instead of Java 8, use `--java-image-tag` to specify t
./dev/dev-run-integration-tests.sh --java-image-tag 11-jre-slim
-To run tests with Hadoop 2.7 instead of Hadoop 3.2, use `--hadoop-profile`.
+To run tests with Hadoop 2 instead of Hadoop 3, use `--hadoop-profile`.
Review comment:
nit: maybe `To run tests with Hadoop 2.x instead of Hadoop 3.x`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]