HyukjinKwon commented on a change in pull request #29703:
URL: https://github.com/apache/spark/pull/29703#discussion_r489937250



##########
File path: python/pyspark/install.py
##########
@@ -0,0 +1,170 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+import re
+import tarfile
+import traceback
+import urllib.request
+from shutil import rmtree
+# NOTE: do not import pyspark here because this module is used in
+# setup.py, where PySpark is assumed not to be importable yet.
+
+DEFAULT_HADOOP = "hadoop3.2"
+DEFAULT_HIVE = "hive2.3"
+SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
+SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
+UNSUPPORTED_COMBINATIONS = [
+    ("without-hadoop", "hive1.2"),
+    ("hadoop3.2", "hive1.2"),
+]
+
+
+def checked_package_name(spark_version, hadoop_version, hive_version):
+    if hive_version == "hive1.2":
+        return "%s-bin-%s-%s" % (spark_version, hadoop_version, hive_version)
+    else:
+        return "%s-bin-%s" % (spark_version, hadoop_version)
+
+
+def checked_versions(spark_version, hadoop_version, hive_version):
+    """
+    Check the valid combinations of supported versions in Spark distributions.
+
+    :param spark_version: Spark version. It should be X.X.X such as '3.0.0' or 'spark-3.0.0'.
+    :param hadoop_version: Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.
+        'without' and 'without-hadoop' are supported as special keywords for the
+        Hadoop-free distribution.
+    :param hive_version: Hive version. It should be X.X such as '1.2' or 'hive1.2'.
+
+    :return: a tuple of the fully-qualified versions of Spark, Hadoop and Hive,
+        for example, ('spark-3.0.0', 'hadoop3.2', 'hive2.3').
+    """
+    if re.match("^[0-9]+\\.[0-9]+\\.[0-9]+$", spark_version):
+        spark_version = "spark-%s" % spark_version
+    if not spark_version.startswith("spark-"):
+        raise RuntimeError(
+            "Spark version should start with 'spark-' prefix; however, "
+            "got %s" % spark_version)
+
+    if hadoop_version == "without":
+        hadoop_version = "without-hadoop"

Review comment:
       It is verified later by `if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:` below. There are test cases here:
       https://github.com/apache/spark/pull/29703/files/033a33ee515b95342e8c5a74e63054d915661579#diff-e23af4eb5cc3bf6af4bc26cb801b7e84R69
       and
       https://github.com/apache/spark/pull/29703/files/033a33ee515b95342e8c5a74e63054d915661579#diff-e23af4eb5cc3bf6af4bc26cb801b7e84R88
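
       For reference, a minimal sketch of that later validation, assuming only the constants shown in the diff above (the `validate` helper name and the exact error messages are illustrative, not the PR's code):

```python
# Hedged sketch of the validation the comment refers to; `validate` is a
# hypothetical helper and the messages are assumptions, not the PR's code.
SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]
SUPPORTED_HIVE_VERSIONS = ["hive1.2", "hive2.3"]
UNSUPPORTED_COMBINATIONS = [
    ("without-hadoop", "hive1.2"),
    ("hadoop3.2", "hive1.2"),
]


def validate(hadoop_version, hive_version):
    # Reject Hadoop builds Spark does not distribute.
    if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:
        raise RuntimeError("Hadoop version %s is not supported." % hadoop_version)
    # Reject Hive builds Spark does not distribute.
    if hive_version not in SUPPORTED_HIVE_VERSIONS:
        raise RuntimeError("Hive version %s is not supported." % hive_version)
    # Reject combinations with no published binary package.
    if (hadoop_version, hive_version) in UNSUPPORTED_COMBINATIONS:
        raise RuntimeError(
            "Hive %s does not work with Hadoop %s." % (hive_version, hadoop_version))
```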
   
   Users can also specify the fully-qualified Hadoop and Hive versions such as `hadoop3.2` and `hive2.3`, but I didn't document this. These keywords are ported from SparkR's `SparkR::install.spark`. A hedged usage sketch follows below.
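
       To make the keyword handling concrete, here is an illustrative sketch based on the docstring and the excerpt above (prefixing of the short forms such as '2.7' and '1.2' is implied by the docstring rather than shown in this hunk):

```python
# Illustrative calls only; the return values follow the docstring and the
# code visible in the excerpt above.
checked_versions("3.0.0", "hadoop2.7", "hive2.3")
# -> ("spark-3.0.0", "hadoop2.7", "hive2.3")     'spark-' prefix is added

checked_versions("spark-3.0.0", "without", "hive2.3")
# -> ("spark-3.0.0", "without-hadoop", "hive2.3")     'without' is an alias

# Per checked_package_name in the diff, only hive1.2 builds embed the Hive
# version in the package name.
checked_package_name("spark-3.0.0", "hadoop2.7", "hive1.2")
# -> "spark-3.0.0-bin-hadoop2.7-hive1.2"
checked_package_name("spark-3.0.0", "hadoop2.7", "hive2.3")
# -> "spark-3.0.0-bin-hadoop2.7"
```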



