This is an automated email from the ASF dual-hosted git repository.

rawkintrevo pushed a commit to branch pymahout-feature
in repository https://gitbox.apache.org/repos/asf/mahout.git

commit 35e43606701f5f6d756e4406d15f756f70e050b5
Author: Trevor Grant <[email protected]>
AuthorDate: Wed Nov 10 09:09:01 2021 -0600

    init pymahout
---
 .gitignore                      |  1 +
 pymahout/README.md              | 18 +++++++++++
 pymahout/environment/Dockerfile | 16 ++++++++++
 pymahout/tester.py              | 70 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+)

diff --git a/.gitignore b/.gitignore
index b89159c..5f02e2b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ bin/derby.log
 bin/metastore_db
 *jar
 *log
+**tgz
 website/_site
 website/Gemfile.lock
 website/.bundle
diff --git a/pymahout/README.md b/pymahout/README.md
new file mode 100644
index 0000000..da2e5b2
--- /dev/null
+++ b/pymahout/README.md
@@ -0,0 +1,18 @@
+
+## Here thar be dragons
+
+So the first mess to get through is getting all the components to play nice. To do that we need Java, Py4J, Spark, Mahout, et al. jiving together.
+
+See the Dockerfile in environment/. Build that image, then run the file in tester.py; it sets everything up and lets you make some random matrices (in-core).
+
+First step: I just wanted to get a first push up.
+
+1. Build Mahout, then copy the dependency-reduced jar as well as the core/hdfs/spark jars to environment/
+2. Download the Apache Spark 2.4.7 tgz, also to environment/
+3. Build the Docker image, something like `docker build -t rawkintrevo/pymahout .`
+4. `docker run -it rawkintrevo/pymahout python`
+5. Profit
+
+Good Mahouting,
+
+tg
diff --git a/pymahout/environment/Dockerfile b/pymahout/environment/Dockerfile
new file mode 100644
index 0000000..6fdabaf
--- /dev/null
+++ b/pymahout/environment/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.7
+
+ADD spark-2.4.7-bin-hadoop2.7.tgz /tmp/spark-2.4.7-bin-hadoop2.7.tgz
+
+RUN apt-get update && apt-get install default-jdk -y
+RUN pip install pyspark==2.4.5
+
+RUN mv /tmp/spark-2.4.7-bin-hadoop2.7.tgz/spark-2.4.7-bin-hadoop2.7 /opt/spark
+
+ADD apache-mahout-14.2-SNAPSHOT-dependency-reduced.jar /opt/spark/jars
+ADD mahout-core-14.2-SNAPSHOT-scala_2.11.jar /opt/spark/jars
+ADD mahout-spark-14.2-SNAPSHOT-scala_2.11.jar /opt/spark/jars
+ADD mahout-hdfs-14.2-SNAPSHOT.jar /opt/spark/jars
+
+ENV SPARK_HOME=/opt/spark
+ENV JAVA_HOME=/usr/lib/jvm/java-1.11.0-openjdk-amd64
diff --git a/pymahout/tester.py b/pymahout/tester.py
new file mode 100644
index 0000000..9f1b741
--- /dev/null
+++ b/pymahout/tester.py
@@ -0,0 +1,70 @@
+import pyspark
+
+version = "14.2-SNAPSHOT"
+spark_jars_packages = ','.join(['org.apache.mahout:mahout-core:14.1',
+                                'org.apache.mahout:mahout-spark:jar:scala-2.11:14.1'])
+
+jars = ['https://repo1.maven.org/maven2/org/apache/mahout/mahout-spark/14.1/mahout-spark-14.1-scala_2.11.jar',
+        'https://repo1.maven.org/maven2/org/apache/mahout/mahout-core/14.1/mahout-core-14.1-scala_2.11.jar']
+
+jars = ','.join(['file:///opt/spark/jars/mahout-core-14.2-SNAPSHOT-scala_2.11.jar',
+                 'file:///opt/spark/jars/apache-mahout-14.2-SNAPSHOT-dependency-reduced.jar',
+                 'file:///opt/spark/jars/mahout-spark-14.2-SNAPSHOT-scala_2.11.jar'])
+spark_conf = pyspark.SparkConf()
+spark_conf.setAll([
+    ('spark.kryo.referenceTracking', 'false'),
+    ('spark.kryo.registrator', 'org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator'),
+    ('spark.kryoserializer.buffer', '32'),
+    ('spark.kryoserializer.buffer.max', '600m'),
+    ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
+    # ('spark.jars.packages', spark_jars_packages),
+    ('spark.jars', jars)
+])
+
+sc = pyspark.SparkContext('local[*]', conf=spark_conf)
+
+jvm = sc._gateway.jvm
+
+m = sc._gateway.jvm.org.apache.mahout.math.Matrices().uniformGenerator(1).asFormatString()
+## ^^ Method does not exist
+
+"""
+in core
+"""
+m1 = sc._gateway.jvm.org.apache.mahout.math.Matrices().uniformView(3, 3, 1)
+print(m1.asFormatString())
+
+m2 = sc._gateway.jvm.org.apache.mahout.math.Matrices().uniformView(3, 3, 1)
+print(m2.asFormatString())
+
+print(m1.plus(m2).asFormatString())
+
+"""out of core"""
+
+# Returns the static object instance on the heap
+
+def ref_scala_object(object_name):
+    clazz = sc._gateway.jvm.java.lang.Class.forName(object_name + "$")
+    ff = clazz.getDeclaredField("MODULE$")
+    o = ff.get(None)
+    return o
+
+
+scala_none = ref_scala_object("scala.None")
+scala_none.getClass().getName()
+
+
+# sdc = sc._gateway.jvm.org.apache.mahout.sparkbindings.SparkDistributedContext(sc)
+"""
+Traceback (most recent call last):
+  File "<stdin>", line 1, in <module>
+  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1516, in __call__
+    [get_command_part(arg, self._pool) for arg in new_args])
+  File "/usr/local/lib/python3.7/site-packages/py4j/java_gateway.py", line 1516, in <listcomp>
+    [get_command_part(arg, self._pool) for arg in new_args])
+  File "/usr/local/lib/python3.7/site-packages/py4j/protocol.py", line 298, in get_command_part
+    command_part = REFERENCE_TYPE + parameter._get_object_id()
+AttributeError: 'SparkContext' object has no attribute '_get_object_id'
+"""
+
+ref_scala_object("org.apache.mahout.sparkbindings")
\ No newline at end of file
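
A note on checking the environment: before walking through tester.py, it can help to confirm that the driver JVM actually sees the Mahout jars baked into the image. The sketch below is not part of the commit and is untested; it assumes it is run inside the container started in step 4 of the README (so SPARK_HOME is /opt/spark and the jars sit in /opt/spark/jars), and the class names are taken from the jars the Dockerfile adds.

    import pyspark

    # Hypothetical smoke test, run inside `docker run -it rawkintrevo/pymahout python`.
    sc = pyspark.SparkContext('local[*]')
    jvm = sc._gateway.jvm

    # Class.forName throws (surfacing as a Py4JJavaError) if the Mahout jars under
    # /opt/spark/jars never made it onto the driver classpath, which is the usual
    # failure mode when the jar copy or SPARK_HOME wiring in the Dockerfile went wrong.
    print(jvm.java.lang.Class.forName("org.apache.mahout.math.DenseMatrix").getName())
    print(jvm.java.lang.Class.forName("org.apache.mahout.sparkbindings.SparkDistributedContext").getName())

    sc.stop()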
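
On the SparkDistributedContext failure captured at the bottom of tester.py: the AttributeError comes from handing the Python-side SparkContext to Py4J, which only knows how to pass primitives and references to objects that already live in the JVM. A possible next step, sketched below and untested, is to pass the underlying Scala SparkContext instead, which PySpark exposes as sc._jsc.sc(). This assumes Mahout's SparkDistributedContext constructor takes a plain org.apache.spark.SparkContext, and it continues from the sc created in tester.py.

    # Untested sketch: give Mahout the JVM-side SparkContext rather than the Python wrapper.
    # sc._jsc is the JavaSparkContext that PySpark holds; .sc() unwraps it to the Scala
    # org.apache.spark.SparkContext that SparkDistributedContext is assumed to expect.
    scala_sc = sc._jsc.sc()
    sdc = sc._gateway.jvm.org.apache.mahout.sparkbindings.SparkDistributedContext(scala_sc)
    print(sdc.getClass().getName())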
