git commit: [SPARK-3573][MLLIB] Make MLlib's Vector compatible with SQL's SchemaRDD
Repository: spark Updated Branches: refs/heads/master 04450d115 - 1a9c6cdda [SPARK-3573][MLLIB] Make MLlib's Vector compatible with SQL's SchemaRDD Register MLlib's Vector as a SQL user-defined type (UDT) in both Scala and Python. With this PR, we can easily map a RDD[LabeledPoint] to a SchemaRDD, and then select columns or save to a Parquet file. Examples in Scala/Python are attached. The Scala code was copied from jkbradley. ~~This PR contains the changes from #3068 . I will rebase after #3068 is merged.~~ marmbrus jkbradley Author: Xiangrui Meng m...@databricks.com Closes #3070 from mengxr/SPARK-3573 and squashes the following commits: 3a0b6e5 [Xiangrui Meng] organize imports 236f0a0 [Xiangrui Meng] register vector as UDT and provide dataset examples Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a9c6cdd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a9c6cdd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a9c6cdd Branch: refs/heads/master Commit: 1a9c6cddadebdc53d083ac3e0da276ce979b5d1f Parents: 04450d1 Author: Xiangrui Meng m...@databricks.com Authored: Mon Nov 3 22:29:48 2014 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Nov 3 22:29:48 2014 -0800 -- dev/run-tests | 2 +- .../src/main/python/mllib/dataset_example.py| 62 ++ .../spark/examples/mllib/DatasetExample.scala | 121 +++ mllib/pom.xml | 5 + .../org/apache/spark/mllib/linalg/Vectors.scala | 69 ++- .../spark/mllib/linalg/VectorsSuite.scala | 11 ++ python/pyspark/mllib/linalg.py | 50 python/pyspark/mllib/tests.py | 39 +- 8 files changed, 353 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1a9c6cdd/dev/run-tests -- diff --git a/dev/run-tests b/dev/run-tests index 0e9eefa..de607e4 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -180,7 +180,7 @@ CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS if [ -n $_SQL_TESTS_ONLY ]; then # This must be an array of individual arguments. 
Otherwise, having one long string #+ will be interpreted as a single test, which doesn't work. -SBT_MAVEN_TEST_ARGS=(catalyst/test sql/test hive/test) +SBT_MAVEN_TEST_ARGS=(catalyst/test sql/test hive/test mllib/test) else SBT_MAVEN_TEST_ARGS=(test) fi http://git-wip-us.apache.org/repos/asf/spark/blob/1a9c6cdd/examples/src/main/python/mllib/dataset_example.py -- diff --git a/examples/src/main/python/mllib/dataset_example.py b/examples/src/main/python/mllib/dataset_example.py new file mode 100644 index 000..540dae7 --- /dev/null +++ b/examples/src/main/python/mllib/dataset_example.py @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the License); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +An example of how to use SchemaRDD as a dataset for ML. 
Run with:: +bin/spark-submit examples/src/main/python/mllib/dataset_example.py + + +import os +import sys +import tempfile +import shutil + +from pyspark import SparkContext +from pyspark.sql import SQLContext +from pyspark.mllib.util import MLUtils +from pyspark.mllib.stat import Statistics + + +def summarize(dataset): +print "schema: %s" % dataset.schema().json() +labels = dataset.map(lambda r: r.label) +print "label average: %f" % labels.mean() +features = dataset.map(lambda r: r.features) +summary = Statistics.colStats(features) +print "features average: %r" % summary.mean() + +if __name__ == "__main__": +if len(sys.argv) > 2: +print >> sys.stderr, "Usage: dataset_example.py <libsvm file>" +exit(-1) +sc = SparkContext(appName="DatasetExample") +sqlCtx = SQLContext(sc) +if len(sys.argv) == 2: +input = sys.argv[1] +else: +input = "data/mllib/sample_libsvm_data.txt" +points = MLUtils.loadLibSVMFile(sc, input) +dataset0 =
git commit: [SPARK-3573][MLLIB] Make MLlib's Vector compatible with SQL's SchemaRDD
Repository: spark Updated Branches: refs/heads/branch-1.2 42d02db86 - 8395e8fbd [SPARK-3573][MLLIB] Make MLlib's Vector compatible with SQL's SchemaRDD Register MLlib's Vector as a SQL user-defined type (UDT) in both Scala and Python. With this PR, we can easily map a RDD[LabeledPoint] to a SchemaRDD, and then select columns or save to a Parquet file. Examples in Scala/Python are attached. The Scala code was copied from jkbradley. ~~This PR contains the changes from #3068 . I will rebase after #3068 is merged.~~ marmbrus jkbradley Author: Xiangrui Meng m...@databricks.com Closes #3070 from mengxr/SPARK-3573 and squashes the following commits: 3a0b6e5 [Xiangrui Meng] organize imports 236f0a0 [Xiangrui Meng] register vector as UDT and provide dataset examples (cherry picked from commit 1a9c6cddadebdc53d083ac3e0da276ce979b5d1f) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8395e8fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8395e8fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8395e8fb Branch: refs/heads/branch-1.2 Commit: 8395e8fbdf23bef286ec68a4bbadcc448b504c2c Parents: 42d02db Author: Xiangrui Meng m...@databricks.com Authored: Mon Nov 3 22:29:48 2014 -0800 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Nov 3 22:31:43 2014 -0800 -- dev/run-tests | 2 +- .../src/main/python/mllib/dataset_example.py| 62 ++ .../spark/examples/mllib/DatasetExample.scala | 121 +++ mllib/pom.xml | 5 + .../org/apache/spark/mllib/linalg/Vectors.scala | 69 ++- .../spark/mllib/linalg/VectorsSuite.scala | 11 ++ python/pyspark/mllib/linalg.py | 50 python/pyspark/mllib/tests.py | 39 +- 8 files changed, 353 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8395e8fb/dev/run-tests -- diff --git a/dev/run-tests b/dev/run-tests index 0e9eefa..de607e4 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -180,7 
+180,7 @@ CURRENT_BLOCK=$BLOCK_SPARK_UNIT_TESTS if [ -n $_SQL_TESTS_ONLY ]; then # This must be an array of individual arguments. Otherwise, having one long string #+ will be interpreted as a single test, which doesn't work. -SBT_MAVEN_TEST_ARGS=(catalyst/test sql/test hive/test) +SBT_MAVEN_TEST_ARGS=(catalyst/test sql/test hive/test mllib/test) else SBT_MAVEN_TEST_ARGS=(test) fi http://git-wip-us.apache.org/repos/asf/spark/blob/8395e8fb/examples/src/main/python/mllib/dataset_example.py -- diff --git a/examples/src/main/python/mllib/dataset_example.py b/examples/src/main/python/mllib/dataset_example.py new file mode 100644 index 000..540dae7 --- /dev/null +++ b/examples/src/main/python/mllib/dataset_example.py @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the License); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +#http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +An example of how to use SchemaRDD as a dataset for ML. 
Run with:: +bin/spark-submit examples/src/main/python/mllib/dataset_example.py + + +import os +import sys +import tempfile +import shutil + +from pyspark import SparkContext +from pyspark.sql import SQLContext +from pyspark.mllib.util import MLUtils +from pyspark.mllib.stat import Statistics + + +def summarize(dataset): +print "schema: %s" % dataset.schema().json() +labels = dataset.map(lambda r: r.label) +print "label average: %f" % labels.mean() +features = dataset.map(lambda r: r.features) +summary = Statistics.colStats(features) +print "features average: %r" % summary.mean() + +if __name__ == "__main__": +if len(sys.argv) > 2: +print >> sys.stderr, "Usage: dataset_example.py <libsvm file>" +exit(-1) +sc = SparkContext(appName="DatasetExample") +sqlCtx = SQLContext(sc) +if len(sys.argv) == 2: +input = sys.argv[1] +else: +