Repository: spark Updated Branches: refs/heads/branch-2.4 313a1f0a7 -> f575616db
[SPARK-25859][ML] add scala/java/python example and doc for PrefixSpan ## What changes were proposed in this pull request? add scala/java/python example and doc for PrefixSpan in branch 2.4 ## How was this patch tested? Manually tested Author: Huaxin Gao <huax...@us.ibm.com> Closes #22863 from huaxingao/mydocbranch. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f575616d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f575616d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f575616d Branch: refs/heads/branch-2.4 Commit: f575616db69bcca9edf2a3dfe234003ab4c71b30 Parents: 313a1f0 Author: Huaxin Gao <huax...@us.ibm.com> Authored: Sat Oct 27 15:14:29 2018 -0700 Committer: Felix Cheung <felixche...@apache.org> Committed: Sat Oct 27 15:14:29 2018 -0700 ---------------------------------------------------------------------- docs/ml-frequent-pattern-mining.md | 46 +++++++++++++ .../examples/ml/JavaPrefixSpanExample.java | 68 ++++++++++++++++++++ .../src/main/python/ml/prefixspan_example.py | 48 ++++++++++++++ .../spark/examples/ml/PrefixSpanExample.scala | 62 ++++++++++++++++++ 4 files changed, 224 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/f575616d/docs/ml-frequent-pattern-mining.md ---------------------------------------------------------------------- diff --git a/docs/ml-frequent-pattern-mining.md b/docs/ml-frequent-pattern-mining.md index 81634de..c0928ab 100644 --- a/docs/ml-frequent-pattern-mining.md +++ b/docs/ml-frequent-pattern-mining.md @@ -85,3 +85,49 @@ Refer to the [R API docs](api/R/spark.fpGrowth.html) for more details. </div> </div> + +## PrefixSpan + +PrefixSpan is a sequential pattern mining algorithm described in +[Pei et al., Mining Sequential Patterns by Pattern-Growth: The +PrefixSpan Approach](http://dx.doi.org/10.1109%2FTKDE.2004.77). 
We refer +the reader to the referenced paper for a formal definition of the sequential +pattern mining problem. + +`spark.ml`'s PrefixSpan implementation takes the following parameters: + +* `minSupport`: the minimum support required to be considered a frequent + sequential pattern. +* `maxPatternLength`: the maximum length of a frequent sequential + pattern. Any frequent pattern exceeding this length will not be + included in the results. +* `maxLocalProjDBSize`: the maximum number of items allowed in a + prefix-projected database before local iterative processing of the + projected database begins. This parameter should be tuned with respect + to the size of your executors. +* `sequenceCol`: the name of the sequence column in the dataset (default "sequence"). Rows with + nulls in this column are ignored. + +**Examples** + +<div class="codetabs"> + +<div data-lang="scala" markdown="1"> +Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.fpm.PrefixSpan) for more details. + +{% include_example scala/org/apache/spark/examples/ml/PrefixSpanExample.scala %} +</div> + +<div data-lang="java" markdown="1"> +Refer to the [Java API docs](api/java/org/apache/spark/ml/fpm/PrefixSpan.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java %} +</div> + +<div data-lang="python" markdown="1"> +Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.fpm.PrefixSpan) for more details. 
+ +{% include_example python/ml/prefixspan_example.py %} +</div> + +</div> http://git-wip-us.apache.org/repos/asf/spark/blob/f575616d/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java new file mode 100644 index 0000000..98ffd4f --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPrefixSpanExample.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +// $example on$ + +import org.apache.spark.ml.fpm.PrefixSpan; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.*; + +import java.util.Arrays; +import java.util.List; +// $example off$ + +/** + * An example demonstrating PrefixSpan. 
+ * Run with + * <pre> + * bin/run-example ml.JavaPrefixSpanExample + * </pre> + */ +public class JavaPrefixSpanExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaPrefixSpanExample") + .getOrCreate(); + + // $example on$ + // Each row holds one sequence, given as an ordered list of integer lists. + List<Row> data = Arrays.asList( + RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(3))), + RowFactory.create(Arrays.asList(Arrays.asList(1), Arrays.asList(3, 2), Arrays.asList(1,2))), + RowFactory.create(Arrays.asList(Arrays.asList(1, 2), Arrays.asList(5))), + RowFactory.create(Arrays.asList(Arrays.asList(6))) + ); + // Schema with a single "sequence" column holding an array of integer arrays. + StructType schema = new StructType(new StructField[]{ new StructField( + "sequence", new ArrayType(new ArrayType(DataTypes.IntegerType, true), true), + false, Metadata.empty()) + }); + Dataset<Row> sequenceDF = spark.createDataFrame(data, schema); + + PrefixSpan prefixSpan = new PrefixSpan().setMinSupport(0.5).setMaxPatternLength(5); + + // Find the frequent sequential patterns and show them. + prefixSpan.findFrequentSequentialPatterns(sequenceDF).show(); + // $example off$ + + spark.stop(); + } +} http://git-wip-us.apache.org/repos/asf/spark/blob/f575616d/examples/src/main/python/ml/prefixspan_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/prefixspan_example.py b/examples/src/main/python/ml/prefixspan_example.py new file mode 100644 index 0000000..88d1d41 --- /dev/null +++ b/examples/src/main/python/ml/prefixspan_example.py @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +An example demonstrating PrefixSpan. +Run with: + bin/spark-submit examples/src/main/python/ml/prefixspan_example.py +""" +# $example on$ +from pyspark.ml.fpm import PrefixSpan +# $example off$ +from pyspark.sql import Row, SparkSession + +if __name__ == "__main__": + spark = SparkSession\ + .builder\ + .appName("PrefixSpanExample")\ + .getOrCreate() + sc = spark.sparkContext + + # $example on$ + df = sc.parallelize([Row(sequence=[[1, 2], [3]]), + Row(sequence=[[1], [3, 2], [1, 2]]), + Row(sequence=[[1, 2], [5]]), + Row(sequence=[[6]])]).toDF() + + prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5, + maxLocalProjDBSize=32000000) + + # Find frequent sequential patterns. + prefixSpan.findFrequentSequentialPatterns(df).show() + # $example off$ + + spark.stop() http://git-wip-us.apache.org/repos/asf/spark/blob/f575616d/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala new file mode 100644 index 0000000..0a2d310 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/PrefixSpanExample.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml + +// scalastyle:off println + +// $example on$ +import org.apache.spark.ml.fpm.PrefixSpan +// $example off$ +import org.apache.spark.sql.SparkSession + +/** + * An example demonstrating PrefixSpan. + * Run with + * {{{ + * bin/run-example ml.PrefixSpanExample + * }}} + */ +object PrefixSpanExample { + + def main(args: Array[String]): Unit = { + val spark = SparkSession + .builder + .appName(s"${this.getClass.getSimpleName}") + .getOrCreate() + import spark.implicits._ + + // $example on$ + val smallTestData = Seq( + Seq(Seq(1, 2), Seq(3)), + Seq(Seq(1), Seq(3, 2), Seq(1, 2)), + Seq(Seq(1, 2), Seq(5)), + Seq(Seq(6))) + + val df = smallTestData.toDF("sequence") + // Find frequent sequential patterns and show them; show() returns Unit, so nothing is bound. + new PrefixSpan() + .setMinSupport(0.5) + .setMaxPatternLength(5) + .setMaxLocalProjDBSize(32000000) + .findFrequentSequentialPatterns(df) + .show() + // $example off$ + + spark.stop() + } +} +// scalastyle:on println --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org