GitHub user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/21265#discussion_r192001814
--- Diff: python/pyspark/ml/fpm.py ---
@@ -243,3 +244,105 @@ def setParams(self, minSupport=0.3, minConfidence=0.8, itemsCol="items",
     def _create_model(self, java_model):
         return FPGrowthModel(java_model)
+
+
+class PrefixSpan(JavaParams):
+ """
+ .. note:: Experimental
+
+ A parallel PrefixSpan algorithm to mine frequent sequential patterns.
+ The PrefixSpan algorithm is described in J. Pei, et al., PrefixSpan:
Mining Sequential Patterns
+ Efficiently by Prefix-Projected Pattern Growth
+ (see <a href="http://doi.org/10.1109/ICDE.2001.914830">here</a>).
+ This class is not yet an Estimator/Transformer, use
:py:func:`findFrequentSequentialPatterns`
+ method to run the PrefixSpan algorithm.
+
+ @see <a
href="https://en.wikipedia.org/wiki/Sequential_Pattern_Mining">Sequential
Pattern Mining
+ (Wikipedia)</a>
+ .. versionadded:: 2.4.0
+
+ """
+
+    minSupport = Param(Params._dummy(), "minSupport", "The minimal support level of the " +
+                       "sequential pattern. Sequential patterns that appear more than " +
+                       "(minSupport * size-of-the-dataset) times will be output. Must be >= 0.",
+                       typeConverter=TypeConverters.toFloat)
+
+    maxPatternLength = Param(Params._dummy(), "maxPatternLength",
+                             "The maximal length of the sequential pattern. Must be > 0.",
+                             typeConverter=TypeConverters.toInt)
+
+    maxLocalProjDBSize = Param(Params._dummy(), "maxLocalProjDBSize",
+                               "The maximum number of items (including delimiters used in the " +
+                               "internal storage format) allowed in a projected database before " +
+                               "local processing. If a projected database exceeds this size, " +
+                               "another iteration of distributed prefix growth is run. " +
+                               "Must be > 0.",
+                               typeConverter=TypeConverters.toInt)
+
+    sequenceCol = Param(Params._dummy(), "sequenceCol", "The name of the sequence column in " +
+                        "the dataset; rows with nulls in this column are ignored.",
+                        typeConverter=TypeConverters.toString)
+
+    @keyword_only
+    def __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000,
+                 sequenceCol="sequence"):
+        """
+        __init__(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \
+                 sequenceCol="sequence")
+        """
+        super(PrefixSpan, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.fpm.PrefixSpan", self.uid)
+        self._setDefault(minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000,
+                         sequenceCol="sequence")
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.4.0")
+    def setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000,
+                  sequenceCol="sequence"):
+        """
+        setParams(self, minSupport=0.1, maxPatternLength=10, maxLocalProjDBSize=32000000, \
+                  sequenceCol="sequence")
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+ @since("2.4.0")
+ def findFrequentSequentialPatterns(self, dataset):
+ """
+ .. note:: Experimental
+ Finds the complete set of frequent sequential patterns in the
input sequences of itemsets.
+
+ :param dataset: A dataset or a dataframe containing a sequence
column which is
--- End diff --
There is no `Dataset` in PySpark.
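
For context, a minimal usage sketch of the API proposed in this diff, operating on a plain DataFrame. It assumes the class lands as written above (at the time of this review, `PrefixSpan` exists only on this PR's branch), and the four-row toy dataset is illustrative:

```python
from pyspark.sql import Row, SparkSession
from pyspark.ml.fpm import PrefixSpan  # proposed in this PR; not in a released PySpark yet

spark = SparkSession.builder.getOrCreate()

# Each row holds one sequence: a list of itemsets, each itemset a list of items.
df = spark.createDataFrame([
    Row(sequence=[[1, 2], [3]]),
    Row(sequence=[[1], [3, 2], [1, 2]]),
    Row(sequence=[[1, 2], [5]]),
    Row(sequence=[[6]]),
])

ps = PrefixSpan(minSupport=0.5, maxPatternLength=5)

# Runs PrefixSpan and returns a DataFrame of frequent patterns and their frequencies.
ps.findFrequentSequentialPatterns(df).show(truncate=False)
```

With four input sequences, `minSupport=0.5` puts the support threshold at 0.5 * 4 = 2 occurrences, per the `minSupport` description in the diff.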