Github user viirya commented on a diff in the pull request:
https://github.com/apache/spark/pull/21710#discussion_r203148676
--- Diff: R/pkg/R/mllib_fpm.R ---
@@ -154,3 +160,74 @@ setMethod("write.ml", signature(object = "FPGrowthModel", path = "character"),
function(object, path, overwrite = FALSE) {
write_internal(object, path, overwrite)
})
+
+#' PrefixSpan
+#'
+#' A parallel PrefixSpan algorithm to mine frequent sequential patterns.
+#' \code{spark.prefixSpan} returns an instance of PrefixSpan.
+#' \code{spark.findFrequentSequentialPatterns} returns a complete set of frequent sequential
+#' patterns.
+#' For more details, see
+#' \href{https://spark.apache.org/docs/latest/mllib-frequent-pattern-mining.html#prefixspan}{
+#' PrefixSpan}.
+#'
+#' @param minSupport Minimal support level.
+#' @param maxPatternLength Maximal pattern length.
+#' @param maxLocalProjDBSize Maximum number of items (including delimiters used in the internal
+#' storage format) allowed in a projected database before local
+#' processing.
+#' @param sequenceCol name of the sequence column in dataset.
+#' @param ... additional argument(s) passed to the method.
+#' @return \code{spark.prefixSpan} returns an instance of PrefixSpan
+#' @rdname spark.prefixSpan
+#' @name spark.prefixSpan
+#' @aliases spark.prefixSpan,ANY-method
+#' @examples
+#' \dontrun{
+#' df <- createDataFrame(list(list(list(list(1L, 2L), list(3L))),
+#' list(list(list(1L), list(3L, 2L), list(1L, 2L))),
+#' list(list(list(1L, 2L), list(5L))),
+#' list(list(list(6L)))), schema = c("sequence"))
+#' prefix_Span <- spark.prefixSpan(minSupport = 0.5, maxPatternLength = 5L,
+#' maxLocalProjDBSize = 32000000L)
+#' frequency <- spark.findFrequentSequentialPatterns(prefix_Span, df)
+#' showDF(frequency)
+#' }
+#' @note spark.prefixSpan since 2.4.0
+setMethod("spark.prefixSpan", signature(),
+ function(minSupport=0.1, maxPatternLength=10L,
+ maxLocalProjDBSize=32000000L, sequenceCol="sequence") {
+ if (!is.numeric(minSupport) || minSupport < 0) {
+ stop("minSupport should be a number with value >= 0.")
+ }
+ if (!is.integer(maxPatternLength) || maxPatternLength <= 0) {
+ stop("maxPatternLength should be a number with value > 0.")
+ }
+ if (!is.numeric(maxLocalProjDBSize) || maxLocalProjDBSize <= 0) {
+ stop("maxLocalProjDBSize should be a number with value > 0.")
+ }
+
+ jobj <- callJStatic("org.apache.spark.ml.r.PrefixSpanWrapper", "getPrefixSpan",
+ as.numeric(minSupport), as.integer(maxPatternLength),
+ as.numeric(maxLocalProjDBSize), as.character(sequenceCol))
+ new("PrefixSpan", jobj = jobj)
+ })
+
+# Find frequent sequential patterns.
+
+#' @param object a prefixSpan object.
+#' @param data A SparkDataFrame.
+#' @return A complete set of frequent sequential patterns in the input sequences of itemsets.
+#' The returned \code{SparkDataFrame} contains columns of sequence and corresponding
+#' frequency. The schema of it will be:
+#' \code{sequence: ArrayType(ArrayType(T))} (T is the item type)
+#' \code{freq: Long}
+#' @rdname spark.prefixSpan
--- End diff --
`findFrequentSequentialPatterns`?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]