Github user yanboliang commented on a diff in the pull request:
https://github.com/apache/spark/pull/10242#discussion_r54395661
--- Diff: python/pyspark/ml/clustering.py ---
@@ -291,6 +292,317 @@ def _create_model(self, java_model):
         return BisectingKMeansModel(java_model)
+class LDAModel(JavaModel):
+    """ A clustering model derived from the LDA method.
+
+    Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
+    Terminology:
+    - "word" = "term": an element of the vocabulary
+    - "token": instance of a term appearing in a document
+    - "topic": multinomial distribution over words representing some concept
+    References:
+    - Original LDA paper (journal version):
+      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+    """
+
+ @since("2.0.0")
+ def isDistributed(self):
+ """Indicates whether this instance is of type
DistributedLDAModel"""
+ return self._call_java("isDistributed")
+
+ @since("2.0.0")
+ def vocabSize(self):
+ """Vocabulary size (number of terms or terms in the vocabulary)"""
+ return self._call_java("vocabSize")
+
+ @since("2.0.0")
+ def topicsMatrix(self):
+ """Inferred topics, where each topic is represented by a
distribution over terms."""
+ return self._call_java("topicsMatrix")
+
+ @since("2.0.0")
+ def logLikelihood(self, dataset):
+ """Calculates a lower bound on the log likelihood of the entire
corpus."""
+ return self._call_java("logLikelihood", dataset)
+
+ @since("2.0.0")
+ def logPerplexity(self, dataset):
+ """Calculate an upper bound bound on perplexity. (Lower is
better.)"""
+ return self._call_java("logPerplexity", dataset)
+
+ @since("2.0.0")
+ def describeTopics(self, maxTermsPerTopic=10):
+ """Return the topics described by weighted terms.
+
+ WARNING: If vocabSize and k are large, this can return a large
object!
+
+ :param maxTermsPerTopic: Maximum number of terms to collect for
each topic.
+ (default: vocabulary size)
+ :return: Array over topics. Each topic is represented as a pair of
matching arrays:
+ (term indices, term weights in topic).
+ Each topic's terms are sorted in order of decreasing weight.
+ """
+ return self._call_java("describeTopics", maxTermsPerTopic)
+
+
+class DistributedLDAModel(LDAModel):
+    """
+    Distributed model fitted by LDA.
+
+    .. versionadded:: 2.0.0
+    """
+    def toLocal(self):
+        """Convert this distributed model to a local representation."""
+        return LocalLDAModel(self._call_java("toLocal"))
+
+
+class LocalLDAModel(LDAModel):
+    """
+    Local (non-distributed) model fitted by LDA.
+
+    .. versionadded:: 2.0.0
+    """
+    pass
+
+
+class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInterval):
+    """ Latent Dirichlet Allocation (LDA), a topic model designed for text documents.
+
+    Terminology:
+    - "word" = "term": an element of the vocabulary
+    - "token": instance of a term appearing in a document
+    - "topic": multinomial distribution over words representing some concept
+    References:
+    - Original LDA paper (journal version):
+      Blei, Ng, and Jordan. "Latent Dirichlet Allocation." JMLR, 2003.
+
+    >>> from pyspark.mllib.linalg import Vectors, SparseVector
+    >>> from pyspark.ml.clustering import LDA
+    >>> df = sqlContext.createDataFrame([[1, Vectors.dense([0.0, 1.0])], \
+        [2, SparseVector(2, {0: 1.0})],], ["id", "features"])
+    >>> lda = LDA(k=2, seed=1, optimizer="em")
+    >>> model = lda.fit(df)
+    >>> model.isDistributed()
+    True
+    >>> localModel = model.toLocal()
+    >>> localModel.isDistributed()
+    False
+    >>> model.vocabSize()
+    2
+    >>> model.describeTopics().show()
+    +-----+-----------+--------------------+
+    |topic|termIndices|         termWeights|
+    +-----+-----------+--------------------+
+    |    0|     [1, 0]|[0.50401530077160...|
+    |    1|     [0, 1]|[0.50401530077160...|
+    +-----+-----------+--------------------+
+    ...
+    >>> model.topicsMatrix()
+    DenseMatrix(2, 2, [0.496, 0.504, 0.504, 0.496], 0)
+
+    .. versionadded:: 2.0.0
+    """
+
+    # a placeholder to make it appear in the generated doc
+    k = Param(Params._dummy(), "k", "number of clusters to create")
+    optimizer = Param(Params._dummy(), "optimizer",
+                      "LDA optimizer. Currently 'em' and 'online' are supported.")
+    learningOffset = Param(Params._dummy(), "learningOffset",
+                           "learning parameter that downweights early iterations")
+    learningDecay = Param(Params._dummy(), "learningDecay", "Learning rate")
+    subsamplingRate = Param(Params._dummy(), "subsamplingRate",
+                            "Fraction of the corpus to be sampled")
+    optimizeDocConcentration = Param(Params._dummy(), "optimizeDocConcentration",
+                                     "Indicates whether the docConcentration "
+                                     "will be optimized during training")
+
+    @keyword_only
+    def __init__(self, featuresCol="features", k=10,
+                 optimizer="online", learningOffset=1024.0, learningDecay=0.51,
+                 subsamplingRate=0.05, optimizeDocConcentration=True,
+                 checkpointInterval=10, maxIter=20, seed=None):
+        """
+        __init__(self, featuresCol="features", k=10, \
+                 optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
+                 subsamplingRate=0.05, optimizeDocConcentration=True, \
+                 checkpointInterval=10, maxIter=20, seed=None)
+
+        :param featuresCol: feature column name in the dataframe
+        :param k: number of topics (clusters) to infer
+        :param optimizer: LDAOptimizer used to perform the actual calculation.
+            Currently "em" and "online" are supported. Default: "online".
+        :param learningOffset: A (positive) learning parameter that downweights early
+            iterations. Larger values make early iterations count less. This is called
+            "tau0" in the Online LDA paper (Hoffman et al., 2010). Default: 1024.0,
+            following Hoffman et al.
+        :param learningDecay: Learning rate, set as an exponential decay rate. This
+            should be between (0.5, 1.0] to guarantee asymptotic convergence. This is
+            called "kappa" in the Online LDA paper (Hoffman et al., 2010). Default: 0.51,
+            based on Hoffman et al.
+        :param subsamplingRate: Fraction of the corpus to be sampled and used in each
+            iteration of mini-batch gradient descent, in range (0, 1]. Note that this
+            should be adjusted in synch with maxIter so the entire corpus is used.
+            Specifically, set both so that maxIter * subsamplingRate >= 1. Default: 0.05
+        :param optimizeDocConcentration: Indicates whether the docConcentration
+            (Dirichlet parameter for the document-topic distribution) will be optimized
+            during training. Setting this to True will make the model more expressive
+            and fit the training data better. Default: True
+        :param checkpointInterval: checkpoint interval (>= 1), or -1 to disable
+            checkpointing. E.g. 10 means the cache will get checkpointed every
+            10 iterations.
+        :param maxIter: maximum number of iterations. Default: 20
+        :param seed: random seed
+        """
+        super(LDA, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.clustering.LDA", self.uid)
+        self.k = Param(self, "k", "number of clusters to create")
+        self.optimizer = Param(self, "optimizer",
+                               "LDA optimizer. Currently 'em' and 'online' are supported.")
+        self.learningOffset = Param(self, "learningOffset",
+                                    "learning parameter that downweights early iterations")
+        self.learningDecay = Param(self, "learningDecay", "Learning rate")
+        self.subsamplingRate = Param(self, "subsamplingRate",
+                                     "Fraction of the corpus to be sampled")
+        self.optimizeDocConcentration = Param(self, "optimizeDocConcentration",
+                                              "Indicates whether the docConcentration "
+                                              "will be optimized during training")
+        self._setDefault(k=10, optimizer="online", learningOffset=1024.0, learningDecay=0.51,
+                         subsamplingRate=0.05, optimizeDocConcentration=True,
+                         checkpointInterval=10, maxIter=20)
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    def _create_model(self, java_model):
+        if self.getOptimizer() == "em":
+            return DistributedLDAModel(java_model)
+        else:
+            return LocalLDAModel(java_model)
+
+    @keyword_only
+    @since("2.0.0")
+    def setParams(self, featuresCol="features", k=10,
+                  optimizer="online", learningOffset=1024.0, learningDecay=0.51,
+                  subsamplingRate=0.05, optimizeDocConcentration=True,
+                  checkpointInterval=10, maxIter=20, seed=None):
+        """
+        setParams(self, featuresCol="features", k=10, \
+                  optimizer="online", learningOffset=1024.0, learningDecay=0.51, \
+                  subsamplingRate=0.05, optimizeDocConcentration=True, \
+                  checkpointInterval=10, maxIter=20, seed=None)
+
+        Sets params for LDA.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+ @since("2.0.0")
+ def setK(self, value):
+ """
+ Sets the value of :py:attr:`k`.
+
+ >>> algo = LDA().setK(10)
+ >>> algo.getK()
+ 10
--- End diff --
Minor: It's better to add these tests to the unit test files; you can refer
to #10975.
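
For illustration, a rough sketch of what such a unit test could look like in
python/pyspark/ml/tests.py (the class name LDAParamsTests and the use of
ReusedPySparkTestCase are assumptions for this sketch, not part of the PR):

    from pyspark.ml.clustering import LDA
    from pyspark.tests import ReusedPySparkTestCase


    class LDAParamsTests(ReusedPySparkTestCase):

        def test_set_get_k(self):
            # Mirrors the setK doctest: the setter is chainable and the
            # getter returns the value that was set.
            lda = LDA().setK(10)
            self.assertEqual(lda.getK(), 10)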