Github user MLnick commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12927#discussion_r62799764
  
    --- Diff: examples/src/main/java/org/apache/spark/examples/ml/JavaLDAExample.java ---
    @@ -17,80 +17,53 @@
     
     package org.apache.spark.examples.ml;
     // $example on$
    -import java.util.regex.Pattern;
    -
    -import org.apache.spark.api.java.JavaRDD;
    -import org.apache.spark.api.java.function.Function;
     import org.apache.spark.ml.clustering.LDA;
     import org.apache.spark.ml.clustering.LDAModel;
    -import org.apache.spark.mllib.linalg.Vector;
    -import org.apache.spark.mllib.linalg.VectorUDT;
    -import org.apache.spark.mllib.linalg.Vectors;
     import org.apache.spark.sql.Dataset;
     import org.apache.spark.sql.Row;
     import org.apache.spark.sql.SparkSession;
    -import org.apache.spark.sql.catalyst.expressions.GenericRow;
    -import org.apache.spark.sql.types.Metadata;
    -import org.apache.spark.sql.types.StructField;
    -import org.apache.spark.sql.types.StructType;
     // $example off$
     
     /**
    - * An example demonstrating LDA
    + * An example demonstrating LDA.
      * Run with
      * <pre>
      * bin/run-example ml.JavaLDAExample
      * </pre>
      */
     public class JavaLDAExample {
     
    -  // $example on$
    -  private static class ParseVector implements Function<String, Row> {
    -    private static final Pattern separator = Pattern.compile(" ");
    -
    -    @Override
    -    public Row call(String line) {
    -      String[] tok = separator.split(line);
    -      double[] point = new double[tok.length];
    -      for (int i = 0; i < tok.length; ++i) {
    -        point[i] = Double.parseDouble(tok[i]);
    -      }
    -      Vector[] points = {Vectors.dense(point)};
    -      return new GenericRow(points);
    -    }
    -  }
    -
       public static void main(String[] args) {
    -
    -    String inputFile = "data/mllib/sample_lda_data.txt";
    -
    -    // Parses the arguments
    +    // Creates a SparkSession
         SparkSession spark = SparkSession
           .builder()
           .appName("JavaLDAExample")
           .getOrCreate();
     
    -    // Loads data
    -    JavaRDD<Row> points = spark.read().text(inputFile).javaRDD().map(new ParseVector());
    -    StructField[] fields = {new StructField("features", new VectorUDT(), false, Metadata.empty())};
    -    StructType schema = new StructType(fields);
    -    Dataset<Row> dataset = spark.createDataFrame(points, schema);
    +    // $example on$
    +    // Loads data.
    +    Dataset<Row> dataset = spark.read().format("libsvm")
    +      .load("data/mllib/sample_lda_libsvm_data.txt");
     
    -    // Trains a LDA model
    -    LDA lda = new LDA()
    -      .setK(10)
    -      .setMaxIter(10);
    +    // Trains a LDA model.
    +    LDA lda = new LDA().setK(10).setMaxIter(10);
         LDAModel model = lda.fit(dataset);
     
    -    System.out.println(model.logLikelihood(dataset));
    -    System.out.println(model.logPerplexity(dataset));
    -
    -    // Shows the result
    +    double ll = model.logLikelihood(dataset);
    +    double lp = model.logPerplexity(dataset);
    +    System.out.println("The lower bound on the log likelihood of the 
entire corpus: " + ll);
    +    System.out.println("The upper bound bound on perplexity: " + lp);
    +    
    +    // describeTopics.
    --- End diff ---
    
    Describe topics
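
    For reference, a minimal sketch of how that block might continue once the comment is reworded, assuming the example goes on to call LDAModel.describeTopics and transform (the variable names topics and transformed, the top-terms count 3, and the printed message are illustrative, not taken from this diff):

        // Describe topics.
        Dataset<Row> topics = model.describeTopics(3);
        System.out.println("The topics described by their top-weighted terms:");
        topics.show(false);

        // Shows the result.
        Dataset<Row> transformed = model.transform(dataset);
        transformed.show(false);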

