http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/ml-features.html ---------------------------------------------------------------------- diff --git a/site/docs/2.1.0/ml-features.html b/site/docs/2.1.0/ml-features.html index 64463de..a2f102b 100644 --- a/site/docs/2.1.0/ml-features.html +++ b/site/docs/2.1.0/ml-features.html @@ -318,52 +318,52 @@ <p><strong>Table of Contents</strong></p> <ul id="markdown-toc"> - <li><a href="#feature-extractors" id="markdown-toc-feature-extractors">Feature Extractors</a> <ul> - <li><a href="#tf-idf" id="markdown-toc-tf-idf">TF-IDF</a></li> - <li><a href="#word2vec" id="markdown-toc-word2vec">Word2Vec</a></li> - <li><a href="#countvectorizer" id="markdown-toc-countvectorizer">CountVectorizer</a></li> + <li><a href="#feature-extractors">Feature Extractors</a> <ul> + <li><a href="#tf-idf">TF-IDF</a></li> + <li><a href="#word2vec">Word2Vec</a></li> + <li><a href="#countvectorizer">CountVectorizer</a></li> </ul> </li> - <li><a href="#feature-transformers" id="markdown-toc-feature-transformers">Feature Transformers</a> <ul> - <li><a href="#tokenizer" id="markdown-toc-tokenizer">Tokenizer</a></li> - <li><a href="#stopwordsremover" id="markdown-toc-stopwordsremover">StopWordsRemover</a></li> - <li><a href="#n-gram" id="markdown-toc-n-gram">$n$-gram</a></li> - <li><a href="#binarizer" id="markdown-toc-binarizer">Binarizer</a></li> - <li><a href="#pca" id="markdown-toc-pca">PCA</a></li> - <li><a href="#polynomialexpansion" id="markdown-toc-polynomialexpansion">PolynomialExpansion</a></li> - <li><a href="#discrete-cosine-transform-dct" id="markdown-toc-discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</a></li> - <li><a href="#stringindexer" id="markdown-toc-stringindexer">StringIndexer</a></li> - <li><a href="#indextostring" id="markdown-toc-indextostring">IndexToString</a></li> - <li><a href="#onehotencoder" id="markdown-toc-onehotencoder">OneHotEncoder</a></li> - <li><a href="#vectorindexer" id="markdown-toc-vectorindexer">VectorIndexer</a></li> - <li><a href="#interaction" id="markdown-toc-interaction">Interaction</a></li> - <li><a href="#normalizer" id="markdown-toc-normalizer">Normalizer</a></li> - <li><a href="#standardscaler" id="markdown-toc-standardscaler">StandardScaler</a></li> - <li><a href="#minmaxscaler" id="markdown-toc-minmaxscaler">MinMaxScaler</a></li> - <li><a href="#maxabsscaler" id="markdown-toc-maxabsscaler">MaxAbsScaler</a></li> - <li><a href="#bucketizer" id="markdown-toc-bucketizer">Bucketizer</a></li> - <li><a href="#elementwiseproduct" id="markdown-toc-elementwiseproduct">ElementwiseProduct</a></li> - <li><a href="#sqltransformer" id="markdown-toc-sqltransformer">SQLTransformer</a></li> - <li><a href="#vectorassembler" id="markdown-toc-vectorassembler">VectorAssembler</a></li> - <li><a href="#quantilediscretizer" id="markdown-toc-quantilediscretizer">QuantileDiscretizer</a></li> + <li><a href="#feature-transformers">Feature Transformers</a> <ul> + <li><a href="#tokenizer">Tokenizer</a></li> + <li><a href="#stopwordsremover">StopWordsRemover</a></li> + <li><a href="#n-gram">$n$-gram</a></li> + <li><a href="#binarizer">Binarizer</a></li> + <li><a href="#pca">PCA</a></li> + <li><a href="#polynomialexpansion">PolynomialExpansion</a></li> + <li><a href="#discrete-cosine-transform-dct">Discrete Cosine Transform (DCT)</a></li> + <li><a href="#stringindexer">StringIndexer</a></li> + <li><a href="#indextostring">IndexToString</a></li> + <li><a href="#onehotencoder">OneHotEncoder</a></li> + <li><a href="#vectorindexer">VectorIndexer</a></li> + <li><a href="#interaction">Interaction</a></li> + <li><a href="#normalizer">Normalizer</a></li> + <li><a href="#standardscaler">StandardScaler</a></li> + <li><a href="#minmaxscaler">MinMaxScaler</a></li> + <li><a href="#maxabsscaler">MaxAbsScaler</a></li> + <li><a href="#bucketizer">Bucketizer</a></li> + <li><a href="#elementwiseproduct">ElementwiseProduct</a></li> + <li><a href="#sqltransformer">SQLTransformer</a></li> + <li><a href="#vectorassembler">VectorAssembler</a></li> + <li><a href="#quantilediscretizer">QuantileDiscretizer</a></li> </ul> </li> - <li><a href="#feature-selectors" id="markdown-toc-feature-selectors">Feature Selectors</a> <ul> - <li><a href="#vectorslicer" id="markdown-toc-vectorslicer">VectorSlicer</a></li> - <li><a href="#rformula" id="markdown-toc-rformula">RFormula</a></li> - <li><a href="#chisqselector" id="markdown-toc-chisqselector">ChiSqSelector</a></li> + <li><a href="#feature-selectors">Feature Selectors</a> <ul> + <li><a href="#vectorslicer">VectorSlicer</a></li> + <li><a href="#rformula">RFormula</a></li> + <li><a href="#chisqselector">ChiSqSelector</a></li> </ul> </li> - <li><a href="#locality-sensitive-hashing" id="markdown-toc-locality-sensitive-hashing">Locality Sensitive Hashing</a> <ul> - <li><a href="#lsh-operations" id="markdown-toc-lsh-operations">LSH Operations</a> <ul> - <li><a href="#feature-transformation" id="markdown-toc-feature-transformation">Feature Transformation</a></li> - <li><a href="#approximate-similarity-join" id="markdown-toc-approximate-similarity-join">Approximate Similarity Join</a></li> - <li><a href="#approximate-nearest-neighbor-search" id="markdown-toc-approximate-nearest-neighbor-search">Approximate Nearest Neighbor Search</a></li> + <li><a href="#locality-sensitive-hashing">Locality Sensitive Hashing</a> <ul> + <li><a href="#lsh-operations">LSH Operations</a> <ul> + <li><a href="#feature-transformation">Feature Transformation</a></li> + <li><a href="#approximate-similarity-join">Approximate Similarity Join</a></li> + <li><a href="#approximate-nearest-neighbor-search">Approximate Nearest Neighbor Search</a></li> </ul> </li> - <li><a href="#lsh-algorithms" id="markdown-toc-lsh-algorithms">LSH Algorithms</a> <ul> - <li><a href="#bucketed-random-projection-for-euclidean-distance" id="markdown-toc-bucketed-random-projection-for-euclidean-distance">Bucketed Random Projection for Euclidean Distance</a></li> - <li><a href="#minhash-for-jaccard-distance" id="markdown-toc-minhash-for-jaccard-distance">MinHash for Jaccard Distance</a></li> + <li><a href="#lsh-algorithms">LSH Algorithms</a> <ul> + <li><a href="#bucketed-random-projection-for-euclidean-distance">Bucketed Random Projection for Euclidean Distance</a></li> + <li><a href="#minhash-for-jaccard-distance">MinHash for Jaccard Distance</a></li> </ul> </li> </ul> @@ -395,7 +395,7 @@ TFIDF(t, d, D) = TF(t, d) \cdot IDF(t, D). There are several variants on the definition of term frequency and document frequency. In MLlib, we separate TF and IDF to make them flexible.</p> -<p><strong>TF</strong>: Both <code>HashingTF</code> and <code>CountVectorizer</code> can be used to generate the term frequency vectors.</p> +<p><strong>TF</strong>: Both <code>HashingTF</code> and <code>CountVectorizer</code> can be used to generate the term frequency vectors. </p> <p><code>HashingTF</code> is a <code>Transformer</code> which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a “set of terms” might be a bag of words. @@ -437,7 +437,7 @@ when using text as features. Our feature vectors could then be passed to a lear <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.HashingTF">HashingTF Scala docs</a> and the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">HashingTF</span><span class="o">,</span> <span class="nc">IDF</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> <span class="k">val</span> <span class="n">sentenceData</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> <span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="s">"Hi I heard about Spark"</span><span class="o">),</span> @@ -468,7 +468,7 @@ the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala doc <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/HashingTF.html">HashingTF Java docs</a> and the <a href="api/java/org/apache/spark/ml/feature/IDF.html">IDF Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.HashingTF</span><span class="o">;</span> @@ -489,17 +489,17 @@ the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala doc <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">0.0</span><span class="o">,</span> <span class="s">"I wish Java could use case classes"</span><span class="o">),</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mf">1.0</span><span class="o">,</span> <span class="s">"Logistic regression models are neat"</span><span class="o">)</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"label"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> -<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> +<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">sentenceData</span><span class="o">);</span> <span class="kt">int</span> <span class="n">numFeatures</span> <span class="o">=</span> <span class="mi">20</span><span class="o">;</span> -<span class="n">HashingTF</span> <span class="n">hashingTF</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">HashingTF</span><span class="o">()</span> +<span class="n">HashingTF</span> <span class="n">hashingTF</span> <span class="o">=</span> <span class="k">new</span> <span class="n">HashingTF</span><span class="o">()</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">)</span> <span class="o">.</span><span class="na">setNumFeatures</span><span class="o">(</span><span class="n">numFeatures</span><span class="o">);</span> @@ -507,7 +507,7 @@ the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala doc <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordsData</span><span class="o">);</span> <span class="c1">// alternatively, CountVectorizer can also be used to get term frequency vectors</span> -<span class="n">IDF</span> <span class="n">idf</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">IDF</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">);</span> +<span class="n">IDF</span> <span class="n">idf</span> <span class="o">=</span> <span class="k">new</span> <span class="n">IDF</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"rawFeatures"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"features"</span><span class="o">);</span> <span class="n">IDFModel</span> <span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">featurizedData</span><span class="o">);</span> @@ -521,26 +521,26 @@ the <a href="api/scala/index.html#org.apache.spark.ml.feature.IDF">IDF Scala doc <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF">HashingTF Python docs</a> and the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.IDF">IDF Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span><span class="p">,</span> <span class="n">Tokenizer</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">HashingTF</span><span class="p">,</span> <span class="n">IDF</span><span class="p">,</span> <span class="n">Tokenizer</span> <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="s">"Hi I heard about Spark"</span><span class="p">),</span> - <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="s">"I wish Java could use case classes"</span><span class="p">),</span> - <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="s">"Logistic regression models are neat"</span><span class="p">)</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"sentence"</span><span class="p">])</span> + <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="s2">"Hi I heard about Spark"</span><span class="p">),</span> + <span class="p">(</span><span class="mf">0.0</span><span class="p">,</span> <span class="s2">"I wish Java could use case classes"</span><span class="p">),</span> + <span class="p">(</span><span class="mf">1.0</span><span class="p">,</span> <span class="s2">"Logistic regression models are neat"</span><span class="p">)</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"label"</span><span class="p">,</span> <span class="s2">"sentence"</span><span class="p">])</span> -<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span> +<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">)</span> <span class="n">wordsData</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceData</span><span class="p">)</span> -<span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"rawFeatures"</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span> +<span class="n">hashingTF</span> <span class="o">=</span> <span class="n">HashingTF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"rawFeatures"</span><span class="p">,</span> <span class="n">numFeatures</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span> <span class="n">featurizedData</span> <span class="o">=</span> <span class="n">hashingTF</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordsData</span><span class="p">)</span> -<span class="c"># alternatively, CountVectorizer can also be used to get term frequency vectors</span> +<span class="c1"># alternatively, CountVectorizer can also be used to get term frequency vectors</span> -<span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"rawFeatures"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">)</span> +<span class="n">idf</span> <span class="o">=</span> <span class="n">IDF</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"rawFeatures"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"features"</span><span class="p">)</span> <span class="n">idfModel</span> <span class="o">=</span> <span class="n">idf</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span> <span class="n">rescaledData</span> <span class="o">=</span> <span class="n">idfModel</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">featurizedData</span><span class="p">)</span> -<span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"label"</span><span class="p">,</span> <span class="s">"features"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> +<span class="n">rescaledData</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"label"</span><span class="p">,</span> <span class="s2">"features"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">()</span> </pre></div> <div><small>Find full example code at "examples/src/main/python/ml/tf_idf_example.py" in the Spark repo.</small></div> </div> @@ -563,7 +563,7 @@ details.</p> <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Word2Vec">Word2Vec Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span> <span class="k">import</span> <span class="nn">org.apache.spark.ml.linalg.Vector</span> <span class="k">import</span> <span class="nn">org.apache.spark.sql.Row</span> @@ -584,7 +584,7 @@ for more details on the API.</p> <span class="k">val</span> <span class="n">result</span> <span class="k">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">documentDF</span><span class="o">)</span> <span class="n">result</span><span class="o">.</span><span class="n">collect</span><span class="o">().</span><span class="n">foreach</span> <span class="o">{</span> <span class="k">case</span> <span class="nc">Row</span><span class="o">(</span><span class="n">text</span><span class="k">:</span> <span class="kt">Seq</span><span class="o">[</span><span class="k">_</span><span class="o">],</span> <span class="n">features</span><span class="k">:</span> <span class="kt">Vector</span><span class="o">)</span> <span class="k">=></span> - <span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">"Text: [${text.mkString("</span><span class="o">,</span> <span class="s">")}] => \nVector: $features\n"</span><span class="o">)</span> <span class="o">}</span> + <span class="n">println</span><span class="o">(</span><span class="s">s"Text: [</span><span class="si">${</span><span class="n">text</span><span class="o">.</span><span class="n">mkString</span><span class="o">(</span><span class="s">", "</span><span class="o">)</span><span class="si">}</span><span class="s">] => \nVector: </span><span class="si">$features</span><span class="s">\n"</span><span class="o">)</span> <span class="o">}</span> </pre></div> <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/Word2VecExample.scala" in the Spark repo.</small></div> </div> @@ -594,7 +594,7 @@ for more details on the API.</p> <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Word2Vec.html">Word2Vec Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Word2Vec</span><span class="o">;</span> @@ -612,13 +612,13 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"I wish Java could use case classes"</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">))),</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Logistic regression models are neat"</span><span class="o">.</span><span class="na">split</span><span class="o">(</span><span class="s">" "</span><span class="o">)))</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="n">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">documentDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> <span class="c1">// Learn a mapping from words to Vectors.</span> -<span class="n">Word2Vec</span> <span class="n">word2Vec</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Word2Vec</span><span class="o">()</span> +<span class="n">Word2Vec</span> <span class="n">word2Vec</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Word2Vec</span><span class="o">()</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"result"</span><span class="o">)</span> <span class="o">.</span><span class="na">setVectorSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> @@ -641,23 +641,23 @@ for more details on the API.</p> <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec">Word2Vec Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Word2Vec</span> -<span class="c"># Input data: Each row is a bag of words from a sentence or document.</span> +<span class="c1"># Input data: Each row is a bag of words from a sentence or document.</span> <span class="n">documentDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="s">"Hi I heard about Spark"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">),</span> - <span class="p">(</span><span class="s">"I wish Java could use case classes"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">),</span> - <span class="p">(</span><span class="s">"Logistic regression models are neat"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">),</span> <span class="p">)</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"text"</span><span class="p">])</span> + <span class="p">(</span><span class="s2">"Hi I heard about Spark"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">" "</span><span class="p">),</span> <span class="p">),</span> + <span class="p">(</span><span class="s2">"I wish Java could use case classes"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">" "</span><span class="p">),</span> <span class="p">),</span> + <span class="p">(</span><span class="s2">"Logistic regression models are neat"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">" "</span><span class="p">),</span> <span class="p">)</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"text"</span><span class="p">])</span> -<span class="c"># Learn a mapping from words to Vectors.</span> -<span class="n">word2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"text"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"result"</span><span class="p">)</span> +<span class="c1"># Learn a mapping from words to Vectors.</span> +<span class="n">word2Vec</span> <span class="o">=</span> <span class="n">Word2Vec</span><span class="p">(</span><span class="n">vectorSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minCount</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">"text"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"result"</span><span class="p">)</span> <span class="n">model</span> <span class="o">=</span> <span class="n">word2Vec</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span> <span class="n">result</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">documentDF</span><span class="p">)</span> <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">result</span><span class="o">.</span><span class="n">collect</span><span class="p">():</span> <span class="n">text</span><span class="p">,</span> <span class="n">vector</span> <span class="o">=</span> <span class="n">row</span> - <span class="k">print</span><span class="p">(</span><span class="s">"Text: [</span><span class="si">%s</span><span class="s">] => </span><span class="se">\n</span><span class="s">Vector: </span><span class="si">%s</span><span class="se">\n</span><span class="s">"</span> <span class="o">%</span> <span class="p">(</span><span class="s">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">text</span><span class="p">),</span> <span class="nb">str</span><span class="p">(</span><span class="n">vector</span><span class="p">)))</span> + <span class="k">print</span><span class="p">(</span><span class="s2">"Text: [</span><span class="si">%s</span><span class="s2">] => </span><span class="se">\n</span><span class="s2">Vector: </span><span class="si">%s</span><span class="se">\n</span><span class="s2">"</span> <span class="o">%</span> <span class="p">(</span><span class="s2">", "</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">text</span><span class="p">),</span> <span class="nb">str</span><span class="p">(</span><span class="n">vector</span><span class="p">)))</span> </pre></div> <div><small>Find full example code at "examples/src/main/python/ml/word2vec_example.py" in the Spark repo.</small></div> </div> @@ -707,7 +707,7 @@ Then the output column “vector” after transformation contains:</p> and the <a href="api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel">CountVectorizerModel Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">CountVectorizer</span><span class="o">,</span> <span class="nc">CountVectorizerModel</span><span class="o">}</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">CountVectorizer</span><span class="o">,</span> <span class="nc">CountVectorizerModel</span><span class="o">}</span> <span class="k">val</span> <span class="n">df</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)),</span> @@ -738,7 +738,7 @@ for more details on the API.</p> and the <a href="api/java/org/apache/spark/ml/feature/CountVectorizerModel.html">CountVectorizerModel Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.CountVectorizer</span><span class="o">;</span> @@ -754,13 +754,13 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">)),</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">))</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span> <span class="o">[]</span> <span class="o">{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="nf">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span> <span class="o">[]</span> <span class="o">{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"text"</span><span class="o">,</span> <span class="k">new</span> <span class="n">ArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">true</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> <span class="c1">// fit a CountVectorizerModel from the corpus</span> -<span class="n">CountVectorizerModel</span> <span class="n">cvModel</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizer</span><span class="o">()</span> +<span class="n">CountVectorizerModel</span> <span class="n">cvModel</span> <span class="o">=</span> <span class="k">new</span> <span class="n">CountVectorizer</span><span class="o">()</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">)</span> <span class="o">.</span><span class="na">setVocabSize</span><span class="o">(</span><span class="mi">3</span><span class="o">)</span> @@ -768,7 +768,7 @@ for more details on the API.</p> <span class="o">.</span><span class="na">fit</span><span class="o">(</span><span class="n">df</span><span class="o">);</span> <span class="c1">// alternatively, define CountVectorizerModel with a-priori vocabulary</span> -<span class="n">CountVectorizerModel</span> <span class="n">cvm</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">CountVectorizerModel</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">})</span> +<span class="n">CountVectorizerModel</span> <span class="n">cvm</span> <span class="o">=</span> <span class="k">new</span> <span class="n">CountVectorizerModel</span><span class="o">(</span><span class="k">new</span> <span class="n">String</span><span class="o">[]{</span><span class="s">"a"</span><span class="o">,</span> <span class="s">"b"</span><span class="o">,</span> <span class="s">"c"</span><span class="o">})</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"text"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"feature"</span><span class="o">);</span> @@ -783,16 +783,16 @@ for more details on the API.</p> and the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.CountVectorizerModel">CountVectorizerModel Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">CountVectorizer</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">CountVectorizer</span> -<span class="c"># Input data: Each row is a bag of words with a ID.</span> +<span class="c1"># Input data: Each row is a bag of words with a ID.</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"a b c"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">)),</span> - <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"a b b c a"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s">" "</span><span class="p">))</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">])</span> + <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s2">"a b c"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">" "</span><span class="p">)),</span> + <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s2">"a b b c a"</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">" "</span><span class="p">))</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"words"</span><span class="p">])</span> -<span class="c"># fit a CountVectorizerModel from the corpus.</span> -<span class="n">cv</span> <span class="o">=</span> <span class="n">CountVectorizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"features"</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span> +<span class="c1"># fit a CountVectorizerModel from the corpus.</span> +<span class="n">cv</span> <span class="o">=</span> <span class="n">CountVectorizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"features"</span><span class="p">,</span> <span class="n">vocabSize</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">minDF</span><span class="o">=</span><span class="mf">2.0</span><span class="p">)</span> <span class="n">model</span> <span class="o">=</span> <span class="n">cv</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="n">df</span><span class="p">)</span> @@ -822,7 +822,7 @@ for more details on the API.</p> and the <a href="api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer">RegexTokenizer Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">RegexTokenizer</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.</span><span class="o">{</span><span class="nc">RegexTokenizer</span><span class="o">,</span> <span class="nc">Tokenizer</span><span class="o">}</span> <span class="k">import</span> <span class="nn">org.apache.spark.sql.functions._</span> <span class="k">val</span> <span class="n">sentenceDataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> @@ -856,7 +856,7 @@ for more details on the API.</p> and the <a href="api/java/org/apache/spark/ml/feature/RegexTokenizer.html">RegexTokenizer Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">scala.collection.mutable.WrappedArray</span><span class="o">;</span> @@ -878,16 +878,16 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="s">"Logistic,regression,models,are,neat"</span><span class="o">)</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> -<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> +<span class="n">Tokenizer</span> <span class="n">tokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="n">Tokenizer</span><span class="o">().</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">);</span> -<span class="n">RegexTokenizer</span> <span class="n">regexTokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">RegexTokenizer</span><span class="o">()</span> +<span class="n">RegexTokenizer</span> <span class="n">regexTokenizer</span> <span class="o">=</span> <span class="k">new</span> <span class="n">RegexTokenizer</span><span class="o">()</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"sentence"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">)</span> <span class="o">.</span><span class="na">setPattern</span><span class="o">(</span><span class="s">"\\W"</span><span class="o">);</span> <span class="c1">// alternatively .setPattern("\\w+").setGaps(false);</span> @@ -916,30 +916,30 @@ for more details on the API.</p> the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.RegexTokenizer">RegexTokenizer Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Tokenizer</span><span class="p">,</span> <span class="n">RegexTokenizer</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">Tokenizer</span><span class="p">,</span> <span class="n">RegexTokenizer</span> <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">col</span><span class="p">,</span> <span class="n">udf</span> <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">IntegerType</span> <span class="n">sentenceDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s">"Hi I heard about Spark"</span><span class="p">),</span> - <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s">"I wish Java could use case classes"</span><span class="p">),</span> - <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s">"Logistic,regression,models,are,neat"</span><span class="p">)</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"sentence"</span><span class="p">])</span> + <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="s2">"Hi I heard about Spark"</span><span class="p">),</span> + <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s2">"I wish Java could use case classes"</span><span class="p">),</span> + <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="s2">"Logistic,regression,models,are,neat"</span><span class="p">)</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"sentence"</span><span class="p">])</span> -<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">)</span> +<span class="n">tokenizer</span> <span class="o">=</span> <span class="n">Tokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">)</span> -<span class="n">regexTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s">"</span><span class="se">\\</span><span class="s">W"</span><span class="p">)</span> -<span class="c"># alternatively, pattern="\\w+", gaps(False)</span> +<span class="n">regexTokenizer</span> <span class="o">=</span> <span class="n">RegexTokenizer</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"sentence"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">,</span> <span class="n">pattern</span><span class="o">=</span><span class="s2">"</span><span class="se">\\</span><span class="s2">W"</span><span class="p">)</span> +<span class="c1"># alternatively, pattern="\\w+", gaps(False)</span> <span class="n">countTokens</span> <span class="o">=</span> <span class="n">udf</span><span class="p">(</span><span class="k">lambda</span> <span class="n">words</span><span class="p">:</span> <span class="nb">len</span><span class="p">(</span><span class="n">words</span><span class="p">),</span> <span class="n">IntegerType</span><span class="p">())</span> <span class="n">tokenized</span> <span class="o">=</span> <span class="n">tokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceDataFrame</span><span class="p">)</span> -<span class="n">tokenized</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"sentence"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">)</span>\ - <span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s">"tokens"</span><span class="p">,</span> <span class="n">countTokens</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">"words"</span><span class="p">)))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> +<span class="n">tokenized</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"sentence"</span><span class="p">,</span> <span class="s2">"words"</span><span class="p">)</span>\ + <span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"tokens"</span><span class="p">,</span> <span class="n">countTokens</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">"words"</span><span class="p">)))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> <span class="n">regexTokenized</span> <span class="o">=</span> <span class="n">regexTokenizer</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceDataFrame</span><span class="p">)</span> -<span class="n">regexTokenized</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"sentence"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">)</span> \ - <span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s">"tokens"</span><span class="p">,</span> <span class="n">countTokens</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s">"words"</span><span class="p">)))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> +<span class="n">regexTokenized</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"sentence"</span><span class="p">,</span> <span class="s2">"words"</span><span class="p">)</span> \ + <span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">"tokens"</span><span class="p">,</span> <span class="n">countTokens</span><span class="p">(</span><span class="n">col</span><span class="p">(</span><span class="s2">"words"</span><span class="p">)))</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> </pre></div> <div><small>Find full example code at "examples/src/main/python/ml/tokenizer_example.py" in the Spark repo.</small></div> </div> @@ -989,7 +989,7 @@ filtered out.</p> <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.StopWordsRemover">StopWordsRemover Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span> <span class="k">val</span> <span class="n">remover</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StopWordsRemover</span><span class="o">()</span> <span class="o">.</span><span class="n">setInputCol</span><span class="o">(</span><span class="s">"raw"</span><span class="o">)</span> @@ -1010,7 +1010,7 @@ for more details on the API.</p> <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/StopWordsRemover.html">StopWordsRemover Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.StopWordsRemover</span><span class="o">;</span> @@ -1022,7 +1022,7 @@ for more details on the API.</p> <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructField</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.sql.types.StructType</span><span class="o">;</span> -<span class="n">StopWordsRemover</span> <span class="n">remover</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StopWordsRemover</span><span class="o">()</span> +<span class="n">StopWordsRemover</span> <span class="n">remover</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StopWordsRemover</span><span class="o">()</span> <span class="o">.</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"raw"</span><span class="o">)</span> <span class="o">.</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"filtered"</span><span class="o">);</span> @@ -1031,8 +1031,8 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Mary"</span><span class="o">,</span> <span class="s">"had"</span><span class="o">,</span> <span class="s">"a"</span><span class="o">,</span> <span class="s">"little"</span><span class="o">,</span> <span class="s">"lamb"</span><span class="o">))</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span> <span class="s">"raw"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> @@ -1047,14 +1047,14 @@ for more details on the API.</p> <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.StopWordsRemover">StopWordsRemover Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StopWordsRemover</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">StopWordsRemover</span> <span class="n">sentenceData</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s">"I"</span><span class="p">,</span> <span class="s">"saw"</span><span class="p">,</span> <span class="s">"the"</span><span class="p">,</span> <span class="s">"red"</span><span class="p">,</span> <span class="s">"balloon"</span><span class="p">]),</span> - <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s">"Mary"</span><span class="p">,</span> <span class="s">"had"</span><span class="p">,</span> <span class="s">"a"</span><span class="p">,</span> <span class="s">"little"</span><span class="p">,</span> <span class="s">"lamb"</span><span class="p">])</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"raw"</span><span class="p">])</span> + <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s2">"I"</span><span class="p">,</span> <span class="s2">"saw"</span><span class="p">,</span> <span class="s2">"the"</span><span class="p">,</span> <span class="s2">"red"</span><span class="p">,</span> <span class="s2">"balloon"</span><span class="p">]),</span> + <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s2">"Mary"</span><span class="p">,</span> <span class="s2">"had"</span><span class="p">,</span> <span class="s2">"a"</span><span class="p">,</span> <span class="s2">"little"</span><span class="p">,</span> <span class="s2">"lamb"</span><span class="p">])</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"raw"</span><span class="p">])</span> -<span class="n">remover</span> <span class="o">=</span> <span class="n">StopWordsRemover</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s">"raw"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"filtered"</span><span class="p">)</span> +<span class="n">remover</span> <span class="o">=</span> <span class="n">StopWordsRemover</span><span class="p">(</span><span class="n">inputCol</span><span class="o">=</span><span class="s2">"raw"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"filtered"</span><span class="p">)</span> <span class="n">remover</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sentenceData</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> </pre></div> <div><small>Find full example code at "examples/src/main/python/ml/stopwords_remover_example.py" in the Spark repo.</small></div> @@ -1074,7 +1074,7 @@ for more details on the API.</p> <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.NGram">NGram Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span> <span class="k">val</span> <span class="n">wordDataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="nc">Seq</span><span class="o">(</span> <span class="o">(</span><span class="mi">0</span><span class="o">,</span> <span class="nc">Array</span><span class="o">(</span><span class="s">"Hi"</span><span class="o">,</span> <span class="s">"I"</span><span class="o">,</span> <span class="s">"heard"</span><span class="o">,</span> <span class="s">"about"</span><span class="o">,</span> <span class="s">"Spark"</span><span class="o">)),</span> @@ -1095,7 +1095,7 @@ for more details on the API.</p> <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/NGram.html">NGram Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.NGram</span><span class="o">;</span> @@ -1112,15 +1112,15 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="s">"Logistic"</span><span class="o">,</span> <span class="s">"regression"</span><span class="o">,</span> <span class="s">"models"</span><span class="o">,</span> <span class="s">"are"</span><span class="o">,</span> <span class="s">"neat"</span><span class="o">))</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span> +<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="n">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> + <span class="k">new</span> <span class="n">StructField</span><span class="o">(</span> <span class="s">"words"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">createArrayType</span><span class="o">(</span><span class="n">DataTypes</span><span class="o">.</span><span class="na">StringType</span><span class="o">),</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">())</span> <span class="o">});</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">,</span> <span class="n">schema</span><span class="o">);</span> -<span class="n">NGram</span> <span class="n">ngramTransformer</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">NGram</span><span class="o">().</span><span class="na">setN</span><span class="o">(</span><span class="mi">2</span><span class="o">).</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">);</span> +<span class="n">NGram</span> <span class="n">ngramTransformer</span> <span class="o">=</span> <span class="k">new</span> <span class="n">NGram</span><span class="o">().</span><span class="na">setN</span><span class="o">(</span><span class="mi">2</span><span class="o">).</span><span class="na">setInputCol</span><span class="o">(</span><span class="s">"words"</span><span class="o">).</span><span class="na">setOutputCol</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">);</span> <span class="n">Dataset</span><span class="o"><</span><span class="n">Row</span><span class="o">></span> <span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngramTransformer</span><span class="o">.</span><span class="na">transform</span><span class="o">(</span><span class="n">wordDataFrame</span><span class="o">);</span> <span class="n">ngramDataFrame</span><span class="o">.</span><span class="na">select</span><span class="o">(</span><span class="s">"ngrams"</span><span class="o">).</span><span class="na">show</span><span class="o">(</span><span class="kc">false</span><span class="o">);</span> @@ -1133,18 +1133,18 @@ for more details on the API.</p> <p>Refer to the <a href="api/python/pyspark.ml.html#pyspark.ml.feature.NGram">NGram Python docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">NGram</span> + <div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pyspark.ml.feature</span> <span class="kn">import</span> <span class="n">NGram</span> <span class="n">wordDataFrame</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">([</span> - <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s">"Hi"</span><span class="p">,</span> <span class="s">"I"</span><span class="p">,</span> <span class="s">"heard"</span><span class="p">,</span> <span class="s">"about"</span><span class="p">,</span> <span class="s">"Spark"</span><span class="p">]),</span> - <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s">"I"</span><span class="p">,</span> <span class="s">"wish"</span><span class="p">,</span> <span class="s">"Java"</span><span class="p">,</span> <span class="s">"could"</span><span class="p">,</span> <span class="s">"use"</span><span class="p">,</span> <span class="s">"case"</span><span class="p">,</span> <span class="s">"classes"</span><span class="p">]),</span> - <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="s">"Logistic"</span><span class="p">,</span> <span class="s">"regression"</span><span class="p">,</span> <span class="s">"models"</span><span class="p">,</span> <span class="s">"are"</span><span class="p">,</span> <span class="s">"neat"</span><span class="p">])</span> -<span class="p">],</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span> <span class="s">"words"</span><span class="p">])</span> + <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="p">[</span><span class="s2">"Hi"</span><span class="p">,</span> <span class="s2">"I"</span><span class="p">,</span> <span class="s2">"heard"</span><span class="p">,</span> <span class="s2">"about"</span><span class="p">,</span> <span class="s2">"Spark"</span><span class="p">]),</span> + <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="p">[</span><span class="s2">"I"</span><span class="p">,</span> <span class="s2">"wish"</span><span class="p">,</span> <span class="s2">"Java"</span><span class="p">,</span> <span class="s2">"could"</span><span class="p">,</span> <span class="s2">"use"</span><span class="p">,</span> <span class="s2">"case"</span><span class="p">,</span> <span class="s2">"classes"</span><span class="p">]),</span> + <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="p">[</span><span class="s2">"Logistic"</span><span class="p">,</span> <span class="s2">"regression"</span><span class="p">,</span> <span class="s2">"models"</span><span class="p">,</span> <span class="s2">"are"</span><span class="p">,</span> <span class="s2">"neat"</span><span class="p">])</span> +<span class="p">],</span> <span class="p">[</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"words"</span><span class="p">])</span> -<span class="n">ngram</span> <span class="o">=</span> <span class="n">NGram</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s">"ngrams"</span><span class="p">)</span> +<span class="n">ngram</span> <span class="o">=</span> <span class="n">NGram</span><span class="p">(</span><span class="n">n</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span> <span class="n">inputCol</span><span class="o">=</span><span class="s2">"words"</span><span class="p">,</span> <span class="n">outputCol</span><span class="o">=</span><span class="s2">"ngrams"</span><span class="p">)</span> <span class="n">ngramDataFrame</span> <span class="o">=</span> <span class="n">ngram</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">wordDataFrame</span><span class="p">)</span> -<span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s">"ngrams"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> +<span class="n">ngramDataFrame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="s2">"ngrams"</span><span class="p">)</span><span class="o">.</span><span class="n">show</span><span class="p">(</span><span class="n">truncate</span><span class="o">=</span><span class="bp">False</span><span class="p">)</span> </pre></div> <div><small>Find full example code at "examples/src/main/python/ml/n_gram_example.py" in the Spark repo.</small></div> </div> @@ -1165,7 +1165,7 @@ for <code>inputCol</code>.</p> <p>Refer to the <a href="api/scala/index.html#org.apache.spark.ml.feature.Binarizer">Binarizer Scala docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span> + <div class="highlight"><pre><span></span><span class="k">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span> <span class="k">val</span> <span class="n">data</span> <span class="k">=</span> <span class="nc">Array</span><span class="o">((</span><span class="mi">0</span><span class="o">,</span> <span class="mf">0.1</span><span class="o">),</span> <span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span> <span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">))</span> <span class="k">val</span> <span class="n">dataFrame</span> <span class="k">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="o">(</span><span class="n">data</span><span class="o">).</span><span class="n">toDF</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="s">"feature"</span><span class="o">)</span> @@ -1177,7 +1177,7 @@ for more details on the API.</p> <span class="k">val</span> <span class="n">binarizedDataFrame</span> <span class="k">=</span> <span class="n">binarizer</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataFrame</span><span class="o">)</span> -<span class="n">println</span><span class="o">(</span><span class="n">s</span><span class="s">"Binarizer output with Threshold = ${binarizer.getThreshold}"</span><span class="o">)</span> +<span class="n">println</span><span class="o">(</span><span class="s">s"Binarizer output with Threshold = </span><span class="si">${</span><span class="n">binarizer</span><span class="o">.</span><span class="n">getThreshold</span><span class="si">}</span><span class="s">"</span><span class="o">)</span> <span class="n">binarizedDataFrame</span><span class="o">.</span><span class="n">show</span><span class="o">()</span> </pre></div> <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala" in the Spark repo.</small></div> @@ -1188,7 +1188,7 @@ for more details on the API.</p> <p>Refer to the <a href="api/java/org/apache/spark/ml/feature/Binarizer.html">Binarizer Java docs</a> for more details on the API.</p> - <div class="highlight"><pre><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> + <div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> <span class="kn">import</span> <span class="nn">org.apache.spark.ml.feature.Binarizer</span><span class="o">;</span> @@ -1204,13 +1204,13 @@ for more details on the API.</p> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">1</span><span class="o">,</span> <span class="mf">0.8</span><span class="o">),</span> <span class="n">RowFactory</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="mi">2</span><span class="o">,</span> <span class="mf">0.2</span><span class="o">)</span> <span class="o">);</span> -<span class="n">StructType</span> <span class="n">schema</span> <span class="o">=</span> <span class="k">new</span> <span class="nf">StructType</span><span class="o">(</span><span class="k">new</span> <span class="n">StructField</span><span class="o">[]{</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"id"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">,</span> <span class="kc">false</span><span class="o">,</span> <span class="n">Metadata</span><span class="o">.</span><span class="na">empty</span><span class="o">()),</span> - <span class="k">new</span> <span class="nf">StructField</span><span class="o">(</span><span class="s">"feature"</span><span class="o">,</span> <span class="n">DataTypes</span><span class="o">.</span><span class="na">DoubleType</span><span class="o">,</span> <span class="kc">false</span>
<TRUNCATED> --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org