spark git commit: [SPARK-13068][PYSPARK][ML] Type conversion for Pyspark params

jkbradley Wed, 23 Mar 2016 11:21:07 -0700

Repository: spark
Updated Branches:
  refs/heads/master 48ee16d80 -> 30bdb5cbd



[SPARK-13068][PYSPARK][ML] Type conversion for Pyspark params

## What changes were proposed in this pull request?

This patch adds type conversion functionality for parameters in Pyspark. A 
`typeConverter` field is added to the constructor of `Param` class. This 
argument is a function which converts values passed to this param to the 
appropriate type if possible. This is beneficial so that the params can fail at 
set time if they are given inappropriate values, but even more so because 
coherent error messages are now provided when Py4J cannot cast the python type 
to the appropriate Java type.

This patch also adds a `TypeConverters` class with factory methods for common 
type conversions. Most of the changes involve adding these factory type 
converters to existing params. The previous solution to this issue, 
`expectedType`, is deprecated and can be removed in 2.1.0 as discussed on the 
Jira.

## How was this patch tested?

Unit tests were added in python/pyspark/ml/tests.py to test parameter type 
conversion. These tests check that values that should be convertible are 
converted correctly, and that the appropriate errors are thrown when invalid 
values are provided.

Author: sethah <[email protected]>

Closes #11663 from sethah/SPARK-13068-tc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30bdb5cb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30bdb5cb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30bdb5cb

Branch: refs/heads/master
Commit: 30bdb5cbd9aec191cf15cdc83c3fee375c04c2b2
Parents: 48ee16d
Author: sethah <[email protected]>
Authored: Wed Mar 23 11:20:44 2016 -0700
Committer: Joseph K. Bradley <[email protected]>
Committed: Wed Mar 23 11:20:44 2016 -0700

----------------------------------------------------------------------
 python/pyspark/ml/classification.py             |  20 +-
 python/pyspark/ml/clustering.py                 |  14 +-
 python/pyspark/ml/feature.py                    |  95 ++++++----
 python/pyspark/ml/param/__init__.py             | 181 +++++++++++++++++--
 .../pyspark/ml/param/_shared_params_code_gen.py |  91 ++++++----
 python/pyspark/ml/param/shared.py               |  58 +++---
 python/pyspark/ml/recommendation.py             |  25 ++-
 python/pyspark/ml/regression.py                 |  25 ++-
 python/pyspark/ml/tests.py                      |  83 +++++++--
 python/pyspark/ml/tuning.py                     |   5 +-
 10 files changed, 421 insertions(+), 176 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 8075108..fdeccf8 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -20,6 +20,7 @@ import warnings
 from pyspark import since
 from pyspark.ml.util import *
 from pyspark.ml.wrapper import JavaEstimator, JavaModel
+from pyspark.ml.param import TypeConverters
 from pyspark.ml.param.shared import *
 from pyspark.ml.regression import (
     RandomForestParams, TreeEnsembleParams, DecisionTreeModel, 
TreeEnsembleModels)
@@ -87,7 +88,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 
     threshold = Param(Params._dummy(), "threshold",
                       "Threshold in binary classification prediction, in range 
[0, 1]." +
-                      " If threshold and thresholds are both set, they must 
match.")
+                      " If threshold and thresholds are both set, they must 
match.",
+                      typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
@@ -243,7 +245,7 @@ class TreeClassifierParams(object):
     impurity = Param(Params._dummy(), "impurity",
                      "Criterion used for information gain calculation 
(case-insensitive). " +
                      "Supported options: " +
-                     ", ".join(supportedImpurities))
+                     ", ".join(supportedImpurities), 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(TreeClassifierParams, self).__init__()
@@ -534,7 +536,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
 
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize 
(case-insensitive). " +
-                     "Supported options: " + ", 
".join(GBTParams.supportedLossTypes))
+                     "Supported options: " + ", 
".join(GBTParams.supportedLossTypes),
+                     typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
@@ -652,9 +655,10 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol, H
     """
 
     smoothing = Param(Params._dummy(), "smoothing", "The smoothing parameter, 
should be >= 0, " +
-                      "default is 1.0")
+                      "default is 1.0", typeConverter=TypeConverters.toFloat)
     modelType = Param(Params._dummy(), "modelType", "The model type which is a 
string " +
-                      "(case-sensitive). Supported options: multinomial 
(default) and bernoulli.")
+                      "(case-sensitive). Supported options: multinomial 
(default) and bernoulli.",
+                      typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
@@ -782,11 +786,13 @@ class MultilayerPerceptronClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol,
 
     layers = Param(Params._dummy(), "layers", "Sizes of layers from input 
layer to output layer " +
                    "E.g., Array(780, 100, 10) means 780 inputs, one hidden 
layer with 100 " +
-                   "neurons and output layer of 10 neurons, default is [1, 
1].")
+                   "neurons and output layer of 10 neurons, default is [1, 
1].",
+                   typeConverter=TypeConverters.toListInt)
     blockSize = Param(Params._dummy(), "blockSize", "Block size for stacking 
input data in " +
                       "matrices. Data is stacked within partitions. If block 
size is more than " +
                       "remaining data in a partition then it is adjusted to 
the size of this " +
-                      "data. Recommended size is between 10 and 1000, default 
is 128.")
+                      "data. Recommended size is between 10 and 1000, default 
is 128.",
+                      typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 2db5b82..e22d5c8 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -87,12 +87,14 @@ class KMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIter, HasTol
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create")
+    k = Param(Params._dummy(), "k", "number of clusters to create",
+              typeConverter=TypeConverters.toInt)
     initMode = Param(Params._dummy(), "initMode",
                      "the initialization algorithm. This can be either 
\"random\" to " +
                      "choose random points as initial cluster centers, or 
\"k-means||\" " +
-                     "to use a parallel variant of k-means++")
-    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means 
initialization mode")
+                     "to use a parallel variant of k-means++", 
TypeConverters.toString)
+    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means 
initialization mode",
+                      typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -227,10 +229,12 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIte
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create")
+    k = Param(Params._dummy(), "k", "number of clusters to create",
+              typeConverter=TypeConverters.toInt)
     minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
                                     "the minimum number of points (if >= 1.0) 
" +
-                                    "or the minimum proportion")
+                                    "or the minimum proportion",
+                                    typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", 
maxIter=20,

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 16cb9d1..86b5328 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -83,7 +83,8 @@ class Binarizer(JavaTransformer, HasInputCol, HasOutputCol, 
JavaMLReadable, Java
     """
 
     threshold = Param(Params._dummy(), "threshold",
-                      "threshold in binary classification prediction, in range 
[0, 1]")
+                      "threshold in binary classification prediction, in range 
[0, 1]",
+                      typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, threshold=0.0, inputCol=None, outputCol=None):
@@ -159,7 +160,8 @@ class Bucketizer(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable, Jav
               "range [x,y) except the last bucket, which also includes y. The 
splits " +
               "should be strictly increasing. Values at -inf, inf must be 
explicitly " +
               "provided to cover all Double values; otherwise, values outside 
the splits " +
-              "specified will be treated as errors.")
+              "specified will be treated as errors.",
+              typeConverter=TypeConverters.toListFloat)
 
     @keyword_only
     def __init__(self, splits=None, inputCol=None, outputCol=None):
@@ -243,15 +245,17 @@ class CountVectorizer(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable,
         " threshold are ignored. If this is an integer >= 1, then this 
specifies a count (of" +
         " times the term must appear in the document); if this is a double in 
[0,1), then this " +
         "specifies a fraction (out of the document's token count). Note that 
the parameter is " +
-        "only used in transform of CountVectorizerModel and does not affect 
fitting. Default 1.0")
+        "only used in transform of CountVectorizerModel and does not affect 
fitting. Default 1.0",
+        typeConverter=TypeConverters.toFloat)
     minDF = Param(
         Params._dummy(), "minDF", "Specifies the minimum number of" +
         " different documents a term must appear in to be included in the 
vocabulary." +
         " If this is an integer >= 1, this specifies the number of documents 
the term must" +
         " appear in; if this is a double in [0,1), then this specifies the 
fraction of documents." +
-        " Default 1.0")
+        " Default 1.0", typeConverter=TypeConverters.toFloat)
     vocabSize = Param(
-        Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 
<< 18.")
+        Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 
<< 18.",
+        typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, 
outputCol=None):
@@ -375,7 +379,7 @@ class DCT(JavaTransformer, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWrit
     """
 
     inverse = Param(Params._dummy(), "inverse", "Set transformer to perform 
inverse DCT, " +
-                    "default False.")
+                    "default False.", typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, inverse=False, inputCol=None, outputCol=None):
@@ -441,8 +445,8 @@ class ElementwiseProduct(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReada
     .. versionadded:: 1.5.0
     """
 
-    scalingVec = Param(Params._dummy(), "scalingVec", "vector for hadamard 
product, " +
-                       "it must be MLlib Vector type.")
+    scalingVec = Param(Params._dummy(), "scalingVec", "Vector for hadamard 
product.",
+                       typeConverter=TypeConverters.toVector)
 
     @keyword_only
     def __init__(self, scalingVec=None, inputCol=None, outputCol=None):
@@ -564,7 +568,8 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWritab
     """
 
     minDocFreq = Param(Params._dummy(), "minDocFreq",
-                       "minimum of documents in which a term should appear for 
filtering")
+                       "minimum of documents in which a term should appear for 
filtering",
+                       typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, minDocFreq=0, inputCol=None, outputCol=None):
@@ -746,8 +751,10 @@ class MinMaxScaler(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable, Jav
     .. versionadded:: 1.6.0
     """
 
-    min = Param(Params._dummy(), "min", "Lower bound of the output feature 
range")
-    max = Param(Params._dummy(), "max", "Upper bound of the output feature 
range")
+    min = Param(Params._dummy(), "min", "Lower bound of the output feature 
range",
+                typeConverter=TypeConverters.toFloat)
+    max = Param(Params._dummy(), "max", "Upper bound of the output feature 
range",
+                typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, min=0.0, max=1.0, inputCol=None, outputCol=None):
@@ -870,7 +877,8 @@ class NGram(JavaTransformer, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWr
     .. versionadded:: 1.5.0
     """
 
-    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)")
+    n = Param(Params._dummy(), "n", "number of elements per n-gram (>=1)",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, n=2, inputCol=None, outputCol=None):
@@ -936,7 +944,8 @@ class Normalizer(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable, Jav
     .. versionadded:: 1.4.0
     """
 
-    p = Param(Params._dummy(), "p", "the p norm value.")
+    p = Param(Params._dummy(), "p", "the p norm value.",
+              typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, p=2.0, inputCol=None, outputCol=None):
@@ -1018,7 +1027,8 @@ class OneHotEncoder(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable,
     .. versionadded:: 1.4.0
     """
 
-    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last 
category")
+    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last 
category",
+                     typeConverter=TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, dropLast=True, inputCol=None, outputCol=None):
@@ -1085,7 +1095,8 @@ class PolynomialExpansion(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLRead
     .. versionadded:: 1.4.0
     """
 
-    degree = Param(Params._dummy(), "degree", "the polynomial degree to expand 
(>= 1)")
+    degree = Param(Params._dummy(), "degree", "the polynomial degree to expand 
(>= 1)",
+                   typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, degree=2, inputCol=None, outputCol=None):
@@ -1163,7 +1174,8 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, 
HasOutputCol, HasSeed, Jav
     # a placeholder to make it appear in the generated doc
     numBuckets = Param(Params._dummy(), "numBuckets",
                        "Maximum number of buckets (quantiles, or " +
-                       "categories) into which data points are grouped. Must 
be >= 2. Default 2.")
+                       "categories) into which data points are grouped. Must 
be >= 2. Default 2.",
+                       typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, numBuckets=2, inputCol=None, outputCol=None, seed=None):
@@ -1255,11 +1267,13 @@ class RegexTokenizer(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable,
     .. versionadded:: 1.4.0
     """
 
-    minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token 
length (>= 0)")
+    minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token 
length (>= 0)",
+                           typeConverter=TypeConverters.toInt)
     gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) 
or matches tokens")
-    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) 
used for tokenizing")
+    pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) 
used for tokenizing",
+                    TypeConverters.toString)
     toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert 
all characters to " +
-                        "lowercase before tokenizing")
+                        "lowercase before tokenizing", 
TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, minTokenLength=1, gaps=True, pattern="\\s+", 
inputCol=None,
@@ -1370,7 +1384,7 @@ class SQLTransformer(JavaTransformer, JavaMLReadable, 
JavaMLWritable):
     .. versionadded:: 1.6.0
     """
 
-    statement = Param(Params._dummy(), "statement", "SQL statement")
+    statement = Param(Params._dummy(), "statement", "SQL statement", 
TypeConverters.toString)
 
     @keyword_only
     def __init__(self, statement=None):
@@ -1444,8 +1458,9 @@ class StandardScaler(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable, J
     .. versionadded:: 1.4.0
     """
 
-    withMean = Param(Params._dummy(), "withMean", "Center data with mean")
-    withStd = Param(Params._dummy(), "withStd", "Scale to unit standard 
deviation")
+    withMean = Param(Params._dummy(), "withMean", "Center data with mean", 
TypeConverters.toBoolean)
+    withStd = Param(Params._dummy(), "withStd", "Scale to unit standard 
deviation",
+                    TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, withMean=False, withStd=True, inputCol=None, 
outputCol=None):
@@ -1628,7 +1643,8 @@ class IndexToString(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable,
 
     labels = Param(Params._dummy(), "labels",
                    "Optional array of labels specifying index-string mapping." 
+
-                   " If not provided or if empty, then metadata from inputCol 
is used instead.")
+                   " If not provided or if empty, then metadata from inputCol 
is used instead.",
+                   typeConverter=TypeConverters.toListString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, labels=None):
@@ -1689,9 +1705,10 @@ class StopWordsRemover(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadabl
     .. versionadded:: 1.6.0
     """
 
-    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered 
out")
+    stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered 
out",
+                      typeConverter=TypeConverters.toListString)
     caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a 
case sensitive " +
-                          "comparison over the stop words")
+                          "comparison over the stop words", 
TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, stopWords=None,
@@ -1930,7 +1947,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable, Ja
     maxCategories = Param(Params._dummy(), "maxCategories",
                           "Threshold for the number of values a categorical 
feature can take " +
                           "(>= 2). If a feature is found to have > 
maxCategories values, then " +
-                          "it is declared continuous.")
+                          "it is declared continuous.", 
typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, maxCategories=20, inputCol=None, outputCol=None):
@@ -2035,11 +2052,12 @@ class VectorSlicer(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable, J
     """
 
     indices = Param(Params._dummy(), "indices", "An array of indices to select 
features from " +
-                    "a vector column. There can be no overlap with names.")
+                    "a vector column. There can be no overlap with names.",
+                    typeConverter=TypeConverters.toListInt)
     names = Param(Params._dummy(), "names", "An array of feature names to 
select features from " +
                   "a vector column. These names must be specified by ML " +
                   "org.apache.spark.ml.attribute.Attribute. There can be no 
overlap with " +
-                  "indices.")
+                  "indices.", typeConverter=TypeConverters.toListString)
 
     @keyword_only
     def __init__(self, inputCol=None, outputCol=None, indices=None, 
names=None):
@@ -2147,12 +2165,14 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, 
HasSeed, HasInputCol, Has
     """
 
     vectorSize = Param(Params._dummy(), "vectorSize",
-                       "the dimension of codes after transforming from words")
+                       "the dimension of codes after transforming from words",
+                       typeConverter=TypeConverters.toInt)
     numPartitions = Param(Params._dummy(), "numPartitions",
-                          "number of partitions for sentences of words")
+                          "number of partitions for sentences of words",
+                          typeConverter=TypeConverters.toInt)
     minCount = Param(Params._dummy(), "minCount",
                      "the minimum number of times a token must appear to be 
included in the " +
-                     "word2vec model's vocabulary")
+                     "word2vec model's vocabulary", 
typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, vectorSize=100, minCount=5, numPartitions=1, 
stepSize=0.025, maxIter=1,
@@ -2293,7 +2313,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWritab
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "the number of principal components")
+    k = Param(Params._dummy(), "k", "the number of principal components",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, k=None, inputCol=None, outputCol=None):
@@ -2425,7 +2446,7 @@ class RFormula(JavaEstimator, HasFeaturesCol, 
HasLabelCol, JavaMLReadable, JavaM
     .. versionadded:: 1.5.0
     """
 
-    formula = Param(Params._dummy(), "formula", "R model formula")
+    formula = Param(Params._dummy(), "formula", "R model formula", 
TypeConverters.toString)
 
     @keyword_only
     def __init__(self, formula=None, featuresCol="features", labelCol="label"):
@@ -2511,12 +2532,11 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, 
HasOutputCol, HasLabelCol, Ja
     .. versionadded:: 2.0.0
     """
 
-    # a placeholder to make it appear in the generated doc
     numTopFeatures = \
         Param(Params._dummy(), "numTopFeatures",
               "Number of features that selector will select, ordered by 
statistics value " +
               "descending. If the number of features is < numTopFeatures, then 
this will select " +
-              "all features.")
+              "all features.", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, numTopFeatures=50, featuresCol="features", 
outputCol=None, labelCol="label"):
@@ -2525,11 +2545,6 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, 
HasOutputCol, HasLabelCol, Ja
         """
         super(ChiSqSelector, self).__init__()
         self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid)
-        self.numTopFeatures = \
-            Param(self, "numTopFeatures",
-                  "Number of features that selector will select, ordered by 
statistics value " +
-                  "descending. If the number of features is < numTopFeatures, 
then this will " +
-                  "select all features.")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/__init__.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/__init__.py 
b/python/pyspark/ml/param/__init__.py
index c0f0a71..a126529 100644
--- a/python/pyspark/ml/param/__init__.py
+++ b/python/pyspark/ml/param/__init__.py
@@ -14,31 +14,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import array
+import sys
+if sys.version > '3':
+    basestring = str
+    xrange = range
+    unicode = str
 
 from abc import ABCMeta
 import copy
+import numpy as np
+import warnings
 
 from pyspark import since
 from pyspark.ml.util import Identifiable
+from pyspark.mllib.linalg import DenseVector, Vector
 
 
-__all__ = ['Param', 'Params']
+__all__ = ['Param', 'Params', 'TypeConverters']
 
 
 class Param(object):
     """
     A param with self-contained documentation.
 
+    Note: `expectedType` is deprecated and will be removed in 2.1. Use 
typeConverter instead,
+          as a keyword argument.
+
     .. versionadded:: 1.3.0
     """
 
-    def __init__(self, parent, name, doc, expectedType=None):
+    def __init__(self, parent, name, doc, expectedType=None, 
typeConverter=None):
         if not isinstance(parent, Identifiable):
             raise TypeError("Parent must be an Identifiable but got type %s." 
% type(parent))
         self.parent = parent.uid
         self.name = str(name)
         self.doc = str(doc)
         self.expectedType = expectedType
+        if expectedType is not None:
+            warnings.warn("expectedType is deprecated and will be removed in 
2.1. " +
+                          "Use typeConverter instead, as a keyword argument.")
+        self.typeConverter = TypeConverters.identity if typeConverter is None 
else typeConverter
 
     def _copy_new_parent(self, parent):
         """Copy the current param to a new parent, must be a dummy param."""
@@ -65,6 +81,146 @@ class Param(object):
             return False
 
 
+class TypeConverters(object):
+    """
+    .. note:: DeveloperApi
+
+    Factory methods for common type conversion functions for 
`Param.typeConverter`.
+
+    .. versionadded:: 2.0.0
+    """
+
+    @staticmethod
+    def _is_numeric(value):
+        vtype = type(value)
+        return vtype in [int, float, np.float64, np.int64] or vtype.__name__ 
== 'long'
+
+    @staticmethod
+    def _is_integer(value):
+        return TypeConverters._is_numeric(value) and float(value).is_integer()
+
+    @staticmethod
+    def _can_convert_to_list(value):
+        vtype = type(value)
+        return vtype in [list, np.ndarray, tuple, xrange, array.array] or 
isinstance(value, Vector)
+
+    @staticmethod
+    def _can_convert_to_string(value):
+        vtype = type(value)
+        return isinstance(value, basestring) or vtype in [np.unicode_, 
np.string_, np.str_]
+
+    @staticmethod
+    def identity(value):
+        """
+        Dummy converter that just returns value.
+        """
+        return value
+
+    @staticmethod
+    def toList(value):
+        """
+        Convert a value to a list, if possible.
+        """
+        if type(value) == list:
+            return value
+        elif type(value) in [np.ndarray, tuple, xrange, array.array]:
+            return list(value)
+        elif isinstance(value, Vector):
+            return list(value.toArray())
+        else:
+            raise TypeError("Could not convert %s to list" % value)
+
+    @staticmethod
+    def toListFloat(value):
+        """
+        Convert a value to list of floats, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_numeric(v), value)):
+                return [float(v) for v in value]
+        raise TypeError("Could not convert %s to list of floats" % value)
+
+    @staticmethod
+    def toListInt(value):
+        """
+        Convert a value to list of ints, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_integer(v), value)):
+                return [int(v) for v in value]
+        raise TypeError("Could not convert %s to list of ints" % value)
+
+    @staticmethod
+    def toListString(value):
+        """
+        Convert a value to list of strings, if possible.
+        """
+        if TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._can_convert_to_string(v), 
value)):
+                return [TypeConverters.toString(v) for v in value]
+        raise TypeError("Could not convert %s to list of strings" % value)
+
+    @staticmethod
+    def toVector(value):
+        """
+        Convert a value to a MLlib Vector, if possible.
+        """
+        if isinstance(value, Vector):
+            return value
+        elif TypeConverters._can_convert_to_list(value):
+            value = TypeConverters.toList(value)
+            if all(map(lambda v: TypeConverters._is_numeric(v), value)):
+                return DenseVector(value)
+        raise TypeError("Could not convert %s to vector" % value)
+
+    @staticmethod
+    def toFloat(value):
+        """
+        Convert a value to a float, if possible.
+        """
+        if TypeConverters._is_numeric(value):
+            return float(value)
+        else:
+            raise TypeError("Could not convert %s to float" % value)
+
+    @staticmethod
+    def toInt(value):
+        """
+        Convert a value to an int, if possible.
+        """
+        if TypeConverters._is_integer(value):
+            return int(value)
+        else:
+            raise TypeError("Could not convert %s to int" % value)
+
+    @staticmethod
+    def toString(value):
+        """
+        Convert a value to a string, if possible.
+        """
+        if isinstance(value, basestring):
+            return value
+        elif type(value) in [np.string_, np.str_]:
+            return str(value)
+        elif type(value) == np.unicode_:
+            return unicode(value)
+        else:
+            raise TypeError("Could not convert %s to string type" % 
type(value))
+
+    @staticmethod
+    def toBoolean(value):
+        """
+        Convert a value to a boolean, if possible.
+        """
+        if type(value) == bool:
+            return value
+        else:
+            raise TypeError("Boolean Param requires value of type bool. Found 
%s." % type(value))
+
+
 class Params(Identifiable):
     """
     Components that take parameters. This also provides an internal
@@ -275,23 +431,12 @@ class Params(Identifiable):
         """
         for param, value in kwargs.items():
             p = getattr(self, param)
-            if p.expectedType is None or type(value) == p.expectedType or 
value is None:
-                self._paramMap[getattr(self, param)] = value
-            else:
+            if value is not None:
                 try:
-                    # Try and do "safe" conversions that don't lose information
-                    if p.expectedType == float:
-                        self._paramMap[getattr(self, param)] = float(value)
-                    # Python 3 unified long & int
-                    elif p.expectedType == int and type(value).__name__ == 
'long':
-                        self._paramMap[getattr(self, param)] = value
-                    else:
-                        raise Exception(
-                            "Provided type {0} incompatible with type {1} for 
param {2}"
-                            .format(type(value), p.expectedType, p))
-                except ValueError:
-                    raise Exception(("Failed to convert {0} to type {1} for 
param {2}"
-                                     .format(type(value), p.expectedType, p)))
+                    value = p.typeConverter(value)
+                except TypeError as e:
+                    raise TypeError('Invalid param value given for param "%s". 
%s' % (p.name, e))
+            self._paramMap[p] = value
         return self
 
     def _setDefault(self, **kwargs):

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 5e297b8..7dd2937 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -38,7 +38,7 @@ header = """#
 # python _shared_params_code_gen.py > shared.py
 
 
-def _gen_param_header(name, doc, defaultValueStr, expectedType):
+def _gen_param_header(name, doc, defaultValueStr, typeConverter):
     """
     Generates the header part for shared variables
 
@@ -50,7 +50,7 @@ def _gen_param_header(name, doc, defaultValueStr, 
expectedType):
     Mixin for param $name: $doc
     """
 
-    $name = Param(Params._dummy(), "$name", "$doc", $expectedType)
+    $name = Param(Params._dummy(), "$name", "$doc", 
typeConverter=$typeConverter)
 
     def __init__(self):
         super(Has$Name, self).__init__()'''
@@ -60,15 +60,14 @@ def _gen_param_header(name, doc, defaultValueStr, 
expectedType):
         self._setDefault($name=$defaultValueStr)'''
 
     Name = name[0].upper() + name[1:]
-    expectedTypeName = str(expectedType)
-    if expectedType is not None:
-        expectedTypeName = expectedType.__name__
+    if typeConverter is None:
+        typeConverter = str(None)
     return template \
         .replace("$name", name) \
         .replace("$Name", Name) \
         .replace("$doc", doc) \
         .replace("$defaultValueStr", str(defaultValueStr)) \
-        .replace("$expectedType", expectedTypeName)
+        .replace("$typeConverter", typeConverter)
 
 
 def _gen_param_code(name, doc, defaultValueStr):
@@ -105,64 +104,73 @@ def _gen_param_code(name, doc, defaultValueStr):
 if __name__ == "__main__":
     print(header)
     print("\n# DO NOT MODIFY THIS FILE! It was generated by 
_shared_params_code_gen.py.\n")
-    print("from pyspark.ml.param import Param, Params\n\n")
+    print("from pyspark.ml.param import *\n\n")
     shared = [
-        ("maxIter", "max number of iterations (>= 0).", None, int),
-        ("regParam", "regularization parameter (>= 0).", None, float),
-        ("featuresCol", "features column name.", "'features'", str),
-        ("labelCol", "label column name.", "'label'", str),
-        ("predictionCol", "prediction column name.", "'prediction'", str),
+        ("maxIter", "max number of iterations (>= 0).", None, 
"TypeConverters.toInt"),
+        ("regParam", "regularization parameter (>= 0).", None, 
"TypeConverters.toFloat"),
+        ("featuresCol", "features column name.", "'features'", 
"TypeConverters.toString"),
+        ("labelCol", "label column name.", "'label'", 
"TypeConverters.toString"),
+        ("predictionCol", "prediction column name.", "'prediction'", 
"TypeConverters.toString"),
         ("probabilityCol", "Column name for predicted class conditional 
probabilities. " +
          "Note: Not all models output well-calibrated probability estimates! 
These probabilities " +
-         "should be treated as confidences, not precise probabilities.", 
"'probability'", str),
+         "should be treated as confidences, not precise probabilities.", 
"'probability'",
+         "TypeConverters.toString"),
         ("rawPredictionCol", "raw prediction (a.k.a. confidence) column 
name.", "'rawPrediction'",
-         str),
-        ("inputCol", "input column name.", None, str),
-        ("inputCols", "input column names.", None, None),
-        ("outputCol", "output column name.", "self.uid + '__output'", str),
-        ("numFeatures", "number of features.", None, int),
+         "TypeConverters.toString"),
+        ("inputCol", "input column name.", None, "TypeConverters.toString"),
+        ("inputCols", "input column names.", None, 
"TypeConverters.toListString"),
+        ("outputCol", "output column name.", "self.uid + '__output'", 
"TypeConverters.toString"),
+        ("numFeatures", "number of features.", None, "TypeConverters.toInt"),
         ("checkpointInterval", "set checkpoint interval (>= 1) or disable 
checkpoint (-1). " +
-         "E.g. 10 means that the cache will get checkpointed every 10 
iterations.", None, int),
-        ("seed", "random seed.", "hash(type(self).__name__)", int),
-        ("tol", "the convergence tolerance for iterative algorithms.", None, 
float),
-        ("stepSize", "Step size to be used for each iteration of 
optimization.", None, float),
+         "E.g. 10 means that the cache will get checkpointed every 10 
iterations.", None,
+         "TypeConverters.toInt"),
+        ("seed", "random seed.", "hash(type(self).__name__)", 
"TypeConverters.toInt"),
+        ("tol", "the convergence tolerance for iterative algorithms.", None,
+         "TypeConverters.toFloat"),
+        ("stepSize", "Step size to be used for each iteration of 
optimization.", None,
+         "TypeConverters.toFloat"),
         ("handleInvalid", "how to handle invalid entries. Options are skip 
(which will filter " +
          "out rows with bad values), or error (which will throw an errror). 
More options may be " +
-         "added later.", None, str),
+         "added later.", None, "TypeConverters.toBoolean"),
         ("elasticNetParam", "the ElasticNet mixing parameter, in range [0, 1]. 
For alpha = 0, " +
-         "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", 
"0.0", float),
-        ("fitIntercept", "whether to fit an intercept term.", "True", bool),
+         "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.", 
"0.0",
+         "TypeConverters.toFloat"),
+        ("fitIntercept", "whether to fit an intercept term.", "True", 
"TypeConverters.toBoolean"),
         ("standardization", "whether to standardize the training features 
before fitting the " +
-         "model.", "True", bool),
+         "model.", "True", "TypeConverters.toBoolean"),
         ("thresholds", "Thresholds in multi-class classification to adjust the 
probability of " +
          "predicting each class. Array must have length equal to the number of 
classes, with " +
          "values >= 0. The class with largest value p/t is predicted, where p 
is the original " +
-         "probability of that class and t is the class' threshold.", None, 
None),
+         "probability of that class and t is the class' threshold.", None,
+         "TypeConverters.toListFloat"),
         ("weightCol", "weight column name. If this is not set or empty, we 
treat " +
-         "all instance weights as 1.0.", None, str),
+         "all instance weights as 1.0.", None, "TypeConverters.toString"),
         ("solver", "the solver algorithm for optimization. If this is not set 
or empty, " +
-         "default value is 'auto'.", "'auto'", str)]
+         "default value is 'auto'.", "'auto'", "TypeConverters.toString")]
 
     code = []
-    for name, doc, defaultValueStr, expectedType in shared:
-        param_code = _gen_param_header(name, doc, defaultValueStr, 
expectedType)
+    for name, doc, defaultValueStr, typeConverter in shared:
+        param_code = _gen_param_header(name, doc, defaultValueStr, 
typeConverter)
         code.append(param_code + "\n" + _gen_param_code(name, doc, 
defaultValueStr))
 
     decisionTreeParams = [
         ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 
leaf node; " +
-         "depth 1 means 1 internal node + 2 leaf nodes."),
+         "depth 1 means 1 internal node + 2 leaf nodes.", 
"TypeConverters.toInt"),
         ("maxBins", "Max number of bins for" +
          " discretizing continuous features.  Must be >=2 and >= number of 
categories for any" +
-         " categorical feature."),
+         " categorical feature.", "TypeConverters.toInt"),
         ("minInstancesPerNode", "Minimum number of instances each child must 
have after split. " +
          "If a split causes the left or right child to have fewer than 
minInstancesPerNode, the " +
-         "split will be discarded as invalid. Should be >= 1."),
-        ("minInfoGain", "Minimum information gain for a split to be considered 
at a tree node."),
-        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation."),
+         "split will be discarded as invalid. Should be >= 1.", 
"TypeConverters.toInt"),
+        ("minInfoGain", "Minimum information gain for a split to be considered 
at a tree node.",
+         "TypeConverters.toFloat"),
+        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation.",
+         "TypeConverters.toInt"),
         ("cacheNodeIds", "If false, the algorithm will pass trees to executors 
to match " +
          "instances with nodes. If true, the algorithm will cache node IDs for 
each instance. " +
          "Caching can speed up training of deeper trees. Users can set how 
often should the " +
-         "cache be checkpointed or disable it by setting checkpointInterval.")]
+         "cache be checkpointed or disable it by setting checkpointInterval.",
+         "TypeConverters.toBoolean")]
 
     decisionTreeCode = '''class DecisionTreeParams(Params):
     """
@@ -175,9 +183,12 @@ if __name__ == "__main__":
         super(DecisionTreeParams, self).__init__()'''
     dtParamMethods = ""
     dummyPlaceholders = ""
-    paramTemplate = """$name = Param($owner, "$name", "$doc")"""
-    for name, doc in decisionTreeParams:
-        variable = paramTemplate.replace("$name", name).replace("$doc", doc)
+    paramTemplate = """$name = Param($owner, "$name", "$doc", 
typeConverter=$typeConverterStr)"""
+    for name, doc, typeConverterStr in decisionTreeParams:
+        if typeConverterStr is None:
+            typeConverterStr = str(None)
+        variable = paramTemplate.replace("$name", name).replace("$doc", doc) \
+            .replace("$typeConverterStr", typeConverterStr)
         dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + 
"\n    "
         dtParamMethods += _gen_param_code(name, doc, None) + "\n"
     code.append(decisionTreeCode.replace("$dummyPlaceHolders", 
dummyPlaceholders) + "\n" +

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index db4a8a5..83fbd59 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -17,7 +17,7 @@
 
 # DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.
 
-from pyspark.ml.param import Param, Params
+from pyspark.ml.param import *
 
 
 class HasMaxIter(Params):
@@ -25,7 +25,7 @@ class HasMaxIter(Params):
     Mixin for param maxIter: max number of iterations (>= 0).
     """
 
-    maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 
0).", int)
+    maxIter = Param(Params._dummy(), "maxIter", "max number of iterations (>= 
0).", typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasMaxIter, self).__init__()
@@ -49,7 +49,7 @@ class HasRegParam(Params):
     Mixin for param regParam: regularization parameter (>= 0).
     """
 
-    regParam = Param(Params._dummy(), "regParam", "regularization parameter 
(>= 0).", float)
+    regParam = Param(Params._dummy(), "regParam", "regularization parameter 
(>= 0).", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasRegParam, self).__init__()
@@ -73,7 +73,7 @@ class HasFeaturesCol(Params):
     Mixin for param featuresCol: features column name.
     """
 
-    featuresCol = Param(Params._dummy(), "featuresCol", "features column 
name.", str)
+    featuresCol = Param(Params._dummy(), "featuresCol", "features column 
name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasFeaturesCol, self).__init__()
@@ -98,7 +98,7 @@ class HasLabelCol(Params):
     Mixin for param labelCol: label column name.
     """
 
-    labelCol = Param(Params._dummy(), "labelCol", "label column name.", str)
+    labelCol = Param(Params._dummy(), "labelCol", "label column name.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasLabelCol, self).__init__()
@@ -123,7 +123,7 @@ class HasPredictionCol(Params):
     Mixin for param predictionCol: prediction column name.
     """
 
-    predictionCol = Param(Params._dummy(), "predictionCol", "prediction column 
name.", str)
+    predictionCol = Param(Params._dummy(), "predictionCol", "prediction column 
name.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasPredictionCol, self).__init__()
@@ -148,7 +148,7 @@ class HasProbabilityCol(Params):
     Mixin for param probabilityCol: Column name for predicted class 
conditional probabilities. Note: Not all models output well-calibrated 
probability estimates! These probabilities should be treated as confidences, 
not precise probabilities.
     """
 
-    probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for 
predicted class conditional probabilities. Note: Not all models output 
well-calibrated probability estimates! These probabilities should be treated as 
confidences, not precise probabilities.", str)
+    probabilityCol = Param(Params._dummy(), "probabilityCol", "Column name for 
predicted class conditional probabilities. Note: Not all models output 
well-calibrated probability estimates! These probabilities should be treated as 
confidences, not precise probabilities.", typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasProbabilityCol, self).__init__()
@@ -173,7 +173,7 @@ class HasRawPredictionCol(Params):
     Mixin for param rawPredictionCol: raw prediction (a.k.a. confidence) 
column name.
     """
 
-    rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw 
prediction (a.k.a. confidence) column name.", str)
+    rawPredictionCol = Param(Params._dummy(), "rawPredictionCol", "raw 
prediction (a.k.a. confidence) column name.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasRawPredictionCol, self).__init__()
@@ -198,7 +198,7 @@ class HasInputCol(Params):
     Mixin for param inputCol: input column name.
     """
 
-    inputCol = Param(Params._dummy(), "inputCol", "input column name.", str)
+    inputCol = Param(Params._dummy(), "inputCol", "input column name.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasInputCol, self).__init__()
@@ -222,7 +222,7 @@ class HasInputCols(Params):
     Mixin for param inputCols: input column names.
     """
 
-    inputCols = Param(Params._dummy(), "inputCols", "input column names.", 
None)
+    inputCols = Param(Params._dummy(), "inputCols", "input column names.", 
typeConverter=TypeConverters.toListString)
 
     def __init__(self):
         super(HasInputCols, self).__init__()
@@ -246,7 +246,7 @@ class HasOutputCol(Params):
     Mixin for param outputCol: output column name.
     """
 
-    outputCol = Param(Params._dummy(), "outputCol", "output column name.", str)
+    outputCol = Param(Params._dummy(), "outputCol", "output column name.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasOutputCol, self).__init__()
@@ -271,7 +271,7 @@ class HasNumFeatures(Params):
     Mixin for param numFeatures: number of features.
     """
 
-    numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", 
int)
+    numFeatures = Param(Params._dummy(), "numFeatures", "number of features.", 
typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasNumFeatures, self).__init__()
@@ -295,7 +295,7 @@ class HasCheckpointInterval(Params):
     Mixin for param checkpointInterval: set checkpoint interval (>= 1) or 
disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed 
every 10 iterations.
     """
 
-    checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.", int)
+    checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.", 
typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasCheckpointInterval, self).__init__()
@@ -319,7 +319,7 @@ class HasSeed(Params):
     Mixin for param seed: random seed.
     """
 
-    seed = Param(Params._dummy(), "seed", "random seed.", int)
+    seed = Param(Params._dummy(), "seed", "random seed.", 
typeConverter=TypeConverters.toInt)
 
     def __init__(self):
         super(HasSeed, self).__init__()
@@ -344,7 +344,7 @@ class HasTol(Params):
     Mixin for param tol: the convergence tolerance for iterative algorithms.
     """
 
-    tol = Param(Params._dummy(), "tol", "the convergence tolerance for 
iterative algorithms.", float)
+    tol = Param(Params._dummy(), "tol", "the convergence tolerance for 
iterative algorithms.", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasTol, self).__init__()
@@ -368,7 +368,7 @@ class HasStepSize(Params):
     Mixin for param stepSize: Step size to be used for each iteration of 
optimization.
     """
 
-    stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for 
each iteration of optimization.", float)
+    stepSize = Param(Params._dummy(), "stepSize", "Step size to be used for 
each iteration of optimization.", typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasStepSize, self).__init__()
@@ -392,7 +392,7 @@ class HasHandleInvalid(Params):
     Mixin for param handleInvalid: how to handle invalid entries. Options are 
skip (which will filter out rows with bad values), or error (which will throw 
an errror). More options may be added later.
     """
 
-    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle 
invalid entries. Options are skip (which will filter out rows with bad values), 
or error (which will throw an errror). More options may be added later.", str)
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle 
invalid entries. Options are skip (which will filter out rows with bad values), 
or error (which will throw an errror). More options may be added later.", 
typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasHandleInvalid, self).__init__()
@@ -416,7 +416,7 @@ class HasElasticNetParam(Params):
     Mixin for param elasticNetParam: the ElasticNet mixing parameter, in range 
[0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 
penalty.
     """
 
-    elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the 
ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an 
L2 penalty. For alpha = 1, it is an L1 penalty.", float)
+    elasticNetParam = Param(Params._dummy(), "elasticNetParam", "the 
ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an 
L2 penalty. For alpha = 1, it is an L1 penalty.", 
typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(HasElasticNetParam, self).__init__()
@@ -441,7 +441,7 @@ class HasFitIntercept(Params):
     Mixin for param fitIntercept: whether to fit an intercept term.
     """
 
-    fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an 
intercept term.", bool)
+    fitIntercept = Param(Params._dummy(), "fitIntercept", "whether to fit an 
intercept term.", typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasFitIntercept, self).__init__()
@@ -466,7 +466,7 @@ class HasStandardization(Params):
     Mixin for param standardization: whether to standardize the training 
features before fitting the model.
     """
 
-    standardization = Param(Params._dummy(), "standardization", "whether to 
standardize the training features before fitting the model.", bool)
+    standardization = Param(Params._dummy(), "standardization", "whether to 
standardize the training features before fitting the model.", 
typeConverter=TypeConverters.toBoolean)
 
     def __init__(self):
         super(HasStandardization, self).__init__()
@@ -491,7 +491,7 @@ class HasThresholds(Params):
     Mixin for param thresholds: Thresholds in multi-class classification to 
adjust the probability of predicting each class. Array must have length equal 
to the number of classes, with values >= 0. The class with largest value p/t is 
predicted, where p is the original probability of that class and t is the 
class' threshold.
     """
 
-    thresholds = Param(Params._dummy(), "thresholds", "Thresholds in 
multi-class classification to adjust the probability of predicting each class. 
Array must have length equal to the number of classes, with values >= 0. The 
class with largest value p/t is predicted, where p is the original probability 
of that class and t is the class' threshold.", None)
+    thresholds = Param(Params._dummy(), "thresholds", "Thresholds in 
multi-class classification to adjust the probability of predicting each class. 
Array must have length equal to the number of classes, with values >= 0. The 
class with largest value p/t is predicted, where p is the original probability 
of that class and t is the class' threshold.", 
typeConverter=TypeConverters.toListFloat)
 
     def __init__(self):
         super(HasThresholds, self).__init__()
@@ -515,7 +515,7 @@ class HasWeightCol(Params):
     Mixin for param weightCol: weight column name. If this is not set or 
empty, we treat all instance weights as 1.0.
     """
 
-    weightCol = Param(Params._dummy(), "weightCol", "weight column name. If 
this is not set or empty, we treat all instance weights as 1.0.", str)
+    weightCol = Param(Params._dummy(), "weightCol", "weight column name. If 
this is not set or empty, we treat all instance weights as 1.0.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasWeightCol, self).__init__()
@@ -539,7 +539,7 @@ class HasSolver(Params):
     Mixin for param solver: the solver algorithm for optimization. If this is 
not set or empty, default value is 'auto'.
     """
 
-    solver = Param(Params._dummy(), "solver", "the solver algorithm for 
optimization. If this is not set or empty, default value is 'auto'.", str)
+    solver = Param(Params._dummy(), "solver", "the solver algorithm for 
optimization. If this is not set or empty, default value is 'auto'.", 
typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(HasSolver, self).__init__()
@@ -564,12 +564,12 @@ class DecisionTreeParams(Params):
     Mixin for Decision Tree parameters.
     """
 
-    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. 
(>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf 
nodes.")
-    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for 
discretizing continuous features.  Must be >=2 and >= number of categories for 
any categorical feature.")
-    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", 
"Minimum number of instances each child must have after split. If a split 
causes the left or right child to have fewer than minInstancesPerNode, the 
split will be discarded as invalid. Should be >= 1.")
-    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information 
gain for a split to be considered at a tree node.")
-    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in 
MB allocated to histogram aggregation.")
-    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees. Users can set how often should the cache be 
checkpointed or disable it by setting checkpointInterval.")
+    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. 
(>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf 
nodes.", typeConverter=TypeConverters.toInt)
+    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for 
discretizing continuous features.  Must be >=2 and >= number of categories for 
any categorical feature.", typeConverter=TypeConverters.toInt)
+    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", 
"Minimum number of instances each child must have after split. If a split 
causes the left or right child to have fewer than minInstancesPerNode, the 
split will be discarded as invalid. Should be >= 1.", 
typeConverter=TypeConverters.toInt)
+    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information 
gain for a split to be considered at a tree node.", 
typeConverter=TypeConverters.toFloat)
+    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in 
MB allocated to histogram aggregation.", typeConverter=TypeConverters.toInt)
+    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees. Users can set how often should the cache be 
checkpointed or disable it by setting checkpointInterval.", 
typeConverter=TypeConverters.toBoolean)
     
 
     def __init__(self):

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/recommendation.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/recommendation.py 
b/python/pyspark/ml/recommendation.py
index de4c267..7c7a1b6 100644
--- a/python/pyspark/ml/recommendation.py
+++ b/python/pyspark/ml/recommendation.py
@@ -100,16 +100,23 @@ class ALS(JavaEstimator, HasCheckpointInterval, 
HasMaxIter, HasPredictionCol, Ha
     .. versionadded:: 1.4.0
     """
 
-    rank = Param(Params._dummy(), "rank", "rank of the factorization")
-    numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user 
blocks")
-    numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item 
blocks")
-    implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use 
implicit preference")
-    alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference")
-    userCol = Param(Params._dummy(), "userCol", "column name for user ids")
-    itemCol = Param(Params._dummy(), "itemCol", "column name for item ids")
-    ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings")
+    rank = Param(Params._dummy(), "rank", "rank of the factorization",
+                 typeConverter=TypeConverters.toInt)
+    numUserBlocks = Param(Params._dummy(), "numUserBlocks", "number of user 
blocks",
+                          typeConverter=TypeConverters.toInt)
+    numItemBlocks = Param(Params._dummy(), "numItemBlocks", "number of item 
blocks",
+                          typeConverter=TypeConverters.toInt)
+    implicitPrefs = Param(Params._dummy(), "implicitPrefs", "whether to use 
implicit preference",
+                          TypeConverters.toBoolean)
+    alpha = Param(Params._dummy(), "alpha", "alpha for implicit preference",
+                  typeConverter=TypeConverters.toFloat)
+    userCol = Param(Params._dummy(), "userCol", "column name for user ids", 
TypeConverters.toString)
+    itemCol = Param(Params._dummy(), "itemCol", "column name for item ids", 
TypeConverters.toString)
+    ratingCol = Param(Params._dummy(), "ratingCol", "column name for ratings",
+                      TypeConverters.toString)
     nonnegative = Param(Params._dummy(), "nonnegative",
-                        "whether to use nonnegative constraint for least 
squares")
+                        "whether to use nonnegative constraint for least 
squares",
+                        TypeConverters.toBoolean)
 
     @keyword_only
     def __init__(self, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, 
numItemBlocks=10,

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 664a44b..8982608 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -189,10 +189,11 @@ class IsotonicRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
     isotonic = \
         Param(Params._dummy(), "isotonic",
               "whether the output sequence should be isotonic/increasing 
(true) or" +
-              "antitonic/decreasing (false).")
+              "antitonic/decreasing (false).", 
typeConverter=TypeConverters.toBoolean)
     featureIndex = \
         Param(Params._dummy(), "featureIndex",
-              "The index of the feature if featuresCol is a vector column, no 
effect otherwise.")
+              "The index of the feature if featuresCol is a vector column, no 
effect otherwise.",
+              typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
@@ -278,7 +279,8 @@ class TreeEnsembleParams(DecisionTreeParams):
     """
 
     subsamplingRate = Param(Params._dummy(), "subsamplingRate", "Fraction of 
the training data " +
-                            "used for learning each decision tree, in range 
(0, 1].")
+                            "used for learning each decision tree, in range 
(0, 1].",
+                            typeConverter=TypeConverters.toFloat)
 
     def __init__(self):
         super(TreeEnsembleParams, self).__init__()
@@ -335,11 +337,13 @@ class RandomForestParams(TreeEnsembleParams):
     """
 
     supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", 
"log2"]
-    numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train 
(>= 1).")
+    numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train 
(>= 1).",
+                     typeConverter=TypeConverters.toInt)
     featureSubsetStrategy = \
         Param(Params._dummy(), "featureSubsetStrategy",
               "The number of features to consider for splits at each tree 
node. Supported " +
-              "options: " + ", ".join(supportedFeatureSubsetStrategies))
+              "options: " + ", ".join(supportedFeatureSubsetStrategies),
+              typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(RandomForestParams, self).__init__()
@@ -653,7 +657,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol,
 
     lossType = Param(Params._dummy(), "lossType",
                      "Loss function which GBT tries to minimize 
(case-insensitive). " +
-                     "Supported options: " + ", 
".join(GBTParams.supportedLossTypes))
+                     "Supported options: " + ", 
".join(GBTParams.supportedLossTypes),
+                     typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
@@ -767,14 +772,16 @@ class AFTSurvivalRegression(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPredi
     censorCol = Param(Params._dummy(), "censorCol",
                       "censor column name. The value of this column could be 0 
or 1. " +
                       "If the value is 1, it means the event has occurred i.e. 
" +
-                      "uncensored; otherwise censored.")
+                      "uncensored; otherwise censored.", 
typeConverter=TypeConverters.toString)
     quantileProbabilities = \
         Param(Params._dummy(), "quantileProbabilities",
               "quantile probabilities array. Values of the quantile 
probabilities array " +
-              "should be in the range (0, 1) and the array should be 
non-empty.")
+              "should be in the range (0, 1) and the array should be 
non-empty.",
+              typeConverter=TypeConverters.toListFloat)
     quantilesCol = Param(Params._dummy(), "quantilesCol",
                          "quantiles column name. This column will output 
quantiles of " +
-                         "corresponding quantileProbabilities if it is set.")
+                         "corresponding quantileProbabilities if it is set.",
+                         typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 211248e..2fa5da7 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -18,8 +18,11 @@
 """
 Unit tests for Spark ML Python APIs.
 """
-
+import array
 import sys
+if sys.version > '3':
+    xrange = range
+
 try:
     import xmlrunner
 except ImportError:
@@ -36,19 +39,20 @@ else:
 
 from shutil import rmtree
 import tempfile
+import numpy as np
 
 from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.clustering import KMeans
 from pyspark.ml.evaluation import RegressionEvaluator
 from pyspark.ml.feature import *
-from pyspark.ml.param import Param, Params
+from pyspark.ml.param import Param, Params, TypeConverters
 from pyspark.ml.param.shared import HasMaxIter, HasInputCol, HasSeed
 from pyspark.ml.regression import LinearRegression
 from pyspark.ml.tuning import *
 from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaWrapper
-from pyspark.mllib.linalg import DenseVector
+from pyspark.mllib.linalg import DenseVector, SparseVector
 from pyspark.sql import DataFrame, SQLContext, Row
 from pyspark.sql.functions import rand
 from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase
@@ -104,20 +108,65 @@ class ParamTypeConversionTests(PySparkTestCase):
     Test that param type conversion happens.
     """
 
-    def test_int_to_float(self):
-        from pyspark.mllib.linalg import Vectors
-        df = self.sc.parallelize([
-            Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
-        lr = LogisticRegression(elasticNetParam=0)
-        lr.fit(df)
-        lr.setElasticNetParam(0)
-        lr.fit(df)
-
-    def test_invalid_to_float(self):
-        from pyspark.mllib.linalg import Vectors
-        self.assertRaises(Exception, lambda: 
LogisticRegression(elasticNetParam="happy"))
-        lr = LogisticRegression(elasticNetParam=0)
-        self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda"))
+    def test_int(self):
+        lr = LogisticRegression(maxIter=5.0)
+        self.assertEqual(lr.getMaxIter(), 5)
+        self.assertTrue(type(lr.getMaxIter()) == int)
+        self.assertRaises(TypeError, lambda: 
LogisticRegression(maxIter="notAnInt"))
+        self.assertRaises(TypeError, lambda: LogisticRegression(maxIter=5.1))
+
+    def test_float(self):
+        lr = LogisticRegression(tol=1)
+        self.assertEqual(lr.getTol(), 1.0)
+        self.assertTrue(type(lr.getTol()) == float)
+        self.assertRaises(TypeError, lambda: 
LogisticRegression(tol="notAFloat"))
+
+    def test_vector(self):
+        ewp = ElementwiseProduct(scalingVec=[1, 3])
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
+        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
+        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
+        self.assertRaises(TypeError, lambda: 
ElementwiseProduct(scalingVec=["a", "b"]))
+
+    def test_list(self):
+        l = [0, 1]
+        for lst_like in [l, np.array(l), DenseVector(l), SparseVector(len(l), 
range(len(l)), l),
+                         array.array('l', l), xrange(2), tuple(l)]:
+            converted = TypeConverters.toList(lst_like)
+            self.assertEqual(type(converted), list)
+            self.assertListEqual(converted, l)
+
+    def test_list_int(self):
+        for indices in [[1.0, 2.0], np.array([1.0, 2.0]), DenseVector([1.0, 
2.0]),
+                        SparseVector(2, {0: 1.0, 1: 2.0}), xrange(1, 3), (1.0, 
2.0),
+                        array.array('d', [1.0, 2.0])]:
+            vs = VectorSlicer(indices=indices)
+            self.assertListEqual(vs.getIndices(), [1, 2])
+            self.assertTrue(all([type(v) == int for v in vs.getIndices()]))
+        self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
+
+    def test_list_float(self):
+        b = Bucketizer(splits=[1, 4])
+        self.assertEqual(b.getSplits(), [1.0, 4.0])
+        self.assertTrue(all([type(v) == float for v in b.getSplits()]))
+        self.assertRaises(TypeError, lambda: Bucketizer(splits=["a", 1.0]))
+
+    def test_list_string(self):
+        for labels in [np.array(['a', u'b']), ['a', u'b'], np.array(['a', 
'b'])]:
+            idx_to_string = IndexToString(labels=labels)
+            self.assertListEqual(idx_to_string.getLabels(), ['a', 'b'])
+        self.assertRaises(TypeError, lambda: IndexToString(labels=['a', 2]))
+
+    def test_string(self):
+        lr = LogisticRegression()
+        for col in ['features', u'features', np.str_('features')]:
+            lr.setFeaturesCol(col)
+            self.assertEqual(lr.getFeaturesCol(), 'features')
+        self.assertRaises(TypeError, lambda: 
LogisticRegression(featuresCol=2.3))
+
+    def test_bool(self):
+        self.assertRaises(TypeError, lambda: 
LogisticRegression(fitIntercept=1))
+        self.assertRaises(TypeError, lambda: 
LogisticRegression(fitIntercept="false"))
 
 
 class PipelineTests(PySparkTestCase):

http://git-wip-us.apache.org/repos/asf/spark/blob/30bdb5cb/python/pyspark/ml/tuning.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py
index 77af009..a528d22 100644
--- a/python/pyspark/ml/tuning.py
+++ b/python/pyspark/ml/tuning.py
@@ -20,7 +20,7 @@ import numpy as np
 
 from pyspark import since
 from pyspark.ml import Estimator, Model
-from pyspark.ml.param import Params, Param
+from pyspark.ml.param import Params, Param, TypeConverters
 from pyspark.ml.param.shared import HasSeed
 from pyspark.ml.util import keyword_only
 from pyspark.sql.functions import rand
@@ -121,7 +121,8 @@ class CrossValidator(Estimator, HasSeed):
     evaluator = Param(
         Params._dummy(), "evaluator",
         "evaluator used to select hyper-parameters that maximize the 
cross-validated metric")
-    numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross 
validation")
+    numFolds = Param(Params._dummy(), "numFolds", "number of folds for cross 
validation",
+                     typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, estimator=None, estimatorParamMaps=None, 
evaluator=None, numFolds=3,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-13068][PYSPARK][ML] Type conversion for Pyspark params

Reply via email to