zhengruifeng created SPARK-17797:
------------------------------------
Summary: LabelCol support non-double datatypes
Key: SPARK-17797
URL: https://issues.apache.org/jira/browse/SPARK-17797
Project: Spark
Issue Type: Bug
Components: ML
Reporter: zhengruifeng
Without the precomputed metadata {{numValues}}, the method {{Classifier.getNumClasses()}}
does not support numeric types other than Double.
{code}
scala> val path =
"/Users/zrf/.dev/spark-2.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
path: String =
/Users/zrf/.dev/spark-2.0.1-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt
scala> val data = spark.read.format("libsvm").load(path).persist()
data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double,
features: vector]
scala>
scala> val data2 = data.select(col("label").cast(LongType), col("features"))
data2: org.apache.spark.sql.DataFrame = [label: bigint, features: vector]
scala> val model = new NaiveBayes().fit(data)
model: org.apache.spark.ml.classification.NaiveBayesModel = NaiveBayesModel
(uid=nb_1e27d7acf0b3) with 2 classes
scala> val model = new NaiveBayes().fit(data2)
java.lang.ClassCastException: java.lang.Long cannot be cast to java.lang.Double
at scala.runtime.BoxesRunTime.unboxToDouble(BoxesRunTime.java:114)
at org.apache.spark.sql.Row$class.getDouble(Row.scala:242)
at
org.apache.spark.sql.catalyst.expressions.GenericRow.getDouble(rows.scala:192)
at
org.apache.spark.ml.classification.Classifier.getNumClasses(Classifier.scala:115)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:104)
at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:76)
at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
... 54 elided
{code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]