[
https://issues.apache.org/jira/browse/SPARK-20445?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15980950#comment-15980950
]
Hyukjin Kwon commented on SPARK-20445:
--------------------------------------
[~surya78] are you able to test this against a higher version or the current master?
> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was
> given input with invalid label column label, without the number of classes
> specified. See StringIndexer
> --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
> Key: SPARK-20445
> URL: https://issues.apache.org/jira/browse/SPARK-20445
> Project: Spark
> Issue Type: Bug
> Components: MLlib
> Affects Versions: 1.6.1
> Reporter: surya pratap
>
> #Load the CSV file into a RDD
> irisData = sc.textFile("/home/infademo/surya/iris.csv")
> irisData.cache()
> irisData.count()
> #Remove the first line (contains headers)
> dataLines = irisData.filter(lambda x: "Sepal" not in x)
> dataLines.count()
> from pyspark.sql import Row
> #Create a Data Frame from the data
> parts = dataLines.map(lambda l: l.split(","))
> irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
> SEPAL_WIDTH=float(p[1]), \
> PETAL_LENGTH=float(p[2]), \
> PETAL_WIDTH=float(p[3]), \
> SPECIES=p[4] ))
> # Infer the schema, and register the DataFrame as a table.
> irisDf = sqlContext.createDataFrame(irisMap)
> irisDf.cache()
> #Add a numeric indexer for the label/target column
> from pyspark.ml.feature import StringIndexer
> stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
> si_model = stringIndexer.fit(irisDf)
> irisNormDf = si_model.transform(irisDf)
> irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
> irisNormDf.cache()
>
> """--------------------------------------------------------------------------
> Perform Data Analytics
>
> -------------------------------------------------------------------------"""
> #See standard parameters
> irisNormDf.describe().show()
> #Find correlation between predictors and target
> for i in irisNormDf.columns:
> if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
> print( "Correlation to Species for ", i, \
> irisNormDf.stat.corr('IND_SPECIES',i))
> #Transform to a Data Frame for input to Machine Learning
> #Drop columns that are not required (low correlation)
> from pyspark.mllib.linalg import Vectors
> from pyspark.mllib.linalg import SparseVector
> from pyspark.mllib.regression import LabeledPoint
> from pyspark.mllib.util import MLUtils
> import org.apache.spark.mllib.linalg.{Matrix, Matrices}
> from pyspark.mllib.linalg.distributed import RowMatrix
> from pyspark.ml.linalg import Vectors
> pyspark.mllib.linalg.Vector
> def transformToLabeledPoint(row) :
> lp = ( row["SPECIES"], row["IND_SPECIES"], \
> Vectors.dense([row["SEPAL_LENGTH"],\
> row["SEPAL_WIDTH"], \
> row["PETAL_LENGTH"], \
> row["PETAL_WIDTH"]]))
> return lp
> irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
> irisLpDf = sqlContext.createDataFrame(irisLp,["species","label",
> "features"])
> irisLpDf.select("species","label","features").show(10)
> irisLpDf.cache()
>
> """--------------------------------------------------------------------------
> Perform Machine Learning
>
> -------------------------------------------------------------------------"""
> #Split into training and testing data
> (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
> trainingData.count()
> testData.count()
> testData.collect()
> from pyspark.ml.classification import DecisionTreeClassifier
> from pyspark.ml.evaluation import MulticlassClassificationEvaluator
> #Create the model
> dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
> featuresCol="features")
> dtModel = dtClassifer.fit(trainingData)
>
> Issue part:
>
> dtModel = dtClassifer.fit(trainingData) Traceback (most recent call last):
> File "", line 1, in File
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py",
> line 69, in fit return self._fit(dataset) File
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py",
> line 133, in _fit java_model = self._fit_java(dataset) File
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py",
> line 130, in _fit_java return self._java_obj.fit(dataset._jdf) File
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
> line 813, in call File
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line
> 53, in deco raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was
> given input with invalid label column label, without the number of classes
> specified. See StringIndexer.'
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]