[ 
https://issues.apache.org/jira/browse/SPARK-20445?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15980949#comment-15980949
 ] 

Hyukjin Kwon commented on SPARK-20445:
--------------------------------------

I can't reproduce this with the code below against the current master:

{code}
irisData = sc.textFile("./iris.csv")
irisData.cache()
irisData.count()

dataLines = irisData.filter(lambda x: "Sepal" not in x)
dataLines.count()

from pyspark.sql import Row
#Create a Data Frame from the data
parts = dataLines.map(lambda l: l.split(","))
irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
                                SEPAL_WIDTH=float(p[1]), \
                                PETAL_LENGTH=float(p[2]), \
                                PETAL_WIDTH=float(p[3]), \
                                SPECIES=p[4]))

# Infer the schema, and register the DataFrame as a table.
irisDf = sqlContext.createDataFrame(irisMap)
irisDf.cache()

#Add a numeric indexer for the label/target column
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
si_model = stringIndexer.fit(irisDf)
irisNormDf = si_model.transform(irisDf)

irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
irisNormDf.cache()

"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""

#See standard parameters
irisNormDf.describe().show()

#Find correlation between predictors and target
for i in irisNormDf.columns:
    if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
        print( "Correlation to Species for ", i, \
                    irisNormDf.stat.corr('IND_SPECIES',i))




#Transform to a Data Frame for input to Machine Learning
#Drop columns that are not required (low correlation)

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg.distributed import RowMatrix

from pyspark.ml.linalg import Vectors
pyspark.mllib.linalg.Vector
def transformToLabeledPoint(row) :
    lp = ( row["SPECIES"], row["IND_SPECIES"], \
                Vectors.dense([row["SEPAL_LENGTH"],\
                        row["SEPAL_WIDTH"], \
                        row["PETAL_LENGTH"], \
                        row["PETAL_WIDTH"]]))
    return lp




irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", "features"])
irisLpDf.select("species","label","features").show(10)
irisLpDf.cache()

"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""
#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
                featuresCol="features")

dtModel = dtClassifer.fit(trainingData)
{code}

with the data 
https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv

I manually removed the header. It looks like this was fixed somewhere between 1.6.1 and the current master.


> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was 
> given input with invalid label column label, without the number of classes 
> specified. See StringIndexer
> --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: SPARK-20445
>                 URL: https://issues.apache.org/jira/browse/SPARK-20445
>             Project: Spark
>          Issue Type: Bug
>          Components: MLlib
>    Affects Versions: 1.6.1
>            Reporter: surya pratap
>
>  #Load the CSV file into a RDD
>     irisData = sc.textFile("/home/infademo/surya/iris.csv")
>     irisData.cache()
>     irisData.count()
>     #Remove the first line (contains headers)
>     dataLines = irisData.filter(lambda x: "Sepal" not in x)
>     dataLines.count()
>     from pyspark.sql import Row
>     #Create a Data Frame from the data
>     parts = dataLines.map(lambda l: l.split(","))
>     irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
>                                     SEPAL_WIDTH=float(p[1]), \
>                                     PETAL_LENGTH=float(p[2]), \
>                                     PETAL_WIDTH=float(p[3]), \
>                                     SPECIES=p[4] ))
>     # Infer the schema, and register the DataFrame as a table.
>     irisDf = sqlContext.createDataFrame(irisMap)
>     irisDf.cache()
>     #Add a numeric indexer for the label/target column
>     from pyspark.ml.feature import StringIndexer
>     stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
>     si_model = stringIndexer.fit(irisDf)
>     irisNormDf = si_model.transform(irisDf)
>     irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
>     irisNormDf.cache()
>     
> """--------------------------------------------------------------------------
>     Perform Data Analytics
>     
> -------------------------------------------------------------------------"""
>     #See standard parameters
>     irisNormDf.describe().show()
>     #Find correlation between predictors and target
>     for i in irisNormDf.columns:
>         if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
>             print( "Correlation to Species for ", i, \
>                         irisNormDf.stat.corr('IND_SPECIES',i))
>     #Transform to a Data Frame for input to Machine Learning
>     #Drop columns that are not required (low correlation)
>     from pyspark.mllib.linalg import Vectors
>     from pyspark.mllib.linalg import SparseVector
>     from pyspark.mllib.regression import LabeledPoint
>     from pyspark.mllib.util import MLUtils
>     import org.apache.spark.mllib.linalg.{Matrix, Matrices}
>     from pyspark.mllib.linalg.distributed import RowMatrix
>     from pyspark.ml.linalg import Vectors
>     pyspark.mllib.linalg.Vector
>     def transformToLabeledPoint(row) :
>         lp = ( row["SPECIES"], row["IND_SPECIES"], \
>                     Vectors.dense([row["SEPAL_LENGTH"],\
>                             row["SEPAL_WIDTH"], \
>                             row["PETAL_LENGTH"], \
>                             row["PETAL_WIDTH"]]))
>         return lp
>     irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
>     irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", 
> "features"])
>     irisLpDf.select("species","label","features").show(10)
>     irisLpDf.cache()
>     
> """--------------------------------------------------------------------------
>     Perform Machine Learning
>     
> -------------------------------------------------------------------------"""
>     #Split into training and testing data
>     (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
>     trainingData.count()
>     testData.count()
>     testData.collect()
>     from pyspark.ml.classification import DecisionTreeClassifier
>     from pyspark.ml.evaluation import MulticlassClassificationEvaluator
>     #Create the model
>     dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
>                     featuresCol="features")
>    dtModel = dtClassifer.fit(trainingData)
>    
>    issue part:-
>    
>    dtModel = dtClassifer.fit(trainingData) Traceback (most recent call last): 
> File "", line 1, in File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", 
> line 69, in fit return self._fit(dataset) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", 
> line 133, in _fit java_model = self._fit_java(dataset) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", 
> line 130, in _fit_java return self._java_obj.fit(dataset._jdf) File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
>  line 813, in call File 
> "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line 
> 53, in deco raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace) 
> pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was 
> given input with invalid label column label, without the number of classes 
> specified. See StringIndexer.'



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to