[jira] [Created] (SPARK-20445) pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer

surya pratap (JIRA) Mon, 24 Apr 2017 02:39:26 -0700

surya pratap created SPARK-20445:
------------------------------------

             Summary: pyspark.sql.utils.IllegalArgumentException: 
u'DecisionTreeClassifier was given input with invalid label column label, 
without the number of classes specified. See StringIndexer
                 Key: SPARK-20445
                 URL: https://issues.apache.org/jira/browse/SPARK-20445
             Project: Spark
          Issue Type: Bug
          Components: MLlib
    Affects Versions: 1.6.1
            Reporter: surya pratap



 #Load the CSV file into an RDD
    # NOTE(review): `sc` is not defined in this snippet; presumably it is the
    # SparkContext supplied by the pyspark shell - confirm.
    irisData = sc.textFile("/home/infademo/surya/iris.csv")
    irisData.cache()
    irisData.count()

    #Remove the header line: this drops every line containing the substring "Sepal"
    dataLines = irisData.filter(lambda x: "Sepal" not in x)
    dataLines.count()

    from pyspark.sql import Row
    #Create a Data Frame from the data
    # Each line is split on commas; the first four fields are parsed as floats
    # and the fifth field is kept as the raw species string.
    parts = dataLines.map(lambda l: l.split(","))
    irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
                                    SEPAL_WIDTH=float(p[1]), \
                                    PETAL_LENGTH=float(p[2]), \
                                    PETAL_WIDTH=float(p[3]), \
                                    SPECIES=p[4] ))

    # Infer the schema from the Row objects and build the DataFrame
    # (no table is registered anywhere in this snippet).
    # NOTE(review): `sqlContext` is also defined outside this snippet.
    irisDf = sqlContext.createDataFrame(irisMap)
    irisDf.cache()

    #Add a numeric indexer for the label/target column:
    # SPECIES is the input column and IND_SPECIES is the indexed output column.
    from pyspark.ml.feature import StringIndexer
    stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
    si_model = stringIndexer.fit(irisDf)
    irisNormDf = si_model.transform(irisDf)

    # Collect the distinct (SPECIES, IND_SPECIES) pairs to inspect the mapping;
    # the result is not assigned to anything.
    irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
    irisNormDf.cache()

    
"""--------------------------------------------------------------------------
    Perform Data Analytics
    -------------------------------------------------------------------------"""

    #See standard parameters (the output of describe() is printed with show())
    irisNormDf.describe().show()

    #Find correlation between predictors and target
    # For each column, the value in the first row is inspected: columns whose
    # value is a string (basestring, a Python 2 builtin) are skipped, and for
    # every remaining column - IND_SPECIES itself included - the correlation
    # with IND_SPECIES is printed.
    for i in irisNormDf.columns:
        if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
            print( "Correlation to Species for ", i, \
                        irisNormDf.stat.corr('IND_SPECIES',i))



    #Transform to a Data Frame for input to Machine Learning
    #Drop columns that are not required (low correlation)

    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils
    import org.apache.spark.mllib.linalg.{Matrix, Matrices}
    from pyspark.mllib.linalg.distributed import RowMatrix

    from pyspark.ml.linalg import Vectors
    pyspark.mllib.linalg.Vector
    def transformToLabeledPoint(row):
        """Turn one iris row into a (species, label index, features) tuple.

        ``row`` is indexed by column name. The returned tuple holds the
        SPECIES value, the IND_SPECIES value, and the result of
        ``Vectors.dense`` applied to the four measurement columns in the
        order sepal length, sepal width, petal length, petal width.
        """
        measurement_columns = ("SEPAL_LENGTH", "SEPAL_WIDTH",
                               "PETAL_LENGTH", "PETAL_WIDTH")
        species = row["SPECIES"]
        label_index = row["IND_SPECIES"]
        features = Vectors.dense([row[name] for name in measurement_columns])
        return (species, label_index, features)




    # Map every row of irisNormDf through transformToLabeledPoint and build a
    # new DataFrame with the columns "species", "label" and "features".
    # NOTE(review): "label" is rebuilt here from plain tuple values, so it
    # presumably no longer carries whatever column metadata StringIndexer
    # attached to IND_SPECIES - possibly related to the "without the number of
    # classes specified" error quoted in this report; confirm.
    irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
    irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", 
"features"])
    # Print the first 10 rows for a visual check.
    irisLpDf.select("species","label","features").show(10)
    irisLpDf.cache()

    
"""--------------------------------------------------------------------------
    Perform Machine Learning
    -------------------------------------------------------------------------"""
    #Split into training and testing data
    # randomSplit is called with weights [0.9, 0.1] and no seed argument.
    (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
    trainingData.count()
    testData.count()
    testData.collect()

    # MulticlassClassificationEvaluator is imported but not used in the
    # visible part of this script.
    from pyspark.ml.classification import DecisionTreeClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    #Create the model
    # The classifier is configured to read the "label" and "features" columns.
    dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
                    featuresCol="features")

   # This fit() call is the statement shown failing in the traceback quoted in this report.
   dtModel = dtClassifer.fit(trainingData)
   
   issue part:-
   
   dtModel = dtClassifer.fit(trainingData)
   Traceback (most recent call last):
     File "", line 1, in
     File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", line 69, in fit
       return self._fit(dataset)
     File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line 133, in _fit
       java_model = self._fit_java(dataset)
     File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line 130, in _fit_java
       return self._java_obj.fit(dataset._jdf)
     File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in call
     File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line 53, in deco
       raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
   pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer.'



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[jira] [Created] (SPARK-20445) pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer

Reply via email to