surya pratap created SPARK-20445:
------------------------------------
Summary: pyspark.sql.utils.IllegalArgumentException:
u'DecisionTreeClassifier was given input with invalid label column label,
without the number of classes specified. See StringIndexer
Key: SPARK-20445
URL: https://issues.apache.org/jira/browse/SPARK-20445
Project: Spark
Issue Type: Bug
Components: MLlib
Affects Versions: 1.6.1
Reporter: surya pratap
#Load the CSV file into a RDD
irisData = sc.textFile("/home/infademo/surya/iris.csv")
irisData.cache()
irisData.count()
#Remove the first line (contains headers)
dataLines = irisData.filter(lambda x: "Sepal" not in x)
dataLines.count()
from pyspark.sql import Row
#Create a Data Frame from the data
parts = dataLines.map(lambda l: l.split(","))
irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
SEPAL_WIDTH=float(p[1]), \
PETAL_LENGTH=float(p[2]), \
PETAL_WIDTH=float(p[3]), \
SPECIES=p[4] ))
# Infer the schema, and register the DataFrame as a table.
irisDf = sqlContext.createDataFrame(irisMap)
irisDf.cache()
#Add a numeric indexer for the label/target column
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
si_model = stringIndexer.fit(irisDf)
irisNormDf = si_model.transform(irisDf)
irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
irisNormDf.cache()
"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""
#See standard parameters
irisNormDf.describe().show()
#Find correlation between predictors and target
for i in irisNormDf.columns:
if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
print( "Correlation to Species for ", i, \
irisNormDf.stat.corr('IND_SPECIES',i))
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
import org.apache.spark.mllib.linalg.{Matrix, Matrices}
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.linalg import Vectors
pyspark.mllib.linalg.Vector
def transformToLabeledPoint(row) :
lp = ( row["SPECIES"], row["IND_SPECIES"], \
Vectors.dense([row["SEPAL_LENGTH"],\
row["SEPAL_WIDTH"], \
row["PETAL_LENGTH"], \
row["PETAL_WIDTH"]]))
return lp
irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = sqlContext.createDataFrame(irisLp,["species","label",
"features"])
irisLpDf.select("species","label","features").show(10)
irisLpDf.cache()
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""
#Split into training and testing data
(trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#Create the model
dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",\
featuresCol="features")
dtModel = dtClassifer.fit(trainingData)
issue part:-
dtModel = dtClassifer.fit(trainingData) Traceback (most recent call last):
File "", line 1, in File
"/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", line
69, in fit return self._fit(dataset) File
"/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line
133, in _fit java_model = self._fit_java(dataset) File
"/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line
130, in _fit_java return self._java_obj.fit(dataset._jdf) File
"/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py",
line 813, in call File
"/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line
53, in deco raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was given
input with invalid label column label, without the number of classes specified.
See StringIndexer.'
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]