Repository: spark Updated Branches: refs/heads/master a59ab594c -> 635ef407e
[SPARK-15211][SQL] Select features column from LibSVMRelation causes failure ## What changes were proposed in this pull request? We need to use `requiredSchema` in `LibSVMRelation` to project only the required columns when loading data from this data source. Otherwise, when users try to select the `features` column, the query fails. ## How was this patch tested? `LibSVMRelationSuite`. Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Closes #12986 from viirya/fix-libsvmrelation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/635ef407 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/635ef407 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/635ef407 Branch: refs/heads/master Commit: 635ef407e11dec41ae9bc428935fb8fdaa482f7e Parents: a59ab59 Author: Liang-Chi Hsieh <sim...@tw.ibm.com> Authored: Mon May 9 15:05:06 2016 +0800 Committer: Cheng Lian <l...@databricks.com> Committed: Mon May 9 15:05:06 2016 +0800 ---------------------------------------------------------------------- .../apache/spark/ml/source/libsvm/LibSVMRelation.scala | 10 +++++++++- .../spark/ml/source/libsvm/LibSVMRelationSuite.scala | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/635ef407/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala index 5f78fab..68a855c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala @@ -203,10 +203,18 @@ class DefaultSource extends FileFormat with DataSourceRegister { } val converter = 
RowEncoder(dataSchema) + val fullOutput = dataSchema.map { f => + AttributeReference(f.name, f.dataType, f.nullable, f.metadata)() + } + val requiredOutput = fullOutput.filter { a => + requiredSchema.fieldNames.contains(a.name) + } + + val requiredColumns = GenerateUnsafeProjection.generate(requiredOutput, fullOutput) points.map { pt => val features = if (sparse) pt.features.toSparse else pt.features.toDense - converter.toRow(Row(pt.label, features)) + requiredColumns(converter.toRow(Row(pt.label, features))) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/635ef407/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala index e52fbd7..1d7144f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala @@ -108,5 +108,6 @@ class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { test("select features from libsvm relation") { val df = sqlContext.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first + df.select("features").collect } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org