spark git commit: [SPARK-15211][SQL] Select features column from LibSVMRelation causes failure

lian Mon, 09 May 2016 00:05:29 -0700

Repository: spark
Updated Branches:
  refs/heads/master a59ab594c -> 635ef407e



[SPARK-15211][SQL] Select features column from LibSVMRelation causes failure

## What changes were proposed in this pull request?

We need to use `requiredSchema` in `LibSVMRelation` to project only the 
required columns when loading data from this data source. Otherwise, when users 
try to select only the `features` column, the query fails.

## How was this patch tested?
`LibSVMRelationSuite`.

Author: Liang-Chi Hsieh <[email protected]>

Closes #12986 from viirya/fix-libsvmrelation.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/635ef407
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/635ef407
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/635ef407

Branch: refs/heads/master
Commit: 635ef407e11dec41ae9bc428935fb8fdaa482f7e
Parents: a59ab59
Author: Liang-Chi Hsieh <[email protected]>
Authored: Mon May 9 15:05:06 2016 +0800
Committer: Cheng Lian <[email protected]>
Committed: Mon May 9 15:05:06 2016 +0800

----------------------------------------------------------------------
 .../apache/spark/ml/source/libsvm/LibSVMRelation.scala    | 10 +++++++++-
 .../spark/ml/source/libsvm/LibSVMRelationSuite.scala      |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/635ef407/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 5f78fab..68a855c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -203,10 +203,18 @@ class DefaultSource extends FileFormat with 
DataSourceRegister {
           }
 
       val converter = RowEncoder(dataSchema)
+      val fullOutput = dataSchema.map { f =>
+        AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()
+      }
+      val requiredOutput = fullOutput.filter { a =>
+        requiredSchema.fieldNames.contains(a.name)
+      }
+
+      val requiredColumns = GenerateUnsafeProjection.generate(requiredOutput, 
fullOutput)
 
       points.map { pt =>
         val features = if (sparse) pt.features.toSparse else 
pt.features.toDense
-        converter.toRow(Row(pt.label, features))
+        requiredColumns(converter.toRow(Row(pt.label, features)))
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/635ef407/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
index e52fbd7..1d7144f 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/source/libsvm/LibSVMRelationSuite.scala
@@ -108,5 +108,6 @@ class LibSVMRelationSuite extends SparkFunSuite with 
MLlibTestSparkContext {
   test("select features from libsvm relation") {
     val df = sqlContext.read.format("libsvm").load(path)
     df.select("features").rdd.map { case Row(d: Vector) => d }.first
+    df.select("features").collect
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-15211][SQL] Select features column from LibSVMRelation causes failure

Reply via email to