Github user yogeshg commented on a diff in the pull request:
https://github.com/apache/spark/pull/6452#discussion_r175548228
--- Diff:
mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala ---
@@ -46,19 +47,59 @@ class VectorAssembler(override val uid: String)
def setOutputCol(value: String): this.type = set(outputCol, value)
override def transform(dataset: DataFrame): DataFrame = {
+ // Schema transformation.
+ val schema = dataset.schema
+ lazy val first = dataset.first()
+ val attrs = $(inputCols).flatMap { c =>
+ val field = schema(c)
+ val index = schema.fieldIndex(c)
+ field.dataType match {
+ case DoubleType =>
+ val attr = Attribute.fromStructField(field)
+ // If the input column doesn't have ML attribute, assume numeric.
+ if (attr == UnresolvedAttribute) {
+ Some(NumericAttribute.defaultAttr.withName(c))
+ } else {
+ Some(attr.withName(c))
+ }
+ case _: NumericType | BooleanType =>
--- End diff --
I had a quick question regarding this: why do we not consider a 3.2 case
where we have a scalar type with ML attributes? Is it because there's no such
thing?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]