Repository: spark Updated Branches: refs/heads/master a1d1529da -> 6bdddb6f6
[SPARK-6361][SQL] support adding a column with metadata in DF This is used by ML pipelines to embed ML attributes in columns created by ML transformers/estimators. marmbrus Author: Xiangrui Meng <[email protected]> Closes #5151 from mengxr/SPARK-6361 and squashes the following commits: bb30de3 [Xiangrui Meng] support adding a column with metadata in DF Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6bdddb6f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6bdddb6f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6bdddb6f Branch: refs/heads/master Commit: 6bdddb6f6ffbd1bee4c45880904e9561b18764a7 Parents: a1d1529 Author: Xiangrui Meng <[email protected]> Authored: Tue Mar 24 12:08:19 2015 -0700 Committer: Michael Armbrust <[email protected]> Committed: Tue Mar 24 12:08:19 2015 -0700 ---------------------------------------------------------------------- .../catalyst/expressions/namedExpressions.scala | 20 +++++++++++++------- .../scala/org/apache/spark/sql/Column.scala | 13 +++++++++++++ .../spark/sql/ColumnExpressionSuite.scala | 15 ++++++++++++--- 3 files changed, 38 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/6bdddb6f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 17f7f9f..3dd7d38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -95,9 +95,12 @@ abstract class Attribute extends NamedExpression { * @param name the name to be associated with the result of computing [[child]]. * @param exprId A globally unique id used to check if an [[AttributeReference]] refers to this * alias. Auto-assigned if left blank. + * @param explicitMetadata Explicit metadata associated with this alias that overwrites child's. */ -case class Alias(child: Expression, name: String) - (val exprId: ExprId = NamedExpression.newExprId, val qualifiers: Seq[String] = Nil) +case class Alias(child: Expression, name: String)( + val exprId: ExprId = NamedExpression.newExprId, + val qualifiers: Seq[String] = Nil, + val explicitMetadata: Option[Metadata] = None) extends NamedExpression with trees.UnaryNode[Expression] { override type EvaluatedType = Any @@ -107,9 +110,11 @@ case class Alias(child: Expression, name: String) override def dataType = child.dataType override def nullable = child.nullable override def metadata: Metadata = { - child match { - case named: NamedExpression => named.metadata - case _ => Metadata.empty + explicitMetadata.getOrElse { + child match { + case named: NamedExpression => named.metadata + case _ => Metadata.empty + } } } @@ -123,11 +128,12 @@ case class Alias(child: Expression, name: String) override def toString: String = s"$child AS $name#${exprId.id}$typeSuffix" - override protected final def otherCopyArgs = exprId :: qualifiers :: Nil + override protected final def otherCopyArgs = exprId :: qualifiers :: explicitMetadata :: Nil override def equals(other: Any): Boolean = other match { case a: Alias => - name == a.name && exprId == a.exprId && child == a.child && qualifiers == a.qualifiers + name == a.name && exprId == a.exprId && child == a.child && qualifiers == a.qualifiers && + explicitMetadata == a.explicitMetadata case _ => false } } http://git-wip-us.apache.org/repos/asf/spark/blob/6bdddb6f/sql/core/src/main/scala/org/apache/spark/sql/Column.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index ec7d15f..2ae47e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -595,6 +595,19 @@ class Column(protected[sql] val expr: Expression) { def as(alias: Symbol): Column = Alias(expr, alias.name)() /** + * Gives the column an alias with metadata. + * {{{ + * val metadata: Metadata = ... + * df.select($"colA".as("colB", metadata)) + * }}} + * + * @group expr_ops + */ + def as(alias: String, metadata: Metadata): Column = { + Alias(expr, alias)(explicitMetadata = Some(metadata)) + } + + /** * Casts the column to a different data type. * {{{ * // Casts colA to IntegerType. http://git-wip-us.apache.org/repos/asf/spark/blob/6bdddb6f/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index a53ae97..bc8fae1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -17,12 +17,10 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.expressions.NamedExpression -import org.apache.spark.sql.catalyst.plans.logical.{Project, NoRelation} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.TestSQLContext import org.apache.spark.sql.test.TestSQLContext.implicits._ -import org.apache.spark.sql.types.{BooleanType, IntegerType, StructField, StructType} +import org.apache.spark.sql.types._ class ColumnExpressionSuite extends QueryTest { @@ -322,4 +320,15 @@ class ColumnExpressionSuite extends QueryTest { assert('key.desc == 'key.desc) assert('key.desc != 'key.asc) } + + test("alias with metadata") { + val metadata = new MetadataBuilder() + .putString("originName", "value") + .build() + val schema = testData + .select($"*", col("value").as("abc", metadata)) + .schema + assert(schema("value").metadata === Metadata.empty) + assert(schema("abc").metadata === metadata) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
