[ https://issues.apache.org/jira/browse/SPARK-13913?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15677144#comment-15677144 ]
Barry Becker edited comment on SPARK-13913 at 11/18/16 5:02 PM: ---------------------------------------------------------------- I can still reproduce this using spark 1.6.3. My dataframe has columns that include a column named "unique. string" I get an exception when I attempt this snippet: val c = col('`unique. string`') datasetDf = datasetDf .withColumn("unique. string" + "\_CLEANED\_\_", when(c.isNull, lit(Double.NaN)).otherwise(c)) The error I get is: org.apache.spark.sql.AnalysisException: cannot resolve 'unique. string' given input columns: [string-normal_CLEANED__, verySmallRange, dateColumn_CLEANED__, unique. string_CLEANED__, posNoZero_CLEANED__, normal$with$Null, allOneValue, unique. string, string/odd, posNoZero, all.Small, string-normal, smallRange, all*Null, veryLargeRange, mostly.Zero, largeRange, all#Zero, allNegative, dateColumn, normal123, string/odd_CLEANED__]; Note that the column "unique. string" is in the list of columns, but its not getting found. The stack trace is {code} at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:60) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:57) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69) at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334) at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionUp$1(QueryPlan.scala:108) at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:118) at 
org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2$1.apply(QueryPlan.scala:122) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.immutable.List.foreach(List.scala:381) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.immutable.List.map(List.scala:285) at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:122) at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:127) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$class.foreach(Iterator.scala:750) at scala.collection.AbstractIterator.foreach(Iterator.scala:1202) at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59) at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104) at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48) at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:295) at scala.collection.AbstractIterator.to(Iterator.scala:1202) at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:287) at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1202) at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:274) at scala.collection.AbstractIterator.toArray(Iterator.scala:1202) at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:127) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:57) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:50) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:121) at 
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:50) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:44) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34) at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133) at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$withPlan(DataFrame.scala:2127) at org.apache.spark.sql.DataFrame.select(DataFrame.scala:707) at org.apache.spark.sql.DataFrame.withColumn(DataFrame.scala:1186) at com.mineset.spark.ml.MLDatasetCleaner$$anonfun$cleanDataset$4.apply(MLDatasetCleaner.scala:110) at com.mineset.spark.ml.MLDatasetCleaner$$anonfun$cleanDataset$4.apply(MLDatasetCleaner.scala:99) at scala.collection.immutable.List.foreach(List.scala:381) at com.mxxx.spark.ml.MLDatasetCleaner.cleanDataset(MLDatasetCleaner.scala:99) {code} was (Author: barrybecker4): I can still reproduce this using spark 1.6.3. My dataframe has columns that include a column named "unique. string" I get an exception when I attempt this snippet: val c = col('`unique. string`') datasetDf = datasetDf .withColumn("unique. string" + "_CLEANED__", when(c.isNull, lit(Double.NaN)).otherwise(c)) The error I get is: org.apache.spark.sql.AnalysisException: cannot resolve 'unique. string' given input columns: [string-normal_CLEANED__, verySmallRange, dateColumn_CLEANED__, unique. string_CLEANED__, posNoZero_CLEANED__, normal$with$Null, allOneValue, unique. string, string/odd, posNoZero, all.Small, string-normal, smallRange, all*Null, veryLargeRange, mostly.Zero, largeRange, all#Zero, allNegative, dateColumn, normal123, string/odd_CLEANED__]; Note that the column "unique. string" is in the list of columns, but its not getting found. 
The stack trace is {code} at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:60) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:57) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:335) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:69) at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:334) at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionUp$1(QueryPlan.scala:108) at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:118) at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2$1.apply(QueryPlan.scala:122) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:245) at scala.collection.immutable.List.foreach(List.scala:381) at scala.collection.TraversableLike$class.map(TraversableLike.scala:245) at scala.collection.immutable.List.map(List.scala:285) at org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:122) at org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:127) at scala.collection.Iterator$$anon$11.next(Iterator.scala:370) at scala.collection.Iterator$class.foreach(Iterator.scala:750) at scala.collection.AbstractIterator.foreach(Iterator.scala:1202) at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59) at 
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104) at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48) at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:295) at scala.collection.AbstractIterator.to(Iterator.scala:1202) at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:287) at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1202) at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:274) at scala.collection.AbstractIterator.toArray(Iterator.scala:1202) at org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:127) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:57) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:50) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:121) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:50) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:44) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:34) at org.apache.spark.sql.DataFrame.<init>(DataFrame.scala:133) at org.apache.spark.sql.DataFrame.org$apache$spark$sql$DataFrame$$withPlan(DataFrame.scala:2127) at org.apache.spark.sql.DataFrame.select(DataFrame.scala:707) at org.apache.spark.sql.DataFrame.withColumn(DataFrame.scala:1186) at com.mineset.spark.ml.MLDatasetCleaner$$anonfun$cleanDataset$4.apply(MLDatasetCleaner.scala:110) at com.mineset.spark.ml.MLDatasetCleaner$$anonfun$cleanDataset$4.apply(MLDatasetCleaner.scala:99) at scala.collection.immutable.List.foreach(List.scala:381) at com.mxxx.spark.ml.MLDatasetCleaner.cleanDataset(MLDatasetCleaner.scala:99) {code} > DataFrame.withColumn fails when trying to replace existing column with dot in > name > 
---------------------------------------------------------------------------------- > > Key: SPARK-13913 > URL: https://issues.apache.org/jira/browse/SPARK-13913 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 1.6.0 > Reporter: Emmanuel Leroy > > http://stackoverflow.com/questions/36000147/spark-1-6-apply-function-to-column-with-dot-in-name-how-to-properly-escape-coln/36005334#36005334 > if I do (column name exists already and has dot in it, but is not a nested > column): > scala> df = df.withColumn("raw.hourOfDay", df.col("`raw.hourOfDay`")) > scala> df = df.withColumn("raw.hourOfDay", df.col("`raw.hourOfDay`")) > org.apache.spark.sql.AnalysisException: cannot resolve 'raw.minOfDay' given > input columns raw.hourOfDay_2, raw.dayOfWeek, raw.sensor2, raw.hourOfDay, > raw.minOfDay; > at > org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:60) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:57) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:319) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:319) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:53) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:318) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionUp$1(QueryPlan.scala:107) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:117) > at > org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2$1.apply(QueryPlan.scala:121) > at > 
scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at > scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244) > at scala.collection.immutable.List.foreach(List.scala:318) > at > scala.collection.TraversableLike$class.map(TraversableLike.scala:244) > at scala.collection.AbstractTraversable.map(Traversable.scala:105) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$2(QueryPlan.scala:121) > at > org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:125) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:328) > at scala.collection.Iterator$class.foreach(Iterator.scala:727) > at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) > at > scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48) > at > scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103) > but if I do: > scala> df = df.withColumn("raw.hourOfDay_2", df.col("`raw.hourOfDay`")) > scala> df.printSchema > root > |-- raw.hourOfDay: long (nullable = true) > |-- raw.minOfDay: long (nullable = true) > |-- raw.dayOfWeek: long (nullable = true) > |-- raw.sensor2: long (nullable = true) > |-- raw.hourOfDay_2: long (nullable = true) > it works fine (i.e. new column is created with dot in ColName). > The only difference is that the name "raw.hourOfDay_2" does not exist yet, > and is properly created as a colName with dot, not as a nested column. > The documentation however says that if the column exists it will replace it, > but it seems there is a misinterpretation of the column name as a nested > column > def withColumn(colName: String, col: Column): DataFrame > Returns a new DataFrame by adding a column or replacing the existing column > that has the same name. > Replacing a column without a dot in it works fine. 
-- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org