The snippet at the end worked for me. We run Spark 1.3.x, so DataFrame.drop is not available to us.
As pointed out by Yana, DataFrame operations typically return a new DataFrame, so use as such:

    import com.foo.sparkstuff.DataFrameOps._
    ...
    val df = ...
    val prunedDf = df.dropColumns("one_col", "other_col")

    package com.foo.sparkstuff

    import org.apache.spark.sql.{Column, DataFrame}
    import scala.language.implicitConversions

    class PimpedDataFrame(frame: DataFrame) {
      /**
       * Drop named columns from dataframe. Replace with DataFrame.drop when upgrading to Spark 1.4.0.
       */
      def dropColumns(toDrop: String*): DataFrame = {
        val invalid = toDrop filterNot (frame.columns.contains(_))
        if (invalid.nonEmpty) {
          throw new IllegalArgumentException("Columns not found: " + invalid.mkString(","))
        }
        val newColumns = frame.columns filter { c => !toDrop.contains(c) } map { new Column(_) }
        frame.select(newColumns: _*)
      }
    }

    object DataFrameOps {
      implicit def pimpDataFrame(df: DataFrame): PimpedDataFrame = new PimpedDataFrame(df)
    }

On Thu, Jul 16, 2015 at 4:57 PM, <saif.a.ell...@wellsfargo.com> wrote: > Hi, > > In a hundred columns dataframe, I wish to either select all of them except > or drop the ones I don't want.
> > I am failing in doing such simple task, tried two ways > > val clean_cols = df.columns.filterNot(col_name => > col_name.startWith(“STATE_”).mkString(“, “) > df.select(clean_cols) > > But this throws exception: > org.apache.spark.sql.AnalysisException: cannot resolve 'asd_dt, > industry_area,...’ > at > org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:63) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:52) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:286) > at > org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:286) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:51) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:285) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$transformExpressionUp$1(QueryPlan.scala:108) > at > org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2$$anonfun$apply$2.apply(QueryPlan.scala:123) > > The other thing I tried is > > df.columns.filter(col_name => col_name.startWith(“STATE_”) > for (col <- cols) df.drop(col) > > But this other thing doesn’t do anything or hangs up. > > Saif > > > --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org