hvanhovell commented on code in PR #40217: URL: https://github.com/apache/spark/pull/40217#discussion_r1124627948
########## connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala: ########## @@ -0,0 +1,524 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.{lang => jl} +import java.util.Locale + +import scala.collection.JavaConverters._ + +import org.apache.spark.connect.proto.{DataType => GDataType, NAReplace, Relation} +import org.apache.spark.connect.proto.Expression.{Literal => GLiteral} +import org.apache.spark.connect.proto.NAReplace.Replacement +import org.apache.spark.sql.types.{BooleanType, DataType, DoubleType, FloatType, IntegerType, LongType, StringType} + +/** + * Functionality for working with missing data in `DataFrame`s. + * + * @since 3.4.0 + */ +final class DataFrameNaFunctions private[sql] (sparkSession: SparkSession, root: Relation) { + + /** + * Returns a new `DataFrame` that drops rows containing any null or NaN values. + * + * @since 3.4.0 + */ + def drop(): DataFrame = buildDropDataFrame(None, None) + + /** + * Returns a new `DataFrame` that drops rows containing null or NaN values. + * + * If `how` is "any", then drop rows containing any null or NaN values. 
If `how` is "all", then + * drop rows only if every column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String): DataFrame = { + val minNonNulls = how.toLowerCase(Locale.ROOT) match { + case "any" => None // No-Op. Do nothing. + case "all" => Some(1) + case _ => throw new IllegalArgumentException(s"how ($how) must be 'any' or 'all'") + } + buildDropDataFrame(None, minNonNulls) + } + + /** + * Returns a new `DataFrame` that drops rows containing any null or NaN values in the specified + * columns. + * + * @since 3.4.0 + */ + def drop(cols: Array[String]): DataFrame = drop(cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing any null or NaN values + * in the specified columns. + * + * @since 3.4.0 + */ + def drop(cols: Seq[String]): DataFrame = buildDropDataFrame(Some(cols), None) + + /** + * Returns a new `DataFrame` that drops rows containing null or NaN values in the specified + * columns. + * + * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. + * If `how` is "all", then drop rows only if every specified column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String, cols: Array[String]): DataFrame = drop(how, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing null or NaN values in + * the specified columns. + * + * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. + * If `how` is "all", then drop rows only if every specified column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String, cols: Seq[String]): DataFrame = { + val minNonNulls = how.toLowerCase(Locale.ROOT) match { + case "any" => None // No-Op. Do nothing. 
+ case "all" => Some(1) + case _ => throw new IllegalArgumentException(s"how ($how) must be 'any' or 'all'") + } + buildDropDataFrame(Some(cols), minNonNulls) + } + + /** + * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and + * non-NaN values. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int): DataFrame = { + buildDropDataFrame(None, Some(minNonNulls)) + } + + /** + * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and + * non-NaN values in the specified columns. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int, cols: Array[String]): DataFrame = drop(minNonNulls, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing less than `minNonNulls` + * non-null and non-NaN values in the specified columns. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int, cols: Seq[String]): DataFrame = { + buildDropDataFrame(Some(cols), Some(minNonNulls)) + } + + private def buildDropDataFrame( + cols: Option[Seq[String]], + minNonNulls: Option[Int]): DataFrame = { + sparkSession.newDataFrame { builder => + val dropNaBuilder = builder.getDropNaBuilder.setInput(root) + cols.foreach(c => dropNaBuilder.addAllCols(c.asJava)) + minNonNulls.foreach(dropNaBuilder.setMinNonNulls) + } + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: Long): DataFrame = { + buildFillDataFrame(None, convertToLiteral(value, LongType)) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. If a + * specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Long, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified + * numeric columns. 
If a specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Long, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), convertToLiteral(value, LongType)) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: Double): DataFrame = { + buildFillDataFrame(None, convertToLiteral(value, DoubleType)) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. If a + * specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Double, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified + * numeric columns. If a specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Double, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), convertToLiteral(value, DoubleType)) + } + + /** + * Returns a new `DataFrame` that replaces null values in string columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: String): DataFrame = { + buildFillDataFrame(None, convertToLiteral(value, StringType)) + } + + /** + * Returns a new `DataFrame` that replaces null values in specified string columns. If a + * specified column is not a string column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: String, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null values in specified string + * columns. If a specified column is not a string column, it is ignored. 
 + * + * @since 3.4.0 + */ + def fill(value: String, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), convertToLiteral(value, StringType)) Review Comment: AFAICT null is not allowed as a fill value; a null will fall through to the default in this code: https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala#L517-L523 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
