[
https://issues.apache.org/jira/browse/SPARK-49016?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
ASF GitHub Bot reassigned SPARK-49016:
--------------------------------------
Assignee: Apache Spark
> Spark DataSet.isEmpty behaviour is different on CSV than JSON
> -------------------------------------------------------------
>
> Key: SPARK-49016
> URL: https://issues.apache.org/jira/browse/SPARK-49016
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 3.5.1, 3.4.3
> Reporter: Marius Butan
> Assignee: Apache Spark
> Priority: Major
> Labels: pull-request-available
> Attachments: image-2024-07-26-15-50-10-280.png,
> image-2024-07-26-15-50-24-308.png
>
>
> Spark DataSet.isEmpty behaviour is different on CSV than JSON:
> * CSV → dataSet.isEmpty returns a value for any query
> * JSON → dataSet.isEmpty throws an error when the only filter is
> _corrupt_record is null:
> !image-2024-07-26-15-50-10-280.png!
> Tested version: Spark 3.4.3, Spark 3.5.1
> Expected behaviour: either throw an error for both file types or return the correct value for both
>
> In order to demonstrate the behaviour I added a unit test
>
> test.csv
> {code:java}
> first,second,third{code}
> test.json
> {code:java}
> {"first": "first", "second": "second", "third": "third"}{code}
> Code:
> {noformat}
> import org.apache.spark.sql.Dataset;
> import org.apache.spark.sql.Row;
> import org.apache.spark.sql.SparkSession;
> import org.junit.jupiter.api.AfterEach;
> import org.junit.jupiter.api.BeforeEach;
> import org.junit.jupiter.api.Test;
> public class SparkIsEmptyTest {
> private SparkSession sparkSession;
> @BeforeEach
> void setUp() {
> sparkSession = getSpark();
> }
> @AfterEach
> void after() {
> sparkSession.close();
> }
> @Test
> void testDatasetIsEmptyForCsv() {
> var dataSet = runCsvQuery("select first, second, third,
> _corrupt_record from tempView where _corrupt_record is null");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForJson() {
> var dataSet = runJsonQuery("select first, second, third,
> _corrupt_record from tempView where _corrupt_record is null");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForJsonAnd1Eq1() {
> var dataSet = runJsonQuery(
> "select first, second, third, _corrupt_record from tempView
> where _corrupt_record is null and 1=1");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForCsvAnd1Eq1() {
> var dataSet = runCsvQuery(
> "select first, second, third, _corrupt_record from tempView
> where _corrupt_record is null and 1=1");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForJsonAndOtherCondition() {
> var dataSet = runJsonQuery("select first, second, third,
> _corrupt_record from tempView where _corrupt_record is null and
> first='first'");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForCsvAndOtherCondition() {
> var dataSet = runCsvQuery("select first, second, third,
> _corrupt_record from tempView where _corrupt_record is null and
> first='first'");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForJsonAggregation() {
> var dataSet = runJsonQuery("select count(1) from tempView where
> _corrupt_record is null");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForCsvAggregation() {
> var dataSet = runCsvQuery("select count(1) from tempView where
> _corrupt_record is null");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForJsonAggregationGroupBy() {
> var dataSet = runJsonQuery("select count(1) , first from tempView
> where _corrupt_record is null group by first");
> assert !dataSet.isEmpty();
> }
> @Test
> void testDatasetIsEmptyForCsvAggregationGroupBy() {
> var dataSet = runJsonQuery("select count(1) , first from tempView
> where _corrupt_record is null group by first");
> assert !dataSet.isEmpty();
> }
> private SparkSession getSpark() {
> return SparkSession.builder()
> .master("local")
> .appName("spark-dataset-isEmpty-issue")
> .config("spark.ui.enabled", "false")
> .getOrCreate();
> }
> private Dataset<?> runJsonQuery(String query) {
> Dataset<Row> dataset = sparkSession.read()
> .schema("first STRING,second String, third STRING,
> _corrupt_record STRING")
> .option("columnNameOfCorruptRecord", "_corrupt_record")
> .json("test.json");
> dataset.createOrReplaceTempView("tempView");
> var dataSet = sparkSession.sql(query);
> dataSet.show();
> return dataSet;
> }
> private Dataset<?> runCsvQuery(String query) {
> Dataset<Row> dataset = sparkSession.read()
> .schema("first STRING,second String, third STRING,
> _corrupt_record STRING")
> .option("columnNameOfCorruptRecord", "_corrupt_record")
> .csv("test.csv");
> dataset.createOrReplaceTempView("tempView");
> var dataSet = sparkSession.sql(query);
> dataSet.show();
> return dataSet;
> }
> }{noformat}
> Result:
> !image-2024-07-26-15-50-24-308.png!
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]