This is an automated email from the ASF dual-hosted git repository.

snlee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new e1a66512dc CSVRecordReader – Set header record when header is explicitly provided (#11594)
e1a66512dc is described below

commit e1a66512dcb9d8ac5bf6ef5d1454961fc6d997ad
Author: Ragesh Rajagopalan <[email protected]>
AuthorDate: Thu Sep 14 20:44:11 2023 -0700

    CSVRecordReader – Set header record when header is explicitly provided (#11594)
---
 .../plugin/inputformat/csv/CSVRecordReader.java    | 11 ++++--
 .../inputformat/csv/CSVRecordReaderTest.java       | 45 +++++++++++++++++++++-
 .../src/test/resources/dataFileWithNoHeader2.csv   |  4 ++
 .../test/resources/dataFileWithValidHeaders.csv    |  5 +++
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 49339ac218..c4dc8c167f 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -137,11 +137,11 @@ public class CSVRecordReader implements RecordReader {
       }
 
       if (_isHeaderProvided) {
+        _headerMap = parseLineAsHeader(config.getHeader());
+        _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build();
         if (!_useLineIterator) {
           validateHeaderForDelimiter(delimiter, config.getHeader(), _format);
         }
-        _headerMap = parseLineAsHeader(config.getHeader());
-        _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build();
       }
 
       if (config.isMultiValueDelimiterEnabled()) {
@@ -329,7 +329,12 @@ public class CSVRecordReader implements RecordReader {
       // read the first line
       String headerLine = _bufferedReader.readLine();
       _headerMap = parseLineAsHeader(headerLine);
-      _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build();
+      _format = _format.builder()
+          // If header isn't provided, the first line would be set as header and the 'skipHeader' property
+          // is set to false.
+          .setSkipHeaderRecord(false)
+          .setHeader(_headerMap.keySet().toArray(new String[0]))
+          .build();
     }
     _nextLine = _bufferedReader.readLine();
   }
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
index e174ced7e5..d245fb33c0 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
@@ -532,6 +532,46 @@ public class CSVRecordReaderTest extends AbstractRecordReaderTest {
     // Note: The default CSVRecordReader cannot handle unparseable rows
   }
 
+  @Test
+  public void testReadingDataFileWithNoHeaderAndDataRecordsWithEmptyValues()
+      throws URISyntaxException, IOException {
+    URI uri = ClassLoader.getSystemResource("dataFileWithNoHeader2.csv").toURI();
+    File dataFile = new File(uri);
+
+    // test using line iterator
+    CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig();
+    readerConfig.setSkipUnParseableLines(true);
+    readerConfig.setHeader("key,num0,num1");
+    List<GenericRow> genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+    Assert.assertEquals(4, genericRows.size());
+
+    // test using default CSVRecordReader
+    readerConfig.setSkipUnParseableLines(false);
+    genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+    Assert.assertEquals(4, genericRows.size());
+  }
+
+  @Test
+  public void testReadingDataFileWithValidHeaders()
+      throws URISyntaxException, IOException {
+    URI uri = ClassLoader.getSystemResource("dataFileWithValidHeaders.csv").toURI();
+    File dataFile = new File(uri);
+
+    // test using line iterator
+    CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig();
+    readerConfig.setSkipUnParseableLines(true);
+    // No explicit header is set and attempt to skip the header should be ignored. 1st line would be treated as the
+    // header line.
+    readerConfig.setSkipHeader(false);
+    List<GenericRow> genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+    Assert.assertEquals(4, genericRows.size());
+
+    // test using default CSVRecordReader
+    readerConfig.setSkipUnParseableLines(false);
+    genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+    Assert.assertEquals(4, genericRows.size());
+  }
+
   private List<GenericRow> readCSVRecords(File dataFile,
       CSVRecordReaderConfig readerConfig, GenericRow genericRow, boolean rewind)
       throws IOException {
@@ -543,10 +583,11 @@ public class CSVRecordReaderTest extends AbstractRecordReaderTest {
       while (recordReader.hasNext()) {
         if (genericRow != null) {
           recordReader.next(reuse);
+          genericRows.add(reuse);
         } else {
-          recordReader.next();
+          GenericRow nextRow = recordReader.next();
+          genericRows.add(nextRow);
         }
-        genericRows.add(genericRow);
       }
 
       if (rewind) {
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv
new file mode 100644
index 0000000000..e54016ac4c
--- /dev/null
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv
@@ -0,0 +1,4 @@
+"key00",12.3,8.42
+"key01",,7.1
+"key02",,16.81
+"key03",,7.12
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv
new file mode 100644
index 0000000000..010cab05fa
--- /dev/null
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv
@@ -0,0 +1,5 @@
+"key","num0","num1"
+"key00",12.3,8.42
+"key01",,7.1
+"key02",,16.81
+"key03",,7.12


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to