[
https://issues.apache.org/jira/browse/FLINK-3901?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15332758#comment-15332758
]
ASF GitHub Bot commented on FLINK-3901:
---------------------------------------
Github user fhueske commented on a diff in the pull request:
https://github.com/apache/flink/pull/1989#discussion_r67257863
--- Diff:
flink-libraries/flink-table/src/test/java/org/apache/flink/api/java/io/RowCsvInputFormatTest.java
---
@@ -0,0 +1,1075 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.api.java.io;
+
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.flink.api.common.io.ParseException;
+import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.api.java.typeutils.PojoTypeInfo;
+import org.apache.flink.api.java.typeutils.TypeExtractor;
+import org.apache.flink.api.table.Row;
+import org.apache.flink.api.table.typeutils.RowTypeInfo;
+import org.apache.flink.configuration.Configuration;
+import org.apache.flink.core.fs.FileInputSplit;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.types.parser.FieldParser;
+import org.apache.flink.types.parser.StringParser;
+import org.junit.Test;
+
+public class RowCsvInputFormatTest {
+
+ private static final Path PATH = new Path("an/ignored/file/");
+
+ //Static variables for testing the removal of \r\n to \n
+ private static final String FIRST_PART = "That is the first part";
+
+ private static final String SECOND_PART = "That is the second part";
+
+ @Test
+ public void ignoreInvalidLines() {
+ try {
+ String fileContent =
+ "header1|header2|header3|\n"+
+ "this is|1|2.0|\n"+
+ "//a comment\n" +
+ "a test|3|4.0|\n" +
+ "#next|5|6.0|\n";
+
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.DOUBLE_TYPE_INFO
+ });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+ format.setLenient(false);
+
+ Configuration parameters = new Configuration();
+ format.configure(parameters);
+ format.open(split);
+
+ Row result = new Row(3);
+
+ try {
+ result = format.nextRecord(result);
+ fail("Parse Exception was not thrown! (Invalid
int value)");
+ } catch (ParseException ex) {
+ }
+
+ // if format has lenient == false this can be asserted
only after FLINK-3908
+// result = format.nextRecord(result);
+// assertNotNull(result);
+// assertEquals("this is", result.productElement(0));
+// assertEquals(new Integer(1), result.productElement(1));
+// assertEquals(new Double(2.0), result.productElement(2));
+//
+// result = format.nextRecord(result);
+// assertNotNull(result);
+// assertEquals("a test", result.productElement(0));
+// assertEquals(new Integer(3), result.productElement(1));
+// assertEquals(new Double(4.0), result.productElement(2));
+//
+// result = format.nextRecord(result);
+// assertNotNull(result);
+// assertEquals("#next", result.productElement(0));
+// assertEquals(new Integer(5), result.productElement(1));
+// assertEquals(new Double(6.0), result.productElement(2));
+//
+// result = format.nextRecord(result);
+// assertNull(result);
+
+ //re-open with lenient = true
+ format.setLenient(true);
+ format.configure(parameters);
+ format.open(split);
+
+ result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("header1", result.productElement(0));
+ assertNull(result.productElement(1));
+ assertNull(result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("this is", result.productElement(0));
+ assertEquals(new Integer(1), result.productElement(1));
+ assertEquals(new Double(2.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("a test", result.productElement(0));
+ assertEquals(new Integer(3), result.productElement(1));
+ assertEquals(new Double(4.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("#next", result.productElement(0));
+ assertEquals(new Integer(5), result.productElement(1));
+ assertEquals(new Double(6.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ }
+ catch (Exception ex) {
+ ex.printStackTrace();
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void ignoreSingleCharPrefixComments() {
+ try {
+ final String fileContent =
+ "#description of the data\n" +
+ "#successive commented line\n" +
+ "this is|1|2.0|\n" +
+ "a test|3|4.0|\n" +
+ "#next|5|6.0|\n";
+
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.DOUBLE_TYPE_INFO });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+ format.setCommentPrefix("#");
+
+ Configuration parameters = new Configuration();
+ format.configure(parameters);
+ format.open(split);
+
+ Row result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("this is", result.productElement(0));
+ assertEquals(new Integer(1), result.productElement(1));
+ assertEquals(new Double(2.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("a test", result.productElement(0));
+ assertEquals(new Integer(3), result.productElement(1));
+ assertEquals(new Double(4.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ }
+ catch (Exception ex) {
+ ex.printStackTrace();
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void ignoreMultiCharPrefixComments() {
+ try {
+
+
+ final String fileContent = "//description of the
data\n" +
+ "//successive commented line\n" +
+ "this is|1|2.0|\n"+
+ "a test|3|4.0|\n" +
+ "//next|5|6.0|\n";
+
+ final FileInputSplit split =
createTempFile(fileContent);
+
+ final RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.DOUBLE_TYPE_INFO });
+ final CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+ format.setCommentPrefix("//");
+
+ final Configuration parameters = new Configuration();
+ format.configure(parameters);
+ format.open(split);
+
+ Row result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("this is", result.productElement(0));
+ assertEquals(new Integer(1), result.productElement(1));
+ assertEquals(new Double(2.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("a test", result.productElement(0));
+ assertEquals(new Integer(3), result.productElement(1));
+ assertEquals(new Double(4.0), result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ }
+ catch (Exception ex) {
+ ex.printStackTrace();
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void readStringFields() {
+ try {
+ String fileContent = "abc|def|ghijk\nabc||hhg\n|||";
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO
+ });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+
+ final Configuration parameters = new Configuration();
+ format.configure(parameters);
+ format.open(split);
+
+ Row result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("abc", result.productElement(0));
+ assertEquals("def", result.productElement(1));
+ assertEquals("ghijk", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("abc", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("hhg", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ assertTrue(format.reachedEnd());
+ }
+ catch (Exception ex) {
+ ex.printStackTrace();
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void readMixedQuotedStringFields() {
+ try {
+ String fileContent =
"@a|b|c@|def|@ghijk@\nabc||@|hhg@\n|||";
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO
+ });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+
+ Configuration parameters = new Configuration();
+ format.configure(parameters);
+ format.enableQuotedStringParsing('@');
+ format.open(split);
+
+ Row result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("a|b|c", result.productElement(0));
+ assertEquals("def", result.productElement(1));
+ assertEquals("ghijk", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("abc", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("|hhg", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ assertTrue(format.reachedEnd());
+ }
+ catch (Exception ex) {
+ ex.printStackTrace();
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void readStringFieldsWithTrailingDelimiters() {
+ try {
+ String fileContent =
"abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO,
+ BasicTypeInfo.STRING_TYPE_INFO
+ });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+
+ format.setFieldDelimiter("|-");
+
+ format.configure(new Configuration());
+ format.open(split);
+
+ Row result = new Row(3);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("abc", result.productElement(0));
+ assertEquals("def", result.productElement(1));
+ assertEquals("ghijk", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("abc", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("hhg", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals("", result.productElement(0));
+ assertEquals("", result.productElement(1));
+ assertEquals("", result.productElement(2));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ assertTrue(format.reachedEnd());
+ }
+ catch (Exception ex) {
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void testIntegerFields() throws IOException {
+ try {
+ String fileContent =
"111|222|333|444|555\n666|777|888|999|000|\n";
+ FileInputSplit split = createTempFile(fileContent);
+
+ RowTypeInfo typeInfo = new RowTypeInfo(
+ new TypeInformation<?>[] {
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO
+ });
+ CsvInputFormat<Row> format = new
RowCsvInputFormat(PATH, "\n", "|", typeInfo);
+
+ format.setFieldDelimiter("|");
+
+ format.configure(new Configuration());
+ format.open(split);
+
+ Row result = new Row(5);
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals(Integer.valueOf(111),
result.productElement(0));
+ assertEquals(Integer.valueOf(222),
result.productElement(1));
+ assertEquals(Integer.valueOf(333),
result.productElement(2));
+ assertEquals(Integer.valueOf(444),
result.productElement(3));
+ assertEquals(Integer.valueOf(555),
result.productElement(4));
+
+ result = format.nextRecord(result);
+ assertNotNull(result);
+ assertEquals(Integer.valueOf(666),
result.productElement(0));
+ assertEquals(Integer.valueOf(777),
result.productElement(1));
+ assertEquals(Integer.valueOf(888),
result.productElement(2));
+ assertEquals(Integer.valueOf(999),
result.productElement(3));
+ assertEquals(Integer.valueOf(000),
result.productElement(4));
+
+ result = format.nextRecord(result);
+ assertNull(result);
+ assertTrue(format.reachedEnd());
+ }
+ catch (Exception ex) {
+ fail("Test failed due to a " + ex.getClass().getName()
+ ": " + ex.getMessage());
+ }
+ }
+
+ @Test
+ public void testEmptyFields() throws IOException {
+ try{
+ String fileContent =
+ "|0|0|0|0|0|\n" +
+ "1||1|1|1|1|\n" +
+ "2|2||2|2|2|\n" +
+ "3|3|3||3|3|\n" +
+ "4|4|4|4||4|\n" +
+ "5|5|5|5|5||\n";
+
+ FileInputSplit split = createTempFile(fileContent);
+
+ //TODO: FLOAT_TYPE_INFO and DOUBLE_TYPE_INFO don't
handle correctly null values
+ RowTypeInfo typeInfo = new RowTypeInfo(new
TypeInformation<?>[] {
+ BasicTypeInfo.SHORT_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+ BasicTypeInfo.LONG_TYPE_INFO,
+ BasicTypeInfo.INT_TYPE_INFO,
+// BasicTypeInfo.FLOAT_TYPE_INFO,
--- End diff --
Can these commented-out lines be removed?
> Create a RowCsvInputFormat to use as default CSV IF in Table API
> ----------------------------------------------------------------
>
> Key: FLINK-3901
> URL: https://issues.apache.org/jira/browse/FLINK-3901
> Project: Flink
> Issue Type: Improvement
> Affects Versions: 1.0.2
> Reporter: Flavio Pompermaier
> Assignee: Flavio Pompermaier
> Priority: Minor
> Labels: csv, null-values, row, tuple
>
> At the moment the Table API reads CSVs using the TupleCsvInputFormat, which
> has two big limitations: a maximum of 25 fields and its handling of null values.
> A new input format (IF) producing Row objects is necessary to avoid those limitations.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)