Author: daijy
Date: Tue Jan 6 01:06:26 2015
New Revision: 1649702
URL: http://svn.apache.org/r1649702
Log:
PIG-4213: CSVExcelStorage not quoting texts containing \r (CR) when storing
Modified:
pig/trunk/CHANGES.txt
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
Modified: pig/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1649702&r1=1649701&r2=1649702&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Tue Jan 6 01:06:26 2015
@@ -30,6 +30,8 @@ PIG-4333: Split BigData tests into multi
BUG FIXES
+PIG-4213: CSVExcelStorage not quoting texts containing \r (CR) when storing (alfonso.nishikawa via daijy)
+
PIG-2647: Split Combining drops splits with empty getLocations() (tmwoodruff via daijy)
PIG-4294: Enable unit test "TestNestedForeach" for spark (kellyzly via rohini)
Modified:
pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java?rev=1649702&r1=1649701&r2=1649702&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java (original)
+++ pig/trunk/contrib/piggybank/java/src/main/java/org/apache/pig/piggybank/storage/CSVExcelStorage.java Tue Jan 6 01:06:26 2015
@@ -103,6 +103,7 @@ public class CSVExcelStorage extends Pig
public static enum Headers { DEFAULT, READ_INPUT_HEADER,
SKIP_INPUT_HEADER, WRITE_OUTPUT_HEADER, SKIP_OUTPUT_HEADER }
protected final static byte LINEFEED = '\n';
+ protected final static byte CARRIAGE_RETURN = '\r';
protected final static byte DOUBLE_QUOTE = '"';
protected final static byte RECORD_DEL = LINEFEED;
@@ -293,6 +294,7 @@ public class CSVExcelStorage extends Pig
ArrayList<Object> mProtoTuple = new ArrayList<Object>();
int embeddedNewlineIndex = -1;
+ int embeddedCarriageReturn = -1;
String fieldStr = null;
// For good debug messages:
int fieldCounter = -1;
@@ -315,13 +317,15 @@ public class CSVExcelStorage extends Pig
// If any field delimiters are in the field, or if we did replace
// any double quotes with a pair of double quotes above,
// or if the string includes a newline character (LF:\n:0x0A)
+ // or includes a carriage return (CR:\r:0x0D)
// and we are to allow newlines in fields,
// then the entire field must be enclosed in double quotes:
embeddedNewlineIndex = fieldStr.indexOf(LINEFEED);
+ embeddedCarriageReturn = fieldStr.indexOf(CARRIAGE_RETURN);
if ((fieldStr.indexOf(fieldDelimiter) != -1) ||
(fieldStr.indexOf(DOUBLE_QUOTE) != -1) ||
- (multilineTreatment == Multiline.YES) && (embeddedNewlineIndex
!= -1)) {
+ (multilineTreatment == Multiline.YES) && (embeddedNewlineIndex
!= -1 || embeddedCarriageReturn != -1)) {
fieldStr = "\"" + fieldStr + "\"";
}
Modified:
pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java
URL:
http://svn.apache.org/viewvc/pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java?rev=1649702&r1=1649701&r2=1649702&view=diff
==============================================================================
--- pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java (original)
+++ pig/trunk/contrib/piggybank/java/src/test/java/org/apache/pig/piggybank/test/storage/TestCSVExcelStorage.java Tue Jan 6 01:06:26 2015
@@ -19,6 +19,7 @@
package org.apache.pig.piggybank.test.storage;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
@@ -27,10 +28,11 @@ import java.util.Properties;
import junit.framework.Assert;
import org.apache.commons.lang.StringUtils;
-
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import
org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
+import org.apache.pig.builtin.mock.Storage;
+import org.apache.pig.builtin.mock.Storage.Data;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.tools.parameters.ParseException;
@@ -58,6 +60,7 @@ public class TestCSVExcelStorage {
"1st Field,\"A poem that continues\n" +
"for several lines\n" +
"do we\n" +
+ "(even with \r)" +
"handle that?\",Good,Fairy\n";
String[] testStrCommaArray =
@@ -69,7 +72,7 @@ public class TestCSVExcelStorage {
"\"Conrad\nEmil\",Dinger,40",
"Emil,\"\nDinger\",40",
"Quote problem,\"My \"\"famous\"\"\nsong\",60",
- "1st Field,\"A poem that continues\nfor several lines\ndo
we\nhandle that?\",Good,Fairy",
+ "1st Field,\"A poem that continues\nfor several lines\ndo
we\n(even with \r)handle that?\",Good,Fairy",
};
@SuppressWarnings("serial")
@@ -83,7 +86,7 @@ public class TestCSVExcelStorage {
add(Util.createTuple(new String[] {"Conrad\nEmil", "Dinger",
"40"}));
add(Util.createTuple(new String[] {"Emil", "\nDinger", "40"}));
add(Util.createTuple(new String[] {"Quote problem", "My
\"famous\"\nsong", "60"}));
- add(Util.createTuple(new String[] {"1st Field", "A poem that
continues\nfor several lines\ndo we\nhandle that?", "Good", "Fairy"}));
+ add(Util.createTuple(new String[] {"1st Field", "A poem that
continues\nfor several lines\ndo we\n(even with \n)handle that?", "Good",
"Fairy"}));
}
};
@@ -104,7 +107,8 @@ public class TestCSVExcelStorage {
add(Util.createTuple(new String[] {"1st Field", "A poem that
continues"}));
add(Util.createTuple(new String[] {"for several lines"}));
add(Util.createTuple(new String[] {"do we"}));
- add(Util.createTuple(new String[] {"handle that?,Good,Fairy"}));
// Trailing double quote eats rest of line
+ add(Util.createTuple(new String[] {"(even with "}));
+ add(Util.createTuple(new String[] {")handle that?,Good,Fairy"}));
// Trailing double quote eats rest of line
}
};
@@ -161,6 +165,7 @@ public class TestCSVExcelStorage {
"1,,,,\"",
"qwe",
"rty\", uiop",
+ "1,10,2.718,3.14159,\"abc\rdef\",uiop",
"1,,,,\"qwe,rty\",uiop",
"1,,,,\"q\"\"wert\"\"y\", uiop",
"1,,,,qwerty,\"u\"\"io\"\"p\""
@@ -338,6 +343,7 @@ public class TestCSVExcelStorage {
"(1,10,,3.15159,,uiop)", // extra field (input data
has "moose" after "uiop")
"(1,,2.718,,qwerty,uiop)", // quoted regular fields
(2.718, qwerty, and uiop in quotes)
"(1,,,,\nqwe\nrty, uiop)", // newlines in quotes
+ "(1,10,2.718,3.14159,abc\ndef,uiop)",// After LOAD \r => \n
(PIG-4213)
"(1,,,,qwe,rty,uiop)", // commas in quotes
"(1,,,,q\"wert\"y, uiop)", // quotes in quotes
"(1,,,,qwerty,u\"io\"p)" // quotes in quotes at the
end of a line
@@ -384,6 +390,8 @@ public class TestCSVExcelStorage {
"(1,,,,\")", // since we are just using
TextLoader for verification
"(qwe)", // it treats the
linebreaks as meaning separate records
"(rty\", uiop)", // but as shown in the
load() test, CSVExcelStorage will read these properly
+ "(1,10,2.718,3.14159,\"abc)",
+ "(def\",uiop)",
"(1,,,,\"qwe,rty\",uiop)",
"(1,,,,\"q\"\"wert\"\"y\", uiop)",
"(1,,,,qwerty,\"u\"\"io\"\"p\")"
@@ -426,4 +434,48 @@ public class TestCSVExcelStorage {
Assert.assertEquals(StringUtils.join(expected, "\n"),
StringUtils.join(data, "\n"));
}
+
+ // Test that STORE stores CR (\r) quoted/unquoted in
yes_multiline/no_multiline
+ @Test
+ public void storeCR() throws IOException {
+ ArrayList<Tuple> inputTuples = new ArrayList<Tuple>();
+ inputTuples.add(Storage.tuple(1,"text","a line\rand another line to
write"));
+ String expected = "1,text,\"a line\rand another line to write\"\n";
+ String expectedNoMultiline = "1,text,a line\rand another line to
write\n";
+
+ // Prepare the input using mock.Storage() since this will not
interpret \r
+ Data data = Storage.resetData(pig);
+ data.set("inputTuples", inputTuples);
+
+ // Test for quoted when YES_MULTILINE
+ // Execute
+ String testOut = dataDir + "csv_cr_quoted_output_yes_multiline" ;
+ String script = "A = load 'inputTuples' USING mock.Storage() as
(f1:int, f2:chararray, f3:chararray);" +
+ "STORE A INTO '" + testOut + "' USING " +
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',',
'YES_MULTILINE', 'UNIX');";
+ Util.registerMultiLineQuery(pig, script);
+ // Load result
+ FileInputStream resultFile = new FileInputStream(testOut +
"/part-m-00000");
+ byte[] actualBytes = new byte[resultFile.available()];
+ resultFile.read(actualBytes);
+ resultFile.close();
+ String actual = new String(actualBytes);
+ Assert.assertEquals(expected, actual);
+
+ // Test for unquoted when NO_MULTILINE
+ // Execute
+ testOut = dataDir + "csv_cr_quoted_output_no_multiline" ;
+ script = "A = load 'inputTuples' USING mock.Storage() as (f1:int,
f2:chararray, f3:chararray);" +
+ "STORE A INTO '" + testOut + "' USING " +
+ "org.apache.pig.piggybank.storage.CSVExcelStorage(',',
'NO_MULTILINE', 'UNIX');";
+ Util.registerMultiLineQuery(pig, script);
+ // Load result
+ resultFile = new FileInputStream(testOut + "/part-m-00000");
+ actualBytes = new byte[resultFile.available()];
+ resultFile.read(actualBytes);
+ resultFile.close();
+ actual = new String(actualBytes);
+ Assert.assertEquals(expectedNoMultiline, actual);
+ }
+
}