[GitHub] [arrow] lidavidm commented on a change in pull request #11836: ARROW-14903: [C++] Enable CSV Writer to control string to be used for missing data

GitBox Thu, 02 Dec 2021 05:33:39 -0800


lidavidm commented on a change in pull request #11836:
URL: https://github.com/apache/arrow/pull/11836#discussion_r761095033




##########
File path: cpp/src/arrow/csv/writer_test.cc
##########
@@ -69,25 +70,49 @@ std::vector<WriterTestParams> GenerateTestCases() {
                              { "a": 124, "b\"": "a\"\"b\"" },
                              { "d": 0 },
                              { "e": 86400000 },
-                             { "f": 1078016523 }])";
-  std::string expected_without_header = std::string("1,,-1,,,") + "\n" +       
// line 1
-                                        R"(1,"abc""efg",2324,,,)" + "\n" +     
// line 2
-                                        R"(,"abcd",5467,,,)" + "\n" +          
// line 3
-                                        R"(,,,,,)" + "\n" +                    
// line 4
-                                        R"(546,"",517,,,)" + "\n" +            
// line 5
-                                        R"(124,"a""""b""",,,,)" + "\n" +       
// line 6
-                                        R"(,,,1970-01-01,,)" + "\n" +          
// line 7
-                                        R"(,,,,1970-01-02,)" + "\n" +          
// line 8
-                                        R"(,,,,,2004-02-29 01:02:03)" + "\n";  
// line 9
+                             { "f": 1078016523 },
+                             { "b\"": "NA" }])";
+  std::string expected_without_header = std::string("1,,-1,,,") + "\n" +       
 // line 1
+                                        R"(1,"abc""efg",2324,,,)" + "\n" +     
 // line 2
+                                        R"(,"abcd",5467,,,)" + "\n" +          
 // line 3
+                                        R"(,,,,,)" + "\n" +                    
 // line 4
+                                        R"(546,"",517,,,)" + "\n" +            
 // line 5
+                                        R"(124,"a""""b""",,,,)" + "\n" +       
 // line 6
+                                        R"(,,,1970-01-01,,)" + "\n" +          
 // line 7
+                                        R"(,,,,1970-01-02,)" + "\n" +          
 // line 8
+                                        R"(,,,,,2004-02-29 01:02:03)" + "\n" + 
 // line 9
+                                        R"(,"NA",,,,)" + "\n";                 
 // line 10
+
   std::string expected_header = std::string(R"("a","b""","c ","d","e","f")") + 
"\n";
 
+  auto schema_custom_na = schema({field("g", uint64()), field("h", utf8())});
+
+  auto populated_batch_custom_na = R"([{"g": 42, "h": "NA"},
+                                        {}])";
+
+  std::string expected_custom_na = std::string(R"(42,"NA")") + "\n" +  // line 
1
+                                   R"(NA,NA)" + "\n";                  // line 
2
+
+  std::string expected_custom_quoted_na = std::string(R"(42,"NA")") + "\n" +  
// line 1
+                                          R"(""NA"",""NA"")" + "\n";          
// line 2
+
   return std::vector<WriterTestParams>{
-      {abc_schema, "[]", DefaultTestOptions(/*header=*/false), ""},
-      {abc_schema, "[]", DefaultTestOptions(/*header=*/true), expected_header},
-      {abc_schema, populated_batch, DefaultTestOptions(/*header=*/false),
+      {abc_schema, "[]", DefaultTestOptions(/*include_header=*/false, 
/*null_string=*/""),
+       ""},
+      {abc_schema, "[]", DefaultTestOptions(/*include_header=*/true, 
/*null_string=*/""),
+       expected_header},
+      {abc_schema, populated_batch,
+       DefaultTestOptions(/*include_header=*/false, /*null_string=*/""),
        expected_without_header},
-      {abc_schema, populated_batch, DefaultTestOptions(/*header=*/true),
-       expected_header + expected_without_header}};
+      {abc_schema, populated_batch,
+       DefaultTestOptions(/*include_header=*/true, /*null_string=*/""),
+       expected_header + expected_without_header},
+      {schema_custom_na, populated_batch_custom_na,
+       DefaultTestOptions(/*include_header=*/false, /*null_string=*/"NA"),
+       expected_custom_na},
+      {schema_custom_na, populated_batch_custom_na,
+       DefaultTestOptions(/*include_header=*/false, /*null_string=*/R"("NA")"),
+       expected_custom_quoted_na}};

Review comment:
       I'm having trouble getting this one to round-trip. With double quotes 
(though as Antoine points out this is wrong):
   
   ```
   >>> csv.read_csv("test2.csv", 
convert_options=csv.ConvertOptions(strings_can_be_null=True, 
quoted_strings_can_be_null=False, null_values=['"NA"'], column_types={"g": 
pyarrow.int64()}))
   Traceback (most recent call last):
     File "<stdin>", line 1, in <module>
     File "pyarrow/_csv.pyx", line 886, in pyarrow._csv.read_csv
     File "pyarrow/_csv.pyx", line 895, in pyarrow._csv.read_csv
     File "pyarrow/error.pxi", line 143, in 
pyarrow.lib.pyarrow_internal_check_status
     File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
   pyarrow.lib.ArrowInvalid: In CSV column #0: CSV conversion error to int64: 
invalid value 'NA""'
   ```
   
   With triple quotes:
   
   ```
   >>> csv.read_csv("test2.csv", 
convert_options=csv.ConvertOptions(strings_can_be_null=True, 
quoted_strings_can_be_null=False, null_values=['"NA"'], column_types={"g": 
pyarrow.int64()}))
   Traceback (most recent call last):
     File "<stdin>", line 1, in <module>
     File "pyarrow/_csv.pyx", line 886, in pyarrow._csv.read_csv
     File "pyarrow/_csv.pyx", line 895, in pyarrow._csv.read_csv
     File "pyarrow/error.pxi", line 143, in 
pyarrow.lib.pyarrow_internal_check_status
     File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status
   pyarrow.lib.ArrowInvalid: In CSV column #0: CSV conversion error to int64: 
invalid value '"NA"'
   ```
   
   Might be an issue on the read side. I don't think it's an issue to be 
addressed here.

##########
File path: cpp/src/arrow/csv/writer_test.cc
##########
@@ -69,25 +70,49 @@ std::vector<WriterTestParams> GenerateTestCases() {
                              { "a": 124, "b\"": "a\"\"b\"" },
                              { "d": 0 },
                              { "e": 86400000 },
-                             { "f": 1078016523 }])";
-  std::string expected_without_header = std::string("1,,-1,,,") + "\n" +       
// line 1
-                                        R"(1,"abc""efg",2324,,,)" + "\n" +     
// line 2
-                                        R"(,"abcd",5467,,,)" + "\n" +          
// line 3
-                                        R"(,,,,,)" + "\n" +                    
// line 4
-                                        R"(546,"",517,,,)" + "\n" +            
// line 5
-                                        R"(124,"a""""b""",,,,)" + "\n" +       
// line 6
-                                        R"(,,,1970-01-01,,)" + "\n" +          
// line 7
-                                        R"(,,,,1970-01-02,)" + "\n" +          
// line 8
-                                        R"(,,,,,2004-02-29 01:02:03)" + "\n";  
// line 9
+                             { "f": 1078016523 },
+                             { "b\"": "NA" }])";
+  std::string expected_without_header = std::string("1,,-1,,,") + "\n" +       
 // line 1
+                                        R"(1,"abc""efg",2324,,,)" + "\n" +     
 // line 2
+                                        R"(,"abcd",5467,,,)" + "\n" +          
 // line 3
+                                        R"(,,,,,)" + "\n" +                    
 // line 4
+                                        R"(546,"",517,,,)" + "\n" +            
 // line 5
+                                        R"(124,"a""""b""",,,,)" + "\n" +       
 // line 6
+                                        R"(,,,1970-01-01,,)" + "\n" +          
 // line 7
+                                        R"(,,,,1970-01-02,)" + "\n" +          
 // line 8
+                                        R"(,,,,,2004-02-29 01:02:03)" + "\n" + 
 // line 9
+                                        R"(,"NA",,,,)" + "\n";                 
 // line 10
+
   std::string expected_header = std::string(R"("a","b""","c ","d","e","f")") + 
"\n";
 
+  auto schema_custom_na = schema({field("g", uint64()), field("h", utf8())});
+
+  auto populated_batch_custom_na = R"([{"g": 42, "h": "NA"},
+                                        {}])";
+
+  std::string expected_custom_na = std::string(R"(42,"NA")") + "\n" +  // line 
1
+                                   R"(NA,NA)" + "\n";                  // line 
2
+
+  std::string expected_custom_quoted_na = std::string(R"(42,"NA")") + "\n" +  
// line 1
+                                          R"(""NA"",""NA"")" + "\n";          
// line 2
+
   return std::vector<WriterTestParams>{
-      {abc_schema, "[]", DefaultTestOptions(/*header=*/false), ""},
-      {abc_schema, "[]", DefaultTestOptions(/*header=*/true), expected_header},
-      {abc_schema, populated_batch, DefaultTestOptions(/*header=*/false),
+      {abc_schema, "[]", DefaultTestOptions(/*include_header=*/false, 
/*null_string=*/""),
+       ""},
+      {abc_schema, "[]", DefaultTestOptions(/*include_header=*/true, 
/*null_string=*/""),
+       expected_header},
+      {abc_schema, populated_batch,
+       DefaultTestOptions(/*include_header=*/false, /*null_string=*/""),
        expected_without_header},
-      {abc_schema, populated_batch, DefaultTestOptions(/*header=*/true),
-       expected_header + expected_without_header}};
+      {abc_schema, populated_batch,
+       DefaultTestOptions(/*include_header=*/true, /*null_string=*/""),
+       expected_header + expected_without_header},
+      {schema_custom_na, populated_batch_custom_na,
+       DefaultTestOptions(/*include_header=*/false, /*null_string=*/"NA"),
+       expected_custom_na},

Review comment:
       This round-trips with `csv.ConvertOptions(strings_can_be_null=True, 
quoted_strings_can_be_null=False)` which is to be expected.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] lidavidm commented on a change in pull request #11836: ARROW-14903: [C++] Enable CSV Writer to control string to be used for missing data

Reply via email to