This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 26cf0e0b15 GH-38751: [C++][Go][Parquet] Add tests for reading Float16 files in parquet-testing (#38753)
26cf0e0b15 is described below

commit 26cf0e0b154b188499676466579e977829c346f6
Author: Ben Harkins <[email protected]>
AuthorDate: Fri Nov 17 12:35:33 2023 -0500

    GH-38751: [C++][Go][Parquet] Add tests for reading Float16 files in parquet-testing (#38753)
    
    
    
    ### Rationale for this change
    
    Validates compatibility between implementations when reading `Float16` columns.
    
    ### What changes are included in this PR?
    
    - Bumps `parquet-testing` commit to latest to use the recently-added files
    - Adds reader tests for C++ and Go in the same vein as https://github.com/apache/arrow-rs/pull/5003
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No
    
    * Closes: #38751
    
    Authored-by: benibus <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 54 ++++++++++++++++++
 cpp/submodules/parquet-testing                    |  2 +-
 go/parquet/pqarrow/file_reader_test.go            | 67 +++++++++++++++++++++++
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index a314ecbf74..9c6f7a044b 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4099,6 +4099,60 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple("fixed_length_decimal_legacy.parquet", 
::arrow::decimal(13, 2)),
         std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4, 
2))));
 
+// Reads the Float16 reference files from the parquet test data directory and
+// checks that each one decodes to the expected half-float values.
+//
+// Every file is expected to hold a single HALF_FLOAT column in one chunk whose
+// first slot is null; the slots after it are compared against `vals` in order.
+TEST(TestArrowReaderAdHoc, ReadFloat16Files) {
+  using ::arrow::util::Float16;
+  constexpr auto nan = std::numeric_limits<Float16>::quiet_NaN();
+
+  struct TestCase {
+    std::string filename;
+    int32_t len;                // total number of rows, including the leading null
+    std::vector<Float16> vals;  // expected values for rows 1 .. len - 1
+  } test_cases[] = {
+      {"float16_nonzeros_and_nans",
+       8,
+       {Float16(+1.0), Float16(-2.0), nan, Float16(+0.0), Float16(-1.0), Float16(-0.0),
+        Float16(+2.0)}},
+      {"float16_zeros_and_nans", 3, {Float16(+0.0), nan}},
+  };
+
+  const auto pool = ::arrow::default_memory_pool();
+
+  for (const auto& tc : test_cases) {
+    std::string path(test::get_data_dir());
+    path += "/" + tc.filename + ".parquet";
+    ARROW_SCOPED_TRACE("path = ", path);
+
+    // Guard the indexing below: `vals` must describe exactly the rows that
+    // follow the leading null, otherwise `tc.vals[i]` would read out of bounds.
+    ASSERT_EQ(static_cast<size_t>(tc.len), tc.vals.size() + 1);
+
+    std::unique_ptr<FileReader> reader;
+    ASSERT_OK_NO_THROW(
+        FileReader::Make(pool, ParquetFileReader::OpenFile(path, false), &reader));
+    std::shared_ptr<::arrow::Table> table;
+    ASSERT_OK_NO_THROW(reader->ReadTable(&table));
+
+    std::shared_ptr<::arrow::Schema> schema;
+    ASSERT_OK_NO_THROW(reader->GetSchema(&schema));
+    ASSERT_EQ(1, schema->num_fields());
+    ASSERT_EQ(schema->field(0)->type()->id(), ::arrow::Type::HALF_FLOAT);
+
+    ASSERT_EQ(1, table->num_columns());
+    auto column = table->column(0);
+    ASSERT_EQ(tc.len, column->length());
+    ASSERT_EQ(1, column->num_chunks());
+
+    auto chunk = checked_pointer_cast<::arrow::HalfFloatArray>(column->chunk(0));
+    ASSERT_TRUE(chunk->IsNull(0));
+    for (int32_t i = 0; i < tc.len - 1; ++i) {
+      ARROW_SCOPED_TRACE("row = ", i + 1);
+      // Only compare bits of slots that actually hold a value; the contents
+      // behind a null slot are not meaningful.
+      ASSERT_TRUE(chunk->IsValid(i + 1));
+
+      const auto expected = tc.vals[i];
+      const auto actual = Float16::FromBits(chunk->Value(i + 1));
+      if (expected.is_nan()) {
+        // NaN representations aren't guaranteed to be exact on a binary level
+        ASSERT_TRUE(actual.is_nan());
+      } else {
+        // Bitwise comparison so that +0.0 and -0.0 are told apart.
+        ASSERT_EQ(expected.bits(), actual.bits());
+      }
+    }
+  }
+}
+
 // direct-as-possible translation of
 // pyarrow/tests/test_parquet.py::test_validate_schema_write_table
 TEST(TestArrowWriterAdHoc, SchemaMismatch) {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index e45cd23f78..89b685a64c 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit e45cd23f784aab3d6bf0701f8f4e621469ed3be7
+Subproject commit 89b685a64c3117b3023d8684af1f41400841db71
diff --git a/go/parquet/pqarrow/file_reader_test.go b/go/parquet/pqarrow/file_reader_test.go
index 9c1b4252f5..0c52eec9e3 100644
--- a/go/parquet/pqarrow/file_reader_test.go
+++ b/go/parquet/pqarrow/file_reader_test.go
@@ -29,6 +29,7 @@ import (
        "github.com/apache/arrow/go/v15/arrow"
        "github.com/apache/arrow/go/v15/arrow/array"
        "github.com/apache/arrow/go/v15/arrow/decimal128"
+       "github.com/apache/arrow/go/v15/arrow/float16"
        "github.com/apache/arrow/go/v15/arrow/memory"
        "github.com/apache/arrow/go/v15/parquet"
        "github.com/apache/arrow/go/v15/parquet/file"
@@ -100,6 +101,72 @@ func TestArrowReaderAdHocReadDecimals(t *testing.T) {
        }
 }
 
+func TestArrowReaderAdHocReadFloat16s(t *testing.T) {
+       tests := []struct {
+               file string
+               len  int
+               vals []float16.Num
+       }{
+               {"float16_nonzeros_and_nans", 8,
+                       []float16.Num{
+                               float16.New(1.0),
+                               float16.New(-2.0),
+                               float16.NaN(),
+                               float16.New(0.0),
+                               float16.New(-1.0),
+                               float16.New(0.0).Negate(),
+                               float16.New(2.0),
+                       }},
+               {"float16_zeros_and_nans", 3,
+                       []float16.Num{
+                               float16.New(0.0),
+                               float16.NaN(),
+                       }},
+       }
+
+       dataDir := getDataDir()
+       for _, tt := range tests {
+               t.Run(tt.file, func(t *testing.T) {
+                       mem := 
memory.NewCheckedAllocator(memory.DefaultAllocator)
+                       defer mem.AssertSize(t, 0)
+
+                       filename := filepath.Join(dataDir, tt.file+".parquet")
+                       require.FileExists(t, filename)
+
+                       rdr, err := file.OpenParquetFile(filename, false, 
file.WithReadProps(parquet.NewReaderProperties(mem)))
+                       require.NoError(t, err)
+                       defer rdr.Close()
+
+                       arrowRdr, err := pqarrow.NewFileReader(rdr, 
pqarrow.ArrowReadProperties{}, mem)
+                       require.NoError(t, err)
+
+                       tbl, err := arrowRdr.ReadTable(context.Background())
+                       require.NoError(t, err)
+                       defer tbl.Release()
+
+                       assert.EqualValues(t, 1, tbl.NumCols())
+                       assert.Truef(t, 
arrow.TypeEqual(tbl.Schema().Field(0).Type, &arrow.Float16Type{}), "expected: 
%s\ngot: %s", tbl.Schema().Field(0).Type, arrow.Float16Type{})
+
+                       valCol := tbl.Column(0)
+                       assert.EqualValues(t, tt.len, valCol.Len())
+                       assert.Len(t, valCol.Data().Chunks(), 1)
+
+                       chunk := valCol.Data().Chunk(0).(*array.Float16)
+                       assert.True(t, chunk.IsNull(0))
+                       for i := 0; i < tt.len-1; i++ {
+                               expected := tt.vals[i]
+                               actual := chunk.Value(i + 1)
+                               if expected.IsNaN() {
+                                       // NaN representations aren't 
guaranteed to be exact on a binary level
+                                       assert.True(t, actual.IsNaN())
+                               } else {
+                                       assert.Equal(t, expected.Uint16(), 
actual.Uint16())
+                               }
+                       }
+               })
+       }
+}
+
 func TestRecordReaderParallel(t *testing.T) {
        mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
        defer mem.AssertSize(t, 0)

Reply via email to