This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 26cf0e0b15 GH-38751: [C++][Go][Parquet] Add tests for reading Float16
files in parquet-testing (#38753)
26cf0e0b15 is described below
commit 26cf0e0b154b188499676466579e977829c346f6
Author: Ben Harkins <[email protected]>
AuthorDate: Fri Nov 17 12:35:33 2023 -0500
GH-38751: [C++][Go][Parquet] Add tests for reading Float16 files in
parquet-testing (#38753)
### Rationale for this change
Validates compatibility between implementations when reading `Float16`
columns.
### What changes are included in this PR?
- Bumps `parquet-testing` commit to latest to use the recently-added files
- Adds reader tests for C++ and Go in the same vein as
https://github.com/apache/arrow-rs/pull/5003
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* Closes: #38751
Authored-by: benibus <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 54 ++++++++++++++++++
cpp/submodules/parquet-testing | 2 +-
go/parquet/pqarrow/file_reader_test.go | 67 +++++++++++++++++++++++
3 files changed, 122 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index a314ecbf74..9c6f7a044b 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -4099,6 +4099,60 @@ INSTANTIATE_TEST_SUITE_P(
std::make_tuple("fixed_length_decimal_legacy.parquet",
::arrow::decimal(13, 2)),
std::make_tuple("byte_array_decimal.parquet", ::arrow::decimal(4,
2))));
+// Reads the Float16 sample files from the test data directory and checks the
+// decoded contents. Every file must yield exactly one HALF_FLOAT column held in
+// a single chunk whose first slot is null; the remaining slots are compared, in
+// order, against the expected values listed for that file.
+TEST(TestArrowReaderAdHoc, ReadFloat16Files) {
+  using ::arrow::util::Float16;
+  constexpr auto nan = std::numeric_limits<Float16>::quiet_NaN();
+
+  struct TestCase {
+    std::string filename;       // file stem, without directory or extension
+    int32_t len;                // expected row count, including the leading null
+    std::vector<Float16> vals;  // expected values for rows 1 .. len - 1
+  };
+  const TestCase test_cases[] = {
+      {"float16_nonzeros_and_nans",
+       8,
+       {Float16(+1.0), Float16(-2.0), nan, Float16(+0.0), Float16(-1.0), Float16(-0.0),
+        Float16(+2.0)}},
+      {"float16_zeros_and_nans", 3, {Float16(+0.0), nan}},
+  };
+
+  auto* const memory_pool = ::arrow::default_memory_pool();
+
+  for (const auto& tc : test_cases) {
+    const std::string path =
+        std::string(test::get_data_dir()) + "/" + tc.filename + ".parquet";
+    ARROW_SCOPED_TRACE("path = ", path);
+
+    // Open the file and materialize it as an Arrow table.
+    std::unique_ptr<FileReader> file_reader;
+    ASSERT_OK_NO_THROW(FileReader::Make(
+        memory_pool, ParquetFileReader::OpenFile(path, false), &file_reader));
+    std::shared_ptr<::arrow::Table> tbl;
+    ASSERT_OK_NO_THROW(file_reader->ReadTable(&tbl));
+
+    // The inferred Arrow schema must be a single half-float field.
+    std::shared_ptr<::arrow::Schema> arrow_schema;
+    ASSERT_OK_NO_THROW(file_reader->GetSchema(&arrow_schema));
+    ASSERT_EQ(1, arrow_schema->num_fields());
+    ASSERT_EQ(arrow_schema->field(0)->type()->id(), ::arrow::Type::HALF_FLOAT);
+
+    ASSERT_EQ(1, tbl->num_columns());
+    const auto col = tbl->column(0);
+    ASSERT_EQ(tc.len, col->length());
+    ASSERT_EQ(1, col->num_chunks());
+
+    const auto values = checked_pointer_cast<::arrow::HalfFloatArray>(col->chunk(0));
+    ASSERT_TRUE(values->IsNull(0));
+    // Row 0 is the null slot, so expected value k lives at row k + 1.
+    for (int32_t row = 1; row < tc.len; ++row) {
+      const Float16 expected = tc.vals[row - 1];
+      const Float16 actual = Float16::FromBits(values->Value(row));
+      if (!expected.is_nan()) {
+        ASSERT_EQ(expected.bits(), actual.bits());
+        continue;
+      }
+      // NaN representations aren't guaranteed to be exact on a binary level
+      ASSERT_TRUE(actual.is_nan());
+    }
+  }
+}
+
// direct-as-possible translation of
// pyarrow/tests/test_parquet.py::test_validate_schema_write_table
TEST(TestArrowWriterAdHoc, SchemaMismatch) {
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index e45cd23f78..89b685a64c 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit e45cd23f784aab3d6bf0701f8f4e621469ed3be7
+Subproject commit 89b685a64c3117b3023d8684af1f41400841db71
diff --git a/go/parquet/pqarrow/file_reader_test.go
b/go/parquet/pqarrow/file_reader_test.go
index 9c1b4252f5..0c52eec9e3 100644
--- a/go/parquet/pqarrow/file_reader_test.go
+++ b/go/parquet/pqarrow/file_reader_test.go
@@ -29,6 +29,7 @@ import (
"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/array"
"github.com/apache/arrow/go/v15/arrow/decimal128"
+ "github.com/apache/arrow/go/v15/arrow/float16"
"github.com/apache/arrow/go/v15/arrow/memory"
"github.com/apache/arrow/go/v15/parquet"
"github.com/apache/arrow/go/v15/parquet/file"
@@ -100,6 +101,72 @@ func TestArrowReaderAdHocReadDecimals(t *testing.T) {
}
}
+func TestArrowReaderAdHocReadFloat16s(t *testing.T) {
+ tests := []struct {
+ file string
+ len int
+ vals []float16.Num
+ }{
+ {"float16_nonzeros_and_nans", 8,
+ []float16.Num{
+ float16.New(1.0),
+ float16.New(-2.0),
+ float16.NaN(),
+ float16.New(0.0),
+ float16.New(-1.0),
+ float16.New(0.0).Negate(),
+ float16.New(2.0),
+ }},
+ {"float16_zeros_and_nans", 3,
+ []float16.Num{
+ float16.New(0.0),
+ float16.NaN(),
+ }},
+ }
+
+ dataDir := getDataDir()
+ for _, tt := range tests {
+ t.Run(tt.file, func(t *testing.T) {
+ mem :=
memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(t, 0)
+
+ filename := filepath.Join(dataDir, tt.file+".parquet")
+ require.FileExists(t, filename)
+
+ rdr, err := file.OpenParquetFile(filename, false,
file.WithReadProps(parquet.NewReaderProperties(mem)))
+ require.NoError(t, err)
+ defer rdr.Close()
+
+ arrowRdr, err := pqarrow.NewFileReader(rdr,
pqarrow.ArrowReadProperties{}, mem)
+ require.NoError(t, err)
+
+ tbl, err := arrowRdr.ReadTable(context.Background())
+ require.NoError(t, err)
+ defer tbl.Release()
+
+ assert.EqualValues(t, 1, tbl.NumCols())
+ assert.Truef(t,
arrow.TypeEqual(tbl.Schema().Field(0).Type, &arrow.Float16Type{}), "expected:
%s\ngot: %s", tbl.Schema().Field(0).Type, arrow.Float16Type{})
+
+ valCol := tbl.Column(0)
+ assert.EqualValues(t, tt.len, valCol.Len())
+ assert.Len(t, valCol.Data().Chunks(), 1)
+
+ chunk := valCol.Data().Chunk(0).(*array.Float16)
+ assert.True(t, chunk.IsNull(0))
+ for i := 0; i < tt.len-1; i++ {
+ expected := tt.vals[i]
+ actual := chunk.Value(i + 1)
+ if expected.IsNaN() {
+ // NaN representations aren't
guaranteed to be exact on a binary level
+ assert.True(t, actual.IsNaN())
+ } else {
+ assert.Equal(t, expected.Uint16(),
actual.Uint16())
+ }
+ }
+ })
+ }
+}
+
func TestRecordReaderParallel(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
defer mem.AssertSize(t, 0)