[
https://issues.apache.org/jira/browse/ARROW-12054?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17307041#comment-17307041
]
Taras Kuzyo commented on ARROW-12054:
-------------------------------------
Actually, it is even weirder:
{code:python}
import decimal
import pyarrow as pa
import pyarrow.parquet as pq
def pad(b: bytes) -> bytes:
    """Sign-extend a little-endian two's-complement integer to 16 bytes.

    The input is little-endian, so the most significant byte is the last
    one and the padding is appended on the right: ``0x00`` bytes when the
    sign bit is clear, ``0xff`` bytes when it is set.

    An empty input is treated as zero and yields 16 zero bytes. Inputs that
    are already 16 bytes or longer are returned unchanged.
    """
    # The sign bit is the top bit of the most significant (last) byte.
    is_negative = bool(b) and (b[-1] & 0x80) != 0
    fill = b'\xff' if is_negative else b'\x00'
    return b.ljust(16, fill)
def to_pyarrow_bytes(b: bytes) -> bytes:
    """Convert a Parquet decimal statistic to Arrow's 16-byte layout.

    Reverses the byte order (big-endian, Parquet's representation, to
    little-endian, Arrow's representation) and then passes the result
    through ``pad`` to bring it to 16 bytes.
    """
    little_endian = b[::-1]
    return pad(little_endian)
def decode_stats_decimal(b, precision=12, scale=4):
    """Decode a raw Parquet decimal statistic into a Python value.

    Parameters
    ----------
    b : bytes
        Raw statistic bytes (big-endian, as stored by Parquet).
    precision, scale : int
        Parameters of the ``pa.decimal128`` type used for decoding. The
        defaults (12, 4) match the columns written by this script.

    Returns
    -------
    The single decoded element, converted with ``as_py()``.
    """
    pyarrow_bytes = to_pyarrow_bytes(b)
    # Wrap the 16 bytes as a one-element decimal128 array: no validity
    # buffer (None) and a null count of 0.
    arr = pa.Array.from_buffers(
        pa.decimal128(precision, scale),
        1,
        [None, pa.py_buffer(pyarrow_bytes)],
        0,
    )
    return arr[0].as_py()
# Build four decimal128(12, 4) columns of 20 values each, write them to a
# Parquet file, then compare the min/max stored in the column statistics
# with the min/max computed from the data actually read back.
context = decimal.Context(prec=12)

numbers = [context.create_decimal(x/10) for x in range(10, 30)]
a = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x - 10) for x in range(10, 30)]
b = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 20) for x in range(10, 30)]
c = pa.array(numbers, pa.decimal128(12, 4))
numbers = [context.create_decimal(x % 10) for x in range(10, 30)]
d = pa.array(numbers, pa.decimal128(12, 4))

table = pa.Table.from_arrays([a, b, c, d], ['A', 'B', 'C', 'D'])
with pq.ParquetWriter('test.parquet', table.schema) as writer:
    writer.write_table(table)

reader = pq.ParquetFile('test.parquet')
for rowgroup in range(reader.num_row_groups):
    meta = reader.metadata.row_group(rowgroup)
    for i in range(meta.num_columns):
        print(f'Column {i}')
        # Raw statistics as recorded in the file metadata; they are fed to
        # decode_stats_decimal below, which expects bytes.
        raw_min = meta.column(i).statistics.min
        raw_max = meta.column(i).statistics.max
        # Read the same row group the statistics belong to (the original
        # always read row group 0, which is only right for a single group).
        values = reader.read_row_group(rowgroup).column(i).to_pandas()
        actual_min, actual_max = values.min(), values.max()
        print(f'Min decoded: {decode_stats_decimal(raw_min)} '
              f'Min actual: {actual_min}')
        print(f'Max decoded: {decode_stats_decimal(raw_max)} '
              f'Max actual: {actual_max}')
{code}
The output:
{noformat}
Column 0
Min decoded: 1.0000 Min actual: 1.0000
Max decoded: 2.9000 Max actual: 2.9000
Column 1
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 2
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 16.0000 Max actual: 19.0000
Column 3
Min decoded: 4.0000 Min actual: 0.0000
Max decoded: 9.0000 Max actual: 9.0000{noformat}
> [C++] Parquet statistics incorrect for decimal128
> -------------------------------------------------
>
> Key: ARROW-12054
> URL: https://issues.apache.org/jira/browse/ARROW-12054
> Project: Apache Arrow
> Issue Type: Bug
> Components: C++
> Affects Versions: 3.0.0
> Reporter: Weston Pace
> Priority: Major
>
> {code:java}
> import decimal
> import pyarrow as pa
> import pyarrow.parquet as pq
> dtype = pa.decimal128(12, 4)
> ctx = decimal.Context(prec=12)
> arr = pa.array([0, ctx.create_decimal(3.99)], dtype)
> table = pa.Table.from_arrays([arr], ["foo"])
> pq.write_table(table, '/tmp/foo.pq')
> meta = pq.read_metadata('/tmp/foo.pq')
> print(meta.row_group(0).column(0).statistics)
> {code}
> Expected 0 to be the min and 3.99 to be the max but got the reverse.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)