This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
new e021734 PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND and TimeUnit.NANOSECOND of Arrow (#469)
e021734 is described below
commit e021734b62ea5ac273e516b4ac83727cbb99ec08
Author: Masayuki Takahashi <[email protected]>
AuthorDate: Mon May 7 17:11:58 2018 +0900
PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND and TimeUnit.NANOSECOND of Arrow (#469)
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Arrow's 'Time' type is defined as follows:
{ "name" : "time", "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND",
"bitWidth": /* integer: 32 or 64 */ }
http://arrow.apache.org/docs/metadata.html
However, Parquet only supports 'TIME_MILLIS' and 'TIME_MICROS' for time values.
https://github.com/Apache/parquet-format/blob/master/LogicalTypes.md
Therefore SchemaConverter should not convert Arrow Time types with
TimeUnit.SECOND or TimeUnit.NANOSECOND to Parquet.
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Since the import statements had been reorganized, I restored them.
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Remove unnecessary updates.
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Remove unnecessary fully qualified package names
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Add a conversion pattern from Parquet's TIME_MICROS to Arrow's MICROSECOND
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Fix the position of the `expected` argument in assertEquals
Author: Masayuki Takahashi <[email protected]>
* PARQUET-1285: [Java] SchemaConverter should not convert from
TimeUnit.SECOND AND TimeUnit.NANOSECOND of Arrow
Add a test to convert from Parquet's TIME_MICROS to Arrow's MICROSECOND
Author: Masayuki Takahashi <[email protected]>
---
.../parquet/arrow/schema/SchemaConverter.java | 24 +++++--
.../parquet/arrow/schema/TestSchemaConverter.java | 79 ++++++++++++++++++++--
2 files changed, 89 insertions(+), 14 deletions(-)
diff --git a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
index 1d69c45..f298558 100644
--- a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
+++ b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
@@ -28,6 +28,7 @@ import static org.apache.parquet.schema.OriginalType.INT_64;
import static org.apache.parquet.schema.OriginalType.INT_8;
import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
import static org.apache.parquet.schema.OriginalType.UINT_16;
import static org.apache.parquet.schema.OriginalType.UINT_32;
import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -49,6 +50,7 @@ import java.util.List;
import org.apache.arrow.vector.types.DateUnit;
import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.TimeUnit;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor;
import org.apache.arrow.vector.types.pojo.ArrowType.Binary;
@@ -245,7 +247,14 @@ public class SchemaConverter {
@Override
public TypeMapping visit(Time type) {
- return primitive(INT32, TIME_MILLIS);
+ int bitWidth = type.getBitWidth();
+ TimeUnit timeUnit = type.getUnit();
+ if (bitWidth == 32 && timeUnit == TimeUnit.MILLISECOND) {
+ return primitive(INT32, TIME_MILLIS);
+ } else if (bitWidth == 64 && timeUnit == TimeUnit.MICROSECOND) {
+ return primitive(INT64, TIME_MICROS);
+ }
+ throw new UnsupportedOperationException("Unsupported type " + type);
}
@Override
@@ -407,11 +416,11 @@ public class SchemaConverter {
case DATE:
return field(new ArrowType.Date(DateUnit.DAY));
case TIMESTAMP_MICROS:
- return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
case TIMESTAMP_MILLIS:
- return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
case TIME_MILLIS:
- return field(new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32));
+ return field(new ArrowType.Time(TimeUnit.MILLISECOND, 32));
default:
case TIME_MICROS:
case INT_64:
@@ -456,11 +465,12 @@ public class SchemaConverter {
case DATE:
return field(new ArrowType.Date(DateUnit.DAY));
case TIMESTAMP_MICROS:
- return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MICROSECOND, "UTC"));
+ return field(new ArrowType.Timestamp(TimeUnit.MICROSECOND, "UTC"));
case TIMESTAMP_MILLIS:
- return field(new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"));
- default:
+ return field(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"));
case TIME_MICROS:
+ return field(new ArrowType.Time(TimeUnit.MICROSECOND, 64));
+ default:
case UTF8:
case ENUM:
case BSON:
diff --git a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
index 654f773..4c3da35 100644
--- a/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
+++ b/parquet-arrow/src/test/java/org/apache/parquet/arrow/schema/TestSchemaConverter.java
@@ -28,6 +28,7 @@ import static org.apache.parquet.schema.OriginalType.INT_64;
import static org.apache.parquet.schema.OriginalType.INT_8;
import static org.apache.parquet.schema.OriginalType.TIMESTAMP_MILLIS;
import static org.apache.parquet.schema.OriginalType.TIME_MILLIS;
+import static org.apache.parquet.schema.OriginalType.TIME_MICROS;
import static org.apache.parquet.schema.OriginalType.UINT_16;
import static org.apache.parquet.schema.OriginalType.UINT_32;
import static org.apache.parquet.schema.OriginalType.UINT_64;
@@ -43,11 +44,12 @@ import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import java.io.IOException;
import java.util.List;
-import org.apache.arrow.vector.types.IntervalUnit;
-import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.DateUnit;
import org.apache.arrow.vector.types.FloatingPointPrecision;
+import org.apache.arrow.vector.types.IntervalUnit;
+import org.apache.arrow.vector.types.TimeUnit;
+import org.apache.arrow.vector.types.UnionMode;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
@@ -86,7 +88,7 @@ public class TestSchemaConverter {
field("e", new ArrowType.List(), field(null, new ArrowType.Date(DateUnit.DAY))),
field("f", new ArrowType.FixedSizeList(1), field(null, new ArrowType.Date(DateUnit.DAY))),
field("g", new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)),
- field("h", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")),
+ field("h", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
field("i", new ArrowType.Interval(IntervalUnit.DAY_TIME))
));
private final MessageType complexParquetSchema = Types.buildMessage()
@@ -129,11 +131,12 @@ public class TestSchemaConverter {
field("k1", new ArrowType.Decimal(15, 5)),
field("k2", new ArrowType.Decimal(25, 5)),
field("l", new ArrowType.Date(DateUnit.DAY)),
- field("m", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.SECOND, 32)),
- field("n", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC")),
+ field("m", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+ field("n", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")),
field("o", new ArrowType.Interval(IntervalUnit.DAY_TIME)),
field("o1", new ArrowType.Interval(IntervalUnit.YEAR_MONTH))
));
+
private final MessageType allTypesParquetSchema = Types.buildMessage()
.addField(Types.optional(BINARY).named("a"))
.addField(Types.optionalGroup()
@@ -191,8 +194,8 @@ public class TestSchemaConverter {
field("j1", new ArrowType.Decimal(15, 5)),
field("j2", new ArrowType.Decimal(25, 5)),
field("k", new ArrowType.Date(DateUnit.DAY)),
- field("l", new ArrowType.Time(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, 32)),
- field("m", new ArrowType.Timestamp(org.apache.arrow.vector.types.TimeUnit.MILLISECOND, "UTC"))
+ field("l", new ArrowType.Time(TimeUnit.MILLISECOND, 32)),
+ field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC"))
));
private final MessageType supportedTypesParquetSchema = Types.buildMessage()
@@ -348,4 +351,66 @@ public class TestSchemaConverter {
SchemaMapping map = converter.map(paperArrowSchema, Paper.schema);
Assert.assertEquals("p, s<r<p>, r<p>>, r<s<r<s<p, p>>, p>>", toSummaryString(map));
}
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void testArrowTimeSecondToParquet() {
+ converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.SECOND, 32))
+ ))).getParquetSchema();
+ }
+
+ @Test
+ public void testArrowTimeMillisecondToParquet() {
+ MessageType expected = converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+ ))).getParquetSchema();
+ Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root"));
+ }
+
+ @Test
+ public void testArrowTimeMicrosecondToParquet() {
+ MessageType expected = converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+ ))).getParquetSchema();
+ Assert.assertEquals(expected, Types.buildMessage().addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root"));
+ }
+
+ @Test(expected = UnsupportedOperationException.class)
+ public void testArrowTimeNanosecondToParquet() {
+ converter.fromArrow(new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.NANOSECOND, 64))
+ ))).getParquetSchema();
+ }
+
+ @Test
+ public void testParquetInt32TimeMillisToArrow() {
+ MessageType parquet = Types.buildMessage()
+ .addField(Types.optional(INT32).as(TIME_MILLIS).named("a")).named("root");
+ Schema expected = new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MILLISECOND, 32))
+ ));
+ Assert.assertEquals(expected, converter.fromParquet(parquet).getArrowSchema());
+ }
+
+ @Test
+ public void testParquetInt64TimeMicrosToArrow() {
+ MessageType parquet = Types.buildMessage()
+ .addField(Types.optional(INT64).as(TIME_MICROS).named("a")).named("root");
+ Schema expected = new Schema(asList(
+ field("a", new ArrowType.Time(TimeUnit.MICROSECOND, 64))
+ ));
+ Assert.assertEquals(expected, converter.fromParquet(parquet).getArrowSchema());
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testParquetInt64TimeMillisToArrow() {
+ converter.fromParquet(Types.buildMessage()
+ .addField(Types.optional(INT64).as(TIME_MILLIS).named("a")).named("root"));
+ }
+
+ @Test(expected = IllegalStateException.class)
+ public void testParquetInt32TimeMicrosToArrow() {
+ converter.fromParquet(Types.buildMessage()
+ .addField(Types.optional(INT32).as(TIME_MICROS).named("a")).named("root"));
+ }
}
--
To stop receiving notification emails like this one, please contact
[email protected].