This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push:
new 5b22ccd0 fix(java): Move schema caching to unsafe trait to avoid
issues when using non-inferred schema. (#1944)
5b22ccd0 is described below
commit 5b22ccd035ea80f453a7483b407ced6a5401a738
Author: Yi Wen Wong <[email protected]>
AuthorDate: Tue Nov 19 08:23:39 2024 -0800
fix(java): Move schema caching to unsafe trait to avoid issues when using
non-inferred schema. (#1944)
## What does this PR do?
This PR removes the java specific `ExtField` class from the schema and
moves the extData mechanism to the internal UnsafeTrait class. This is
necessary because `ExtField` is only created internally from
`inferSchema` method used by java, and we would potentially import or
derive schemas from other sources (e.g XLANG, handwritten schema,
arrow-native source) -- those schemas will be incompatible when we
attempt to retrieve the cached schema. Removing schema-caching will fix
the issue, but create allocations, so after some discussion, we decided
to move the mechanism to the internal UnsafeTrait class.
This implementation makes changes internal API:
- Derived classes from `UnsafeTrait` need to initialize the `extData`
cache and define the number of extData slots needed.
- The internal `getStruct` method needs to define which slot we use to
retrieve `extData`.
Other:
- REVERTED: pom.xml for fury-format will automatically run tests with
appropriate --add-opens flag for arrow
## Related issues
<!--
Is there any related issue? Please attach here.
- #xxxx0
- #xxxx1
- #xxxx2
-->
## Does this PR introduce any user-facing change?
N/A
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->
---
.../apache/fury/format/row/binary/BinaryArray.java | 3 +-
.../apache/fury/format/row/binary/BinaryRow.java | 3 +-
.../apache/fury/format/row/binary/UnsafeTrait.java | 21 ++++++++++++--
.../org/apache/fury/format/type/DataTypes.java | 33 +++-------------------
4 files changed, 27 insertions(+), 33 deletions(-)
diff --git
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryArray.java
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryArray.java
index 3b1c5cb9..fdae1a73 100644
---
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryArray.java
+++
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryArray.java
@@ -63,6 +63,7 @@ public class BinaryArray extends UnsafeTrait implements
ArrayData {
} else {
this.elementSize = width;
}
+ initializeExtData(1); // Only require at most one slot to cache the schema
for array type.
}
public void pointTo(MemoryBuffer buffer, int offset, int sizeInBytes) {
@@ -135,7 +136,7 @@ public class BinaryArray extends UnsafeTrait implements
ArrayData {
@Override
public BinaryRow getStruct(int ordinal) {
- return getStruct(ordinal, field.getChildren().get(0));
+ return getStruct(ordinal, field.getChildren().get(0), 0);
}
@Override
diff --git
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryRow.java
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryRow.java
index 59d95d42..8303faea 100644
---
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryRow.java
+++
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/BinaryRow.java
@@ -73,6 +73,7 @@ public class BinaryRow extends UnsafeTrait implements Row {
this.numFields = schema.getFields().size();
Preconditions.checkArgument(numFields > 0);
this.bitmapWidthInBytes = BitUtils.calculateBitmapWidthInBytes(numFields);
+ initializeExtData(numFields);
}
public void pointTo(MemoryBuffer buffer, int offset, int sizeInBytes) {
@@ -155,7 +156,7 @@ public class BinaryRow extends UnsafeTrait implements Row {
@Override
public BinaryRow getStruct(int ordinal) {
- return getStruct(ordinal, schema.getFields().get(ordinal));
+ return getStruct(ordinal, schema.getFields().get(ordinal), ordinal);
}
@Override
diff --git
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/UnsafeTrait.java
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/UnsafeTrait.java
index 0b7d1d5f..a13eca4e 100644
---
a/java/fury-format/src/main/java/org/apache/fury/format/row/binary/UnsafeTrait.java
+++
b/java/fury-format/src/main/java/org/apache/fury/format/row/binary/UnsafeTrait.java
@@ -26,6 +26,7 @@ import java.nio.charset.StandardCharsets;
import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.DecimalUtility;
import org.apache.fury.format.row.Getters;
import org.apache.fury.format.row.Setters;
@@ -35,6 +36,7 @@ import org.apache.fury.memory.MemoryBuffer;
/** Internal to binary row format to reuse code, don't use it in anywhere
else. */
abstract class UnsafeTrait implements Getters, Setters {
+ private Object[] extData;
abstract MemoryBuffer getBuffer();
@@ -55,6 +57,10 @@ abstract class UnsafeTrait implements Getters, Setters {
abstract int getOffset(int ordinal);
+ void initializeExtData(int numSlots) {
+ extData = new Object[numSlots];
+ }
+
// ###########################################################
// ####################### getters #######################
// ###########################################################
@@ -143,14 +149,25 @@ abstract class UnsafeTrait implements Getters, Setters {
return decimal;
}
- BinaryRow getStruct(int ordinal, Field field) {
+ /**
+ * Gets the field at a specific ordinal as a struct.
+ *
+ * @param ordinal the ordinal position of this field.
+ * @param field the Arrow field corresponding to this struct.
+ * @param extDataSlot the ext data slot used to cache the schema for the
struct.
+ * @return the binary row representation of the struct.
+ */
+ BinaryRow getStruct(int ordinal, Field field, int extDataSlot) {
if (isNullAt(ordinal)) {
return null;
}
final long offsetAndSize = getInt64(ordinal);
final int relativeOffset = (int) (offsetAndSize >> 32);
final int size = (int) offsetAndSize;
- BinaryRow row = new BinaryRow(DataTypes.createSchema(field));
+ if (extData[extDataSlot] == null) {
+ extData[extDataSlot] = DataTypes.createSchema(field);
+ }
+ BinaryRow row = new BinaryRow((Schema) extData[extDataSlot]);
row.pointTo(getBuffer(), getBaseOffset() + relativeOffset, size);
return row;
}
diff --git
a/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java
b/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java
index 383be40c..3bb83c40 100644
--- a/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java
+++ b/java/fury-format/src/main/java/org/apache/fury/format/type/DataTypes.java
@@ -314,7 +314,7 @@ public class DataTypes {
}
public static Field field(String name, FieldType fieldType, List<Field>
children) {
- return new ExtField(name, fieldType, children);
+ return new Field(name, fieldType, children);
}
public static Field notNullField(String name, ArrowType type, Field...
children) {
@@ -396,19 +396,11 @@ public class DataTypes {
}
public static Field keyFieldForMap(Field mapField) {
- Field field = mapField.getChildren().get(0).getChildren().get(0);
- if (field.getClass() != ExtField.class) {
- return new ExtField(field.getName(), field.getFieldType(),
field.getChildren());
- }
- return field;
+ return mapField.getChildren().get(0).getChildren().get(0);
}
public static Field itemFieldForMap(Field mapField) {
- Field field = mapField.getChildren().get(0).getChildren().get(1);
- if (field.getClass() != ExtField.class) {
- return new ExtField(field.getName(), field.getFieldType(),
field.getChildren());
- }
- return field;
+ return mapField.getChildren().get(0).getChildren().get(1);
}
public static Field keyArrayFieldForMap(Field mapField) {
@@ -425,24 +417,7 @@ public class DataTypes {
}
public static Schema createSchema(Field field) {
- if (field.getClass() != ExtField.class) {
- throw new IllegalArgumentException(
- String.format("Field %s got wrong type %s", field,
field.getClass()));
- }
- ExtField extField = (ExtField) field;
- Object extData = extField.extData;
- if (extData == null) {
- extField.extData = extData = new Schema(field.getChildren(),
field.getMetadata());
- }
- return (Schema) extData;
- }
-
- static class ExtField extends Field {
- Object extData;
-
- public ExtField(String name, FieldType fieldType, List<Field> children) {
- super(name, fieldType, children);
- }
+ return new Schema(field.getChildren(), field.getMetadata());
}
public static Field structField(boolean nullable, Field... fields) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]