>From Peeyush Gupta <[email protected]>:
Peeyush Gupta has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19047 )
Change subject: wip: use delta kernel api instead of delta standalone
......................................................................
wip: use delta kernel api instead of delta standalone
Change-Id: I0017e63ac0bddcfa0b342d9380d55934a76c12ec
---
M asterixdb/asterix-external-data/pom.xml
M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/delta/AwsS3DeltaReaderFactory.java
M asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
M asterixdb/pom.xml
M hyracks-fullstack/pom.xml
5 files changed, 46 insertions(+), 12 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/47/19047/1
diff --git a/asterixdb/asterix-external-data/pom.xml b/asterixdb/asterix-external-data/pom.xml
index 8c8ad10..a388054 100644
--- a/asterixdb/asterix-external-data/pom.xml
+++ b/asterixdb/asterix-external-data/pom.xml
@@ -579,6 +579,16 @@
</dependency>
<dependency>
<groupId>io.delta</groupId>
+ <artifactId>delta-kernel-api</artifactId>
+ <version>3.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>io.delta</groupId>
+ <artifactId>delta-kernel-defaults</artifactId>
+ <version>3.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>io.delta</groupId>
<artifactId>delta-standalone_2.12</artifactId>
<version>3.0.0</version>
</dependency>
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/delta/AwsS3DeltaReaderFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/delta/AwsS3DeltaReaderFactory.java
index f5a2cd5..a0d1e49 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/delta/AwsS3DeltaReaderFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/delta/AwsS3DeltaReaderFactory.java
@@ -47,6 +47,9 @@
Configuration conf = new Configuration();
conf.set(S3Constants.HADOOP_ACCESS_KEY_ID,
configuration.get(S3Constants.ACCESS_KEY_ID_FIELD_NAME));
conf.set(S3Constants.HADOOP_SECRET_ACCESS_KEY,
configuration.get(S3Constants.SECRET_ACCESS_KEY_FIELD_NAME));
+ if (configuration.containsKey(S3Constants.SESSION_TOKEN_FIELD_NAME)) {
+ conf.set(S3Constants.HADOOP_SESSION_TOKEN,
configuration.get(S3Constants.SESSION_TOKEN_FIELD_NAME));
+ }
conf.set(S3Constants.HADOOP_REGION,
configuration.get(S3Constants.REGION_FIELD_NAME));
String serviceEndpoint =
configuration.get(SERVICE_END_POINT_FIELD_NAME);
if (serviceEndpoint != null) {
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index c362f75..732f3c5 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -109,9 +109,13 @@
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;
-import io.delta.standalone.DeltaLog;
-import io.delta.standalone.Snapshot;
-import io.delta.standalone.actions.AddFile;
+import io.delta.kernel.Scan;
+import io.delta.kernel.Snapshot;
+import io.delta.kernel.data.FilteredColumnarBatch;
+import io.delta.kernel.data.Row;
+import io.delta.kernel.defaults.engine.DefaultEngine;
+import io.delta.kernel.engine.Engine;
+import io.delta.kernel.utils.CloseableIterator;
public class ExternalDataUtils {
private static final Map<ATypeTag, IValueParserFactory>
valueParserFactoryMap = new EnumMap<>(ATypeTag.class);
@@ -503,14 +507,22 @@
public static void prepareDeltaTableFormat(Map<String, String>
configuration, Configuration conf,
String tableMetadataPath) {
- DeltaLog deltaLog = DeltaLog.forTable(conf, tableMetadataPath);
- Snapshot snapshot = deltaLog.snapshot();
- List<AddFile> dataFiles = snapshot.getAllFiles();
+ Engine engine = DefaultEngine.create(conf);
+ io.delta.kernel.Table table = io.delta.kernel.Table.forPath(engine,
tableMetadataPath);
+ Snapshot snapshot = table.getLatestSnapshot(engine);
+ Scan scan = snapshot.getScanBuilder(engine).withReadSchema(engine,
snapshot.getSchema(engine)).build();
+ CloseableIterator<FilteredColumnarBatch> iter =
scan.getScanFiles(engine);
StringBuilder builder = new StringBuilder();
- for (AddFile batchFile : dataFiles) {
- builder.append(",");
- String path = batchFile.getPath();
- builder.append(tableMetadataPath).append('/').append(path);
+ while (iter.hasNext()) {
+ FilteredColumnarBatch batch = iter.next();
+ CloseableIterator<Row> rowIter = batch.getRows();
+ while (rowIter.hasNext()) {
+ Row row = rowIter.next();
+ Row addFile = row.getStruct(0);
+ builder.append(",");
+ String path = addFile.getString(0);
+ builder.append(tableMetadataPath).append('/').append(path);
+ }
}
if (builder.length() > 0) {
builder.deleteCharAt(0);
diff --git a/asterixdb/pom.xml b/asterixdb/pom.xml
index 94bd6e1..e8b27c9 100644
--- a/asterixdb/pom.xml
+++ b/asterixdb/pom.xml
@@ -91,7 +91,7 @@
<!-- Versions under dependencymanagement or used in many projects via
properties -->
<algebricks.version>0.3.10-SNAPSHOT</algebricks.version>
<hyracks.version>0.3.10-SNAPSHOT</hyracks.version>
- <hadoop.version>3.3.6</hadoop.version>
+ <hadoop.version>3.3.4</hadoop.version>
<jacoco.version>0.7.6.201602180812</jacoco.version>
<log4j.version>2.22.1</log4j.version>
<awsjavasdk.version>2.24.9</awsjavasdk.version>
diff --git a/hyracks-fullstack/pom.xml b/hyracks-fullstack/pom.xml
index 20c3e21..473f779d 100644
--- a/hyracks-fullstack/pom.xml
+++ b/hyracks-fullstack/pom.xml
@@ -69,7 +69,7 @@
<test.includes>${global.test.includes}</test.includes>
<test.excludes>${global.test.excludes}</test.excludes>
<!-- Versions under dependencymanagement or used in many projects via
properties -->
- <hadoop.version>3.3.6</hadoop.version>
+ <hadoop.version>3.3.4</hadoop.version>
<jacoco.version>0.7.6.201602180812</jacoco.version>
<log4j.version>2.22.1</log4j.version>
<snappy.version>1.1.10.5</snappy.version>
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19047
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: I0017e63ac0bddcfa0b342d9380d55934a76c12ec
Gerrit-Change-Number: 19047
Gerrit-PatchSet: 1
Gerrit-Owner: Peeyush Gupta <[email protected]>
Gerrit-MessageType: newchange