This is an automated email from the ASF dual-hosted git repository. ihuzenko pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit df48eb8167219b7c632e8f6d760e76242805d6af Author: Arina Ielchiieva <[email protected]> AuthorDate: Wed Jul 31 14:04:32 2019 +0300 DRILL-7334: Update Iceberg Metastore Parquet write mode closes #1832 --- metastore/iceberg-metastore/README.md | 19 ++++++++++--------- metastore/iceberg-metastore/pom.xml | 2 +- .../metastore/iceberg/write/ParquetFileWriter.java | 7 ++++--- .../src/main/resources/drill-metastore-module.conf | 9 ++++----- .../iceberg/write/TestParquetFileWriter.java | 8 +++----- 5 files changed, 22 insertions(+), 23 deletions(-) diff --git a/metastore/iceberg-metastore/README.md b/metastore/iceberg-metastore/README.md index cc46c2a..3db08d1 100644 --- a/metastore/iceberg-metastore/README.md +++ b/metastore/iceberg-metastore/README.md @@ -56,16 +56,18 @@ Assume Iceberg table location is `/drill/metastore/iceberg/tables`, metadata for ### Metadata Storage Format -By default, Metadata will be stored in Parquet files. -Each parquet file will hold information for one partition. +Iceberg tables support data storage in three formats: Parquet, Avro, ORC. +Drill metadata will be stored in Parquet files. This format was chosen over others +since it is column-oriented and efficient in terms of disk I/O +when specific columns need to be queried. + +Each Parquet file will hold information for one partition. Partition keys will depend on Metastore component characteristics. For example, for `tables` component, partition keys are storage plugin, workspace, table name and metadata key. Parquet file names will be based on `UUID` to ensure uniqueness. - -Iceberg also supports data storage in Avro and ORC files, writing metadata -in these formats can be added later. +If a collision somehow occurs, the modify operation in Metastore will fail. ## Metastore Operations flow @@ -73,8 +75,7 @@ Metastore main goal is to provide ability to read and modify metadata. ### Read -Metastore data is read using `IcebergGenerics#read`.
Iceberg will automatically determine -format in which data is stored (three formats are supported Parquet, Avro, ORC). +Metastore data is read using `IcebergGenerics#read`. Based on given filter and select columns list, data will be returned in `org.apache.iceberg.data.Record` format which will be transformed into the list of Metastore component units and returned to the caller. @@ -86,7 +87,7 @@ partition keys can be included into filter expression. ### Add To add metadata to Iceberg table, caller provides list of component units which -will be written into Parquet files (current default format) and grouped by partition keys. +will be written into Parquet files and grouped by partition keys. Each group will be written into separate Parquet file and stored in the location inside of Iceberg table based on component unit location keys. Note: partition keys must not be null. @@ -107,7 +108,7 @@ Parquet files with metadata for this table will be stored in `[METASTORE_ROOT_DIRECTORY]/[COMPONENT_LOCATION]/dfs/tmp/nation` folder. If `dfs.tmp.nation` is un-partitioned, its metadata will be stored in two -parquet files: one file with general table information, +Parquet files: one file with general table information, another file with default segment information. If `dfs.tmp.nation` is partitioned, it will also have one file with general information and `N` files with top-level segments information.
diff --git a/metastore/iceberg-metastore/pom.xml b/metastore/iceberg-metastore/pom.xml index f8544fd..d935750 100644 --- a/metastore/iceberg-metastore/pom.xml +++ b/metastore/iceberg-metastore/pom.xml @@ -33,7 +33,7 @@ <name>metastore/Drill Iceberg Metastore</name> <properties> - <iceberg.version>77a456a</iceberg.version> + <iceberg.version>08e0873</iceberg.version> <caffeine.version>2.7.0</caffeine.version> </properties> diff --git a/metastore/iceberg-metastore/src/main/java/org/apache/drill/metastore/iceberg/write/ParquetFileWriter.java b/metastore/iceberg-metastore/src/main/java/org/apache/drill/metastore/iceberg/write/ParquetFileWriter.java index 3c1604d..3286676 100644 --- a/metastore/iceberg-metastore/src/main/java/org/apache/drill/metastore/iceberg/write/ParquetFileWriter.java +++ b/metastore/iceberg-metastore/src/main/java/org/apache/drill/metastore/iceberg/write/ParquetFileWriter.java @@ -23,6 +23,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.exceptions.RuntimeIOException; import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; @@ -35,7 +36,7 @@ import java.util.Objects; /** * Parquet File Writer implementation. Creates Parquet file in given location and name * and '.parquet' extension and writes given data into it. - * Note: if file already exists, it will be overwritten. + * Note: if file already exists, write operation will fail. */ public class ParquetFileWriter implements FileWriter { @@ -83,7 +84,7 @@ public class ParquetFileWriter implements FileWriter { fileAppender.close(); // metrics are available only when file was written (i.e. 
close method was executed) return new File(outputFile, fileAppender.metrics()); - } catch (IOException | ClassCastException e) { + } catch (IOException | ClassCastException | RuntimeIOException e) { if (fileAppender != null) { try { fileAppender.close(); @@ -91,7 +92,7 @@ public class ParquetFileWriter implements FileWriter { // write has failed anyway, ignore closing exception if any and throw initial one } } - throw new IcebergMetastoreException("Unable to write data into parquet file", e); + throw new IcebergMetastoreException(String.format("Unable to write data into parquet file [%s]", outputFile.location()), e); } } } diff --git a/metastore/iceberg-metastore/src/main/resources/drill-metastore-module.conf b/metastore/iceberg-metastore/src/main/resources/drill-metastore-module.conf index be98a97..33fe795 100644 --- a/metastore/iceberg-metastore/src/main/resources/drill-metastore-module.conf +++ b/metastore/iceberg-metastore/src/main/resources/drill-metastore-module.conf @@ -43,11 +43,10 @@ drill.metastore.iceberg: { } components: { - // Common properties for all Iceberg tables from org.apache.iceberg.TableProperties can be specified - common.properties: { - write.format.default: "parquet", - write.metadata.compression-codec: "none" - }, + // Common properties for all Iceberg tables from org.apache.iceberg.TableProperties can be specified + common.properties: { + write.metadata.compression-codec: "none" + }, tables: { // Iceberg table location in Iceberg Metastore diff --git a/metastore/iceberg-metastore/src/test/java/org/apache/drill/metastore/iceberg/write/TestParquetFileWriter.java b/metastore/iceberg-metastore/src/test/java/org/apache/drill/metastore/iceberg/write/TestParquetFileWriter.java index 6943185..4a9211a 100644 --- a/metastore/iceberg-metastore/src/test/java/org/apache/drill/metastore/iceberg/write/TestParquetFileWriter.java +++ b/metastore/iceberg-metastore/src/test/java/org/apache/drill/metastore/iceberg/write/TestParquetFileWriter.java @@ -261,15 
+261,13 @@ public class TestParquetFileWriter extends IcebergBaseTest { java.nio.file.Path file = Paths.get(new File(location, FileFormat.PARQUET.addExtension(fileName)).getPath()); Files.write(file, Collections.singletonList("abc")); - org.apache.drill.metastore.iceberg.write.File result = new ParquetFileWriter(table) + thrown.expect(IcebergMetastoreException.class); + + new ParquetFileWriter(table) .records(Collections.singletonList(record)) .location(location) .name(fileName) .write(); - - List<Record> rows = readData(result.input(), schema); - assertEquals(1, rows.size()); - assertEquals(1, rows.get(0).getField("int_field")); } private List<Record> readData(InputFile inputFile, Schema schema) throws IOException {
