This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-rust.git
The following commit(s) were added to refs/heads/main by this push:
new 9e1e8eb chore: Use nightly toolchain for check (#445)
9e1e8eb is described below
commit 9e1e8ebfd255f3903c4f406e38caef2653988706
Author: Renjie Liu <[email protected]>
AuthorDate: Tue Jul 9 21:22:54 2024 +0800
chore: Use nightly toolchain for check (#445)
* chore: Use nightly toolchain for check
* Fix check
* Fix clippy finds
* Make rustfmt happy
* Make rustfmt happy
* Update github actions
* Use action builder since apache doesn't allow external actions
* Fix comments
* Fix README.md
---
.github/actions/setup-builder/action.yml | 40 +++
.github/workflows/ci.yml | 22 +-
.github/workflows/publish.yml | 16 +-
Makefile | 29 +-
README.md | 9 +
crates/iceberg/src/avro/schema.rs | 332 ++++++++++-----------
.../src/writer/file_writer/parquet_writer.rs | 3 -
crates/integrations/datafusion/src/schema.rs | 2 +-
rust-toolchain.toml | 2 +-
9 files changed, 262 insertions(+), 193 deletions(-)
diff --git a/.github/actions/setup-builder/action.yml
b/.github/actions/setup-builder/action.yml
new file mode 100644
index 0000000..43de1cb
--- /dev/null
+++ b/.github/actions/setup-builder/action.yml
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is heavily inspired by
+#
[datafusion](https://github.com/apache/datafusion/blob/main/.github/actions/setup-builder/action.yaml).
+name: Prepare Rust Builder
+description: 'Prepare Rust Build Environment'
+inputs:
+ rust-version:
+ description: 'version of rust to install (e.g. stable)'
+ required: true
+ default: 'stable'
+runs:
+ using: "composite"
+ steps:
+ - name: Setup Rust toolchain
+ shell: bash
+ run: |
+ echo "Installing ${{ inputs.rust-version }}"
+ rustup toolchain install ${{ inputs.rust-version }}
+ rustup default ${{ inputs.rust-version }}
+ rustup component add rustfmt clippy
+ - name: Fixup git permissions
+ # https://github.com/actions/checkout/issues/766
+ shell: bash
+ run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 09f2317..f98f5e7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,6 +29,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
cancel-in-progress: true
+env:
+ rust_msrv: "1.77.1"
+
jobs:
check:
runs-on: ubuntu-latest
@@ -38,6 +41,12 @@ jobs:
- name: Check License Header
uses: apache/skywalking-eyes/[email protected]
+ - name: Install cargo-sort
+ run: make install-cargo-sort
+
+ - name: Install taplo-cli
+ run: make install-taplo-cli
+
- name: Cargo format
run: make check-fmt
@@ -63,8 +72,14 @@ jobs:
- windows-latest
steps:
- uses: actions/checkout@v4
+
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: ${{ env.rust_msrv }}
+
- name: Build
- run: cargo build
+ run: make build
build_with_no_default_features:
runs-on: ${{ matrix.os }}
@@ -84,6 +99,11 @@ jobs:
steps:
- uses: actions/checkout@v4
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: ${{ env.rust_msrv }}
+
- name: Test
run: cargo test --no-fail-fast --all-targets --all-features --workspace
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9289d3e..a57aa61 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,13 +21,11 @@ on:
push:
tags:
- '*'
- pull_request:
- branches:
- - main
- paths:
- - ".github/workflows/publish.yml"
workflow_dispatch:
+env:
+ rust_msrv: "1.77.1"
+
jobs:
publish:
runs-on: ubuntu-latest
@@ -42,9 +40,11 @@ jobs:
- "crates/catalog/rest"
steps:
- uses: actions/checkout@v4
- - name: Dryrun ${{ matrix.package }}
- working-directory: ${{ matrix.package }}
- run: cargo publish --all-features --dry-run
+
+ - name: Setup Rust toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: ${{ env.rust_msrv }}
- name: Publish ${{ matrix.package }}
working-directory: ${{ matrix.package }}
diff --git a/Makefile b/Makefile
index abee425..ed10a8a 100644
--- a/Makefile
+++ b/Makefile
@@ -17,34 +17,37 @@
.EXPORT_ALL_VARIABLES:
-RUST_LOG = debug
-
build:
- cargo build
+ cargo build --all-targets --all-features --workspace
check-fmt:
- cargo fmt --all -- --check
+ cargo fmt --all -- --check
check-clippy:
- cargo clippy --all-targets --all-features --workspace -- -D warnings
+ cargo clippy --all-targets --all-features --workspace -- -D warnings
+
+install-cargo-sort:
+ cargo install [email protected]
-cargo-sort:
- cargo install cargo-sort
+cargo-sort: install-cargo-sort
cargo sort -c -w
-cargo-machete:
+install-cargo-machete:
cargo install cargo-machete
+
+cargo-machete: install-cargo-machete
cargo machete
-fix-toml:
- cargo install taplo-cli --locked
+install-taplo-cli:
+ cargo install [email protected]
+
+fix-toml: install-taplo-cli
taplo fmt
-check-toml:
- cargo install taplo-cli --locked
+check-toml: install-taplo-cli
taplo check
-check: check-fmt check-clippy cargo-sort check-toml
+check: check-fmt check-clippy cargo-sort check-toml cargo-machete
doc-test:
cargo test --no-fail-fast --doc --all-features --workspace
diff --git a/README.md b/README.md
index 3f4f7a3..4f8265b 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,15 @@ The Apache Iceberg Rust project is composed of the following
components:
[iceberg-catalog-rest link]: https://crates.io/crates/iceberg-catalog-rest
[iceberg-catalog-rest release docs]: https://docs.rs/iceberg-catalog-rest
+## Supported Rust Version
+
+Iceberg Rust is built and tested with stable rust, and will keep a rolling
MSRV(minimum supported rust version). The
+current MSRV is 1.77.1.
+
+Also, we use unstable rust to run linters, such as `clippy` and `rustfmt`. But
this will not affect downstream users,
+and only MSRV is required.
+
+
## Contribute
Apache Iceberg is an active open-source project, governed under the Apache
Software Foundation (ASF). We are always open to people who want to use or
contribute to it. Here are some ways to get involved.
diff --git a/crates/iceberg/src/avro/schema.rs
b/crates/iceberg/src/avro/schema.rs
index d83cdc3..11d000c 100644
--- a/crates/iceberg/src/avro/schema.rs
+++ b/crates/iceberg/src/avro/schema.rs
@@ -19,10 +19,10 @@
use std::collections::BTreeMap;
use crate::spec::{
- visit_schema, ListType, MapType, NestedField, NestedFieldRef,
PrimitiveType, Schema,
- SchemaVisitor, StructType, Type,
+ visit_schema, ListType, MapType, NestedFieldRef, PrimitiveType, Schema,
SchemaVisitor,
+ StructType,
};
-use crate::{ensure_data_valid, Error, ErrorKind, Result};
+use crate::{Error, ErrorKind, Result};
use apache_avro::schema::{
DecimalSchema, FixedSchema, Name, RecordField as AvroRecordField,
RecordFieldOrder,
RecordSchema, UnionSchema,
@@ -272,205 +272,205 @@ fn avro_optional(avro_schema: AvroSchema) ->
Result<AvroSchema> {
])?))
}
-fn is_avro_optional(avro_schema: &AvroSchema) -> bool {
- match avro_schema {
- AvroSchema::Union(union) => union.is_nullable(),
- _ => false,
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::ensure_data_valid;
+ use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema,
StructType, Type};
+ use apache_avro::schema::{Namespace, UnionSchema};
+ use apache_avro::Schema as AvroSchema;
+ use std::fs::read_to_string;
+
+ fn is_avro_optional(avro_schema: &AvroSchema) -> bool {
+ match avro_schema {
+ AvroSchema::Union(union) => union.is_nullable(),
+ _ => false,
+ }
}
-}
-/// Post order avro schema visitor.
-pub(crate) trait AvroSchemaVisitor {
- type T;
+ /// Post order avro schema visitor.
+ pub(crate) trait AvroSchemaVisitor {
+ type T;
- fn record(&mut self, record: &RecordSchema, fields: Vec<Self::T>) ->
Result<Self::T>;
+ fn record(&mut self, record: &RecordSchema, fields: Vec<Self::T>) ->
Result<Self::T>;
- fn union(&mut self, union: &UnionSchema, options: Vec<Self::T>) ->
Result<Self::T>;
+ fn union(&mut self, union: &UnionSchema, options: Vec<Self::T>) ->
Result<Self::T>;
- fn array(&mut self, array: &AvroSchema, item: Self::T) -> Result<Self::T>;
- fn map(&mut self, map: &AvroSchema, value: Self::T) -> Result<Self::T>;
+ fn array(&mut self, array: &AvroSchema, item: Self::T) ->
Result<Self::T>;
+ fn map(&mut self, map: &AvroSchema, value: Self::T) -> Result<Self::T>;
- fn primitive(&mut self, schema: &AvroSchema) -> Result<Self::T>;
-}
+ fn primitive(&mut self, schema: &AvroSchema) -> Result<Self::T>;
+ }
-struct AvroSchemaToSchema {
- next_id: i32,
-}
+ struct AvroSchemaToSchema {
+ next_id: i32,
+ }
-impl AvroSchemaToSchema {
- fn next_field_id(&mut self) -> i32 {
- self.next_id += 1;
- self.next_id
+ impl AvroSchemaToSchema {
+ fn next_field_id(&mut self) -> i32 {
+ self.next_id += 1;
+ self.next_id
+ }
}
-}
-impl AvroSchemaVisitor for AvroSchemaToSchema {
- // Only `AvroSchema::Null` will return `None`
- type T = Option<Type>;
+ impl AvroSchemaVisitor for AvroSchemaToSchema {
+ // Only `AvroSchema::Null` will return `None`
+ type T = Option<Type>;
+
+ fn record(
+ &mut self,
+ record: &RecordSchema,
+ field_types: Vec<Option<Type>>,
+ ) -> Result<Option<Type>> {
+ let mut fields = Vec::with_capacity(field_types.len());
+ for (avro_field, typ) in record.fields.iter().zip_eq(field_types) {
+ let field_id = avro_field
+ .custom_attributes
+ .get(FILED_ID_PROP)
+ .and_then(Value::as_i64)
+ .ok_or_else(|| {
+ Error::new(
+ ErrorKind::DataInvalid,
+ format!("Can't convert field, missing field id:
{avro_field:?}"),
+ )
+ })?;
- fn record(
- &mut self,
- record: &RecordSchema,
- field_types: Vec<Option<Type>>,
- ) -> Result<Option<Type>> {
- let mut fields = Vec::with_capacity(field_types.len());
- for (avro_field, typ) in record.fields.iter().zip_eq(field_types) {
- let field_id = avro_field
- .custom_attributes
- .get(FILED_ID_PROP)
- .and_then(Value::as_i64)
- .ok_or_else(|| {
- Error::new(
- ErrorKind::DataInvalid,
- format!("Can't convert field, missing field id:
{avro_field:?}"),
- )
- })?;
+ let optional = is_avro_optional(&avro_field.schema);
- let optional = is_avro_optional(&avro_field.schema);
+ let mut field = if optional {
+ NestedField::optional(field_id as i32, &avro_field.name,
typ.unwrap())
+ } else {
+ NestedField::required(field_id as i32, &avro_field.name,
typ.unwrap())
+ };
- let mut field = if optional {
- NestedField::optional(field_id as i32, &avro_field.name,
typ.unwrap())
- } else {
- NestedField::required(field_id as i32, &avro_field.name,
typ.unwrap())
- };
+ if let Some(doc) = &avro_field.doc {
+ field = field.with_doc(doc);
+ }
- if let Some(doc) = &avro_field.doc {
- field = field.with_doc(doc);
+ fields.push(field.into());
}
- fields.push(field.into());
+ Ok(Some(Type::Struct(StructType::new(fields))))
}
- Ok(Some(Type::Struct(StructType::new(fields))))
- }
-
- fn union(
- &mut self,
- union: &UnionSchema,
- mut options: Vec<Option<Type>>,
- ) -> Result<Option<Type>> {
- ensure_data_valid!(
- options.len() <= 2 && !options.is_empty(),
- "Can't convert avro union type {:?} to iceberg.",
- union
- );
-
- if options.len() > 1 {
+ fn union(
+ &mut self,
+ union: &UnionSchema,
+ mut options: Vec<Option<Type>>,
+ ) -> Result<Option<Type>> {
ensure_data_valid!(
- options[0].is_none(),
+ options.len() <= 2 && !options.is_empty(),
"Can't convert avro union type {:?} to iceberg.",
union
);
- }
- if options.len() == 1 {
- Ok(Some(options.remove(0).unwrap()))
- } else {
- Ok(Some(options.remove(1).unwrap()))
- }
- }
+ if options.len() > 1 {
+ ensure_data_valid!(
+ options[0].is_none(),
+ "Can't convert avro union type {:?} to iceberg.",
+ union
+ );
+ }
- fn array(&mut self, array: &AvroSchema, item: Option<Type>) ->
Result<Self::T> {
- if let AvroSchema::Array(item_schema) = array {
- let element_field = NestedField::list_element(
- self.next_field_id(),
- item.unwrap(),
- !is_avro_optional(item_schema),
- )
- .into();
- Ok(Some(Type::List(ListType { element_field })))
- } else {
- Err(Error::new(
- ErrorKind::Unexpected,
- "Expected avro array schema, but {array}",
- ))
+ if options.len() == 1 {
+ Ok(Some(options.remove(0).unwrap()))
+ } else {
+ Ok(Some(options.remove(1).unwrap()))
+ }
}
- }
- fn map(&mut self, map: &AvroSchema, value: Option<Type>) ->
Result<Option<Type>> {
- if let AvroSchema::Map(value_schema) = map {
- // Due to avro rust implementation's limitation, we can't store
attributes in map schema,
- // we will fix it later when it has been resolved.
- let key_field = NestedField::map_key_element(
- self.next_field_id(),
- Type::Primitive(PrimitiveType::String),
- );
- let value_field = NestedField::map_value_element(
- self.next_field_id(),
- value.unwrap(),
- !is_avro_optional(value_schema),
- );
- Ok(Some(Type::Map(MapType {
- key_field: key_field.into(),
- value_field: value_field.into(),
- })))
- } else {
- Err(Error::new(
- ErrorKind::Unexpected,
- "Expected avro map schema, but {map}",
- ))
+ fn array(&mut self, array: &AvroSchema, item: Option<Type>) ->
Result<Self::T> {
+ if let AvroSchema::Array(item_schema) = array {
+ let element_field = NestedField::list_element(
+ self.next_field_id(),
+ item.unwrap(),
+ !is_avro_optional(item_schema),
+ )
+ .into();
+ Ok(Some(Type::List(ListType { element_field })))
+ } else {
+ Err(Error::new(
+ ErrorKind::Unexpected,
+ "Expected avro array schema, but {array}",
+ ))
+ }
}
- }
- fn primitive(&mut self, schema: &AvroSchema) -> Result<Option<Type>> {
- let typ = match schema {
- AvroSchema::Decimal(decimal) => {
- Type::decimal(decimal.precision as u32, decimal.scale as u32)?
+ fn map(&mut self, map: &AvroSchema, value: Option<Type>) ->
Result<Option<Type>> {
+ if let AvroSchema::Map(value_schema) = map {
+ // Due to avro rust implementation's limitation, we can't
store attributes in map schema,
+ // we will fix it later when it has been resolved.
+ let key_field = NestedField::map_key_element(
+ self.next_field_id(),
+ Type::Primitive(PrimitiveType::String),
+ );
+ let value_field = NestedField::map_value_element(
+ self.next_field_id(),
+ value.unwrap(),
+ !is_avro_optional(value_schema),
+ );
+ Ok(Some(Type::Map(MapType {
+ key_field: key_field.into(),
+ value_field: value_field.into(),
+ })))
+ } else {
+ Err(Error::new(
+ ErrorKind::Unexpected,
+ "Expected avro map schema, but {map}",
+ ))
}
- AvroSchema::Date => Type::Primitive(PrimitiveType::Date),
- AvroSchema::TimeMicros => Type::Primitive(PrimitiveType::Time),
- AvroSchema::TimestampMicros =>
Type::Primitive(PrimitiveType::Timestamp),
- AvroSchema::Boolean => Type::Primitive(PrimitiveType::Boolean),
- AvroSchema::Int => Type::Primitive(PrimitiveType::Int),
- AvroSchema::Long => Type::Primitive(PrimitiveType::Long),
- AvroSchema::Float => Type::Primitive(PrimitiveType::Float),
- AvroSchema::Double => Type::Primitive(PrimitiveType::Double),
- AvroSchema::String | AvroSchema::Enum(_) =>
Type::Primitive(PrimitiveType::String),
- AvroSchema::Fixed(fixed) => {
- if let Some(logical_type) = fixed.attributes.get(LOGICAL_TYPE)
{
- let logical_type = logical_type.as_str().ok_or_else(|| {
- Error::new(
- ErrorKind::DataInvalid,
- "logicalType in attributes of avro schema is not a
string type",
- )
- })?;
- match logical_type {
- UUID_LOGICAL_TYPE =>
Type::Primitive(PrimitiveType::Uuid),
- ty => {
- return Err(Error::new(
- ErrorKind::FeatureUnsupported,
- format!(
+ }
+
+ fn primitive(&mut self, schema: &AvroSchema) -> Result<Option<Type>> {
+ let typ = match schema {
+ AvroSchema::Decimal(decimal) => {
+ Type::decimal(decimal.precision as u32, decimal.scale as
u32)?
+ }
+ AvroSchema::Date => Type::Primitive(PrimitiveType::Date),
+ AvroSchema::TimeMicros => Type::Primitive(PrimitiveType::Time),
+ AvroSchema::TimestampMicros =>
Type::Primitive(PrimitiveType::Timestamp),
+ AvroSchema::Boolean => Type::Primitive(PrimitiveType::Boolean),
+ AvroSchema::Int => Type::Primitive(PrimitiveType::Int),
+ AvroSchema::Long => Type::Primitive(PrimitiveType::Long),
+ AvroSchema::Float => Type::Primitive(PrimitiveType::Float),
+ AvroSchema::Double => Type::Primitive(PrimitiveType::Double),
+ AvroSchema::String | AvroSchema::Enum(_) =>
Type::Primitive(PrimitiveType::String),
+ AvroSchema::Fixed(fixed) => {
+ if let Some(logical_type) =
fixed.attributes.get(LOGICAL_TYPE) {
+ let logical_type = logical_type.as_str().ok_or_else(||
{
+ Error::new(
+ ErrorKind::DataInvalid,
+ "logicalType in attributes of avro schema is
not a string type",
+ )
+ })?;
+ match logical_type {
+ UUID_LOGICAL_TYPE =>
Type::Primitive(PrimitiveType::Uuid),
+ ty => {
+ return Err(Error::new(
+ ErrorKind::FeatureUnsupported,
+ format!(
"Logical type {ty} is not support in
iceberg primitive type.",
),
- ))
+ ))
+ }
}
+ } else {
+ Type::Primitive(PrimitiveType::Fixed(fixed.size as
u64))
}
- } else {
- Type::Primitive(PrimitiveType::Fixed(fixed.size as u64))
}
- }
- AvroSchema::Bytes => Type::Primitive(PrimitiveType::Binary),
- AvroSchema::Null => return Ok(None),
- _ => {
- return Err(Error::new(
- ErrorKind::Unexpected,
- "Unable to convert avro {schema} to iceberg primitive
type.",
- ))
- }
- };
+ AvroSchema::Bytes => Type::Primitive(PrimitiveType::Binary),
+ AvroSchema::Null => return Ok(None),
+ _ => {
+ return Err(Error::new(
+ ErrorKind::Unexpected,
+ "Unable to convert avro {schema} to iceberg primitive
type.",
+ ))
+ }
+ };
- Ok(Some(typ))
+ Ok(Some(typ))
+ }
}
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::avro::schema::AvroSchemaToSchema;
- use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema,
StructType, Type};
- use apache_avro::schema::{Namespace, UnionSchema};
- use apache_avro::Schema as AvroSchema;
- use std::fs::read_to_string;
/// Visit avro schema in post order visitor.
pub(crate) fn visit<V: AvroSchemaVisitor>(
diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs
b/crates/iceberg/src/writer/file_writer/parquet_writer.rs
index 43e49fc..5f50e41 100644
--- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs
+++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs
@@ -627,9 +627,6 @@ mod tests {
use
crate::writer::file_writer::location_generator::DefaultFileNameGenerator;
use crate::writer::tests::check_parquet_data_file;
- #[derive(Clone)]
- struct TestLocationGen;
-
fn schema_for_all_type() -> Schema {
Schema::builder()
.with_schema_id(1)
diff --git a/crates/integrations/datafusion/src/schema.rs
b/crates/integrations/datafusion/src/schema.rs
index 2ba6962..f7b1a21 100644
--- a/crates/integrations/datafusion/src/schema.rs
+++ b/crates/integrations/datafusion/src/schema.rs
@@ -89,7 +89,7 @@ impl SchemaProvider for IcebergSchemaProvider {
}
fn table_exist(&self, name: &str) -> bool {
- self.tables.get(name).is_some()
+ self.tables.contains_key(name)
}
async fn table(&self, name: &str) -> DFResult<Option<Arc<dyn
TableProvider>>> {
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 6685489..7b10a86 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -16,5 +16,5 @@
# under the License.
[toolchain]
-channel = "1.77.1"
+channel = "nightly-2024-06-10"
components = ["rustfmt", "clippy"]