[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318403061 ## File path: api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java ## @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.expressions; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import java.io.Serializable; +import java.util.Collection; +import java.util.Iterator; +import java.util.Set; +import org.apache.iceberg.util.CharSequenceWrapper; + +public class BoundSetPredicate extends Predicate> { + private final LiteralSet literalSet; + + BoundSetPredicate(Operation op, BoundReference ref, Set> lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a set of literals", op); +this.literalSet = new LiteralSet<>(lits); + } + + BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a literal set", op); +this.literalSet = lits; + } + + @Override + public Expression negate() { +return new BoundSetPredicate<>(op().negate(), ref(), literalSet); + } + + public Set literalSet() { +return literalSet; + } + + @Override + String literalString() { +return literalSet.toString(); + } + + /** + * Represents a set of literal values in an IN or NOT_IN predicate + * @param The Java type of the value, which can be wrapped by a {@link Literal} + */ + private static class LiteralSet implements Set, Serializable { +private final Set values; + +@SuppressWarnings("unchecked") +LiteralSet(Set> lits) { + Preconditions.checkArgument(lits == null || lits.size() > 1, + "The input literal set must include more than 1 element."); + values = ImmutableSet.builder().addAll( + lits.stream().map( + lit -> { +if (lit instanceof Literals.StringLiteral) { + return (T) CharSequenceWrapper.wrap(((Literals.StringLiteral) lit).value()); +} else { + return lit.value(); +} + } + ).iterator()).build(); +} + +@Override +public String toString() { + return Joiner.on(", ").join(values); +} + +@Override +public boolean contains(Object object) { Review comment: Thanks for the comments. Updated. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402782 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java ## @@ -362,4 +368,197 @@ public void testCharSeqValue() { Assert.assertFalse("string(abc) == utf8(abcd) => false", evaluator.eval(TestHelpers.Row.of(new Utf8("abcd"; } + + @Test + public void testIn() { +Assert.assertEquals(3, in("s", 7, 8, 9).literals().size()); +Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size()); +Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size()); +Assert.assertEquals(0, in("s").literals().size()); +Assert.assertEquals(1, in("s", 5).literals().size()); +Assert.assertEquals(1, in("s", 5, 5).literals().size()); +Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size()); +Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size()); + +Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE)); +Assert.assertTrue("7 in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(7, 8, null))); +Assert.assertFalse("9 in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(9, 8, null))); + +Evaluator longEvaluator = new Evaluator(STRUCT, +in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); +Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true", +longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null))); +Assert.assertFalse("6 in [Integer.MAX_VALUE] => false", +longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + +Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1)); +Assert.assertTrue("7.0 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null))); +Assert.assertTrue("9.1 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); +Assert.assertFalse("6.8 in [7, 8, 9] => false", integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + +Evaluator structEvaluator = new Evaluator(STRUCT, in("s1.s2.s3.s4.i", 7, 8, 9)); +Assert.assertTrue("7 in [7, 8, 9] => true", +structEvaluator.eval(TestHelpers.Row.of(7, 8, null, +TestHelpers.Row.of( +TestHelpers.Row.of( +TestHelpers.Row.of( +TestHelpers.Row.of(7))); +Assert.assertFalse("6 in [7, 8, 9] => false", +structEvaluator.eval(TestHelpers.Row.of(6, 8, null, +TestHelpers.Row.of( +TestHelpers.Row.of( +TestHelpers.Row.of( +TestHelpers.Row.of(6))); + +StructType charSeqStruct = StructType.of(required(34, "s", Types.StringType.get())); +Evaluator charSeqEvaluator = new Evaluator(charSeqStruct, in("s", "abc", "abd", "abc")); +Assert.assertTrue("utf8(abc) in [string(abc), string(abd)] => true", +charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc"; +Assert.assertFalse("utf8(abcd) in [string(abc), string(abd)] => false", +charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd"; + } + + @Test + public void testInExceptions() { +TestHelpers.assertThrows( +"Throw exception if value is null", +NullPointerException.class, +"Cannot create expression literal from null", +() -> in("x", (Literal) null)); + +TestHelpers.assertThrows( +"Throw exception if value is null", +NullPointerException.class, +"Values cannot be null for IN predicate", +() -> in("x", (Collection) null)); + +TestHelpers.assertThrows( +"Throw exception if calling literals() for EQ predicate", +IllegalArgumentException.class, +"EQ predicate cannot return a list of literals", +() -> equal("x", 5).literals()); Review comment: 👌 Updated accordingly. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402818 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java ## @@ -362,4 +368,197 @@ public void testCharSeqValue() { Assert.assertFalse("string(abc) == utf8(abcd) => false", evaluator.eval(TestHelpers.Row.of(new Utf8("abcd"; } + + @Test + public void testIn() { +Assert.assertEquals(3, in("s", 7, 8, 9).literals().size()); +Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size()); +Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size()); +Assert.assertEquals(0, in("s").literals().size()); +Assert.assertEquals(1, in("s", 5).literals().size()); +Assert.assertEquals(1, in("s", 5, 5).literals().size()); +Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size()); +Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size()); + +Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE)); +Assert.assertTrue("7 in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(7, 8, null))); +Assert.assertFalse("9 in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(9, 8, null))); + +Evaluator longEvaluator = new Evaluator(STRUCT, +in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); +Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true", +longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null))); +Assert.assertFalse("6 in [Integer.MAX_VALUE] => false", +longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + +Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1)); +Assert.assertTrue("7.0 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null))); +Assert.assertTrue("9.1 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null))); +Assert.assertFalse("6.8 in [7, 8, 9] => false", integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); Review comment: Fixed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402850 ## File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java ## @@ -19,19 +19,14 @@ package org.apache.iceberg.parquet; -import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import java.util.Map; +import java.util.Set; import java.util.function.Function; import org.apache.iceberg.Schema; -import org.apache.iceberg.expressions.Binder; -import org.apache.iceberg.expressions.BoundReference; -import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.expressions.ExpressionVisitors; +import org.apache.iceberg.expressions.*; Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402737 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java ## @@ -362,4 +368,197 @@ public void testCharSeqValue() { Assert.assertFalse("string(abc) == utf8(abcd) => false", evaluator.eval(TestHelpers.Row.of(new Utf8("abcd"; } + + @Test + public void testIn() { +Assert.assertEquals(3, in("s", 7, 8, 9).literals().size()); +Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size()); +Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size()); +Assert.assertEquals(0, in("s").literals().size()); +Assert.assertEquals(1, in("s", 5).literals().size()); +Assert.assertEquals(1, in("s", 5, 5).literals().size()); +Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size()); +Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size()); + +Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE)); +Assert.assertTrue("7 in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(7, 8, null))); +Assert.assertFalse("9 in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(9, 8, null))); + +Evaluator longEvaluator = new Evaluator(STRUCT, +in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE)); +Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true", +longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null))); +Assert.assertFalse("6 in [Integer.MAX_VALUE] => false", +longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null))); + +Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1)); +Assert.assertTrue("7.0 in [7, 8, 9.1] => true", integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null))); Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402641 ## File path: api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java ## @@ -125,13 +149,40 @@ public Expression bind(Types.StructType struct, boolean caseSensitive) { case LT_EQ: case EQ: return Expressions.alwaysFalse(); -//case IN: -// break; -//case NOT_IN: -// break; } } return new BoundPredicate<>(op(), new BoundReference<>(field.fieldId(), -schema.accessorForField(field.fieldId())), lit); +schema.accessorForField(field.fieldId())), lit); + } + + @SuppressWarnings("unchecked") + private Expression bindInOperation(Types.NestedField field, Schema schema) { +final Set> lits = literals().stream().map( +l -> { + Literal lit = l.to(field.type()); + if (lit == null) { +throw new ValidationException(String.format( +"Invalid value for comparison inclusive type %s: %s (%s)", +field.type(), l.value(), l.value().getClass().getName())); + } + return lit; +}) +.filter(l -> l != Literals.aboveMax() && l != Literals.belowMin()) +.collect(Collectors.toSet()); + +if (lits.isEmpty()) { + return Expressions.alwaysFalse(); +} else if (lits.size() == 1) { + return new BoundPredicate<>(Operation.EQ, new BoundReference<>(field.fieldId(), + schema.accessorForField(field.fieldId())), lits.iterator().next()); Review comment: 👌 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402697 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java ## @@ -362,4 +368,197 @@ public void testCharSeqValue() { Assert.assertFalse("string(abc) == utf8(abcd) => false", evaluator.eval(TestHelpers.Row.of(new Utf8("abcd"; } + + @Test + public void testIn() { +Assert.assertEquals(3, in("s", 7, 8, 9).literals().size()); +Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size()); +Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size()); +Assert.assertEquals(0, in("s").literals().size()); +Assert.assertEquals(1, in("s", 5).literals().size()); +Assert.assertEquals(1, in("s", 5, 5).literals().size()); +Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size()); +Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size()); + +Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE)); +Assert.assertTrue("7 in [7, 8] => true", evaluator.eval(TestHelpers.Row.of(7, 8, null))); +Assert.assertFalse("9 in [7, 8] => false", evaluator.eval(TestHelpers.Row.of(9, 8, null))); + +Evaluator longEvaluator = new Evaluator(STRUCT, Review comment: Thanks and fixed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402469 ## File path: api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java ## @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.expressions; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import java.io.Serializable; +import java.util.Collection; +import java.util.Iterator; +import java.util.Set; +import org.apache.iceberg.util.CharSequenceWrapper; + +public class BoundSetPredicate extends Predicate> { + private final LiteralSet literalSet; + + BoundSetPredicate(Operation op, BoundReference ref, Set> lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a set of literals", op); +this.literalSet = new LiteralSet<>(lits); + } + + BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a literal set", op); +this.literalSet = lits; + } + + @Override + public Expression negate() { +return new BoundSetPredicate<>(op().negate(), ref(), literalSet); + } + + public Set literalSet() { +return literalSet; + } + + @Override + String literalString() { +return literalSet.toString(); + } + + /** + * Represents a set of literal values in an IN or NOT_IN predicate + * @param The Java type of the value, which can be wrapped by a {@link Literal} + */ + private static class LiteralSet implements Set, Serializable { +private final Set values; + +@SuppressWarnings("unchecked") +LiteralSet(Set> lits) { + Preconditions.checkArgument(lits == null || lits.size() > 1, + "The input literal set must include more than 1 element."); + values = ImmutableSet.builder().addAll( + lits.stream().map( + lit -> { +if (lit instanceof Literals.StringLiteral) { + return (T) CharSequenceWrapper.wrap(((Literals.StringLiteral) lit).value()); +} else { + return lit.value(); +} + } + ).iterator()).build(); +} + +@Override +public String toString() { + return Joiner.on(", ").join(values); +} + +@Override +public boolean contains(Object object) { + if (object instanceof CharSequence) { +return values.contains(CharSequenceWrapper.wrap((CharSequence) object)); + } + return values.contains(object); +} + +@Override +public int size() { + return values.size(); +} + +@Override +public boolean isEmpty() { + return values.isEmpty(); +} + +@Override +@SuppressWarnings("unchecked") +public Iterator iterator() { + return values.stream().map( + val -> { +if (val instanceof CharSequenceWrapper) { + return (T) ((CharSequenceWrapper) val).get(); +} else { + return val; +} + }).iterator(); +} + +@Override +public boolean containsAll(Collection c) { + throw new UnsupportedOperationException( + "LiteralSet currently only supports checking if a single item is contained in it."); +} + +@Override +public Object[] toArray() { + throw new UnsupportedOperationException( + "Please use iterator() to visit the elements in the set."); +} + +@Override +public X[] toArray(X[] a) { + throw new UnsupportedOperationException( + "Please use iterator() to visit the elements in the set."); +} + +@Override +public boolean add(T t) { + throw new UnsupportedOperationException( + "The set is immutable and cannot add an element."); +} + +@Override +public boolean remove(Object o) { + throw new UnsupportedOperationException( + "The set is immutable an
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402583 ## File path: api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java ## @@ -125,13 +149,40 @@ public Expression bind(Types.StructType struct, boolean caseSensitive) { case LT_EQ: case EQ: return Expressions.alwaysFalse(); -//case IN: -// break; -//case NOT_IN: -// break; } } return new BoundPredicate<>(op(), new BoundReference<>(field.fieldId(), -schema.accessorForField(field.fieldId())), lit); +schema.accessorForField(field.fieldId())), lit); Review comment: Fixed. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402556 ## File path: api/src/main/java/org/apache/iceberg/expressions/Predicate.java ## @@ -39,37 +37,35 @@ public R ref() { return ref; } - public Literal literal() { -return literal; - } + abstract String literalString(); @Override public String toString() { -switch (op) { +switch (op()) { Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates
jun-he commented on a change in pull request #357: Add in and not in predicates URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402380 ## File path: api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java ## @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.expressions; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import java.io.Serializable; +import java.util.Collection; +import java.util.Iterator; +import java.util.Set; +import org.apache.iceberg.util.CharSequenceWrapper; + +public class BoundSetPredicate extends Predicate> { + private final LiteralSet literalSet; + + BoundSetPredicate(Operation op, BoundReference ref, Set> lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a set of literals", op); +this.literalSet = new LiteralSet<>(lits); + } + + BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) { +super(op, ref); +Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN, +"%s predicate does not support a literal set", op); +this.literalSet = lits; + } + + @Override + public Expression negate() { +return new BoundSetPredicate<>(op().negate(), ref(), literalSet); + } + + public Set literalSet() { +return literalSet; + } + + @Override + String literalString() { +return literalSet.toString(); + } + + /** + * Represents a set of literal values in an IN or NOT_IN predicate + * @param The Java type of the value, which can be wrapped by a {@link Literal} + */ + private static class LiteralSet implements Set, Serializable { Review comment: 👌 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdsr commented on a change in pull request #421: Add friendly names to catalog tables
rdsr commented on a change in pull request #421: Add friendly names to catalog tables URL: https://github.com/apache/incubator-iceberg/pull/421#discussion_r318396277 ## File path: hive/src/main/java/org/apache/iceberg/hive/HiveCatalog.java ## @@ -43,6 +43,11 @@ public HiveCatalog(Configuration conf) { this.conf = conf; } + @Override + protected String name() { +return conf.get("hive.metastore.uris"); Review comment: This is usually a collection of uris. So I think the catalogName would come out as `thrift://host1:port1/,thrift://host2:port2/` . I don' think its a big deal. Just calling it out This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table
chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318368167 ## File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala ## @@ -297,5 +302,81 @@ object SparkTableUtil { ) } } + + private def buildManifest(table: Table, + sparkDataFiles: Seq[SparkDataFile], + partitionSpec: PartitionSpec): ManifestFile = { +val outputFile = table.io + .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + UUID.randomUUID.toString)) Review comment: Understood, I was still thinking serialize the manifest file... This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on issue #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#issuecomment-525518658 Merged. Thanks @moulimukherjee! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue merged pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue opened a new pull request #421: Add friendly names to catalog tables
rdblue opened a new pull request #421: Add friendly names to catalog tables URL: https://github.com/apache/incubator-iceberg/pull/421 This adds an abstract `name` method to `BaseMetastoreCatalog` and a new method to generate full table names from a catalog. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #386: Fix copies in partition data and field summaries
rdblue merged pull request #386: Fix copies in partition data and field summaries URL: https://github.com/apache/incubator-iceberg/pull/386 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324317 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,139 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// When updating from a metadata file location, call the helper method +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +String oldMetadataLocation = base.location(); + +// Write new metadata using helper method +String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + +// Example custom service which updates the metadata location for the given db and table atomically +CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { +if (fileIO == null) { + fileIO = new HadoopFileIO(conf); +} +return fileIO; + } + + // Optional: this can be overridden to provide custom location provider implementation + @Override + public LocationProvider locationProvider() { +// TODO + } +} +``` + +### Custom table implementation +Extend `BaseMetastoreCatalog` to provide default warehouse locations and instantiate `CustomTableOperations` + +Example: +```java +public class CustomCatalog extends BaseMetastoreCatalog { + + private Configuration configuration; + + public CustomCatalog(Configuration configuration) { +this.configuration = configuration; + } + + @Override + protected TableOperations newTableOps(TableIdentifier tableIdentifier) { +String dbName = tableIdentifier.namespace().level(0); +String tableName = tableIdentifier.name(); +// instantiate the CustomTableOperations +return new CustomTableOperations(configuration, dbName, tableName); + } + + @Override + protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { + +// Can choose to use any other configuration name +String tableLocation = configuration.get("custom.iceberg.warehouse.location"); + +// Can be an s3 or hdfs path +if (tableLocation == null) { + throw new RuntimeException("custom.iceberg.warehouse.location configuration not set!"); +} + +return String.format( +"%s/%s.db/%s", tableLocation, +tableIdentifier.namespace().levels()[0], +tableIdentifier.name()); + } + + @Override + public boolean dropTable(TableIdentifier identifier, boolean purge) { +// Example service to delete table +CustomService.deleteTable(identifier.namepsace().level(0), identifier.name()); + } + + @Override + public void renameTable(TableIdentifier from, TableIdentifier to) { +// TODO implement behavior +throw new RuntimeException("Not yet implemented"); Review comment: Ack 👍 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324221 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,139 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// When updating from a metadata file location, call the helper method +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +String oldMetadataLocation = base.location(); + +// Write new metadata using helper method +String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + +// Example custom service which updates the metadata location for the given db and table atomically +CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { +if (fileIO == null) { + fileIO = new HadoopFileIO(conf); +} +return fileIO; + } + + // Optional: this can be overridden to provide custom location provider implementation + @Override + public LocationProvider locationProvider() { Review comment: Let's remove this for now. We can add it back later if we want to show this example. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324025 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,139 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// When updating from a metadata file location, call the helper method +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +String oldMetadataLocation = base.location(); + +// Write new metadata using helper method +String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + +// Example custom service which updates the metadata location for the given db and table atomically +CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { +if (fileIO == null) { + fileIO = new HadoopFileIO(conf); +} +return fileIO; + } + + // Optional: this can be overridden to provide custom location provider implementation + @Override + public LocationProvider locationProvider() { +// TODO + } +} +``` + +### Custom table implementation +Extend `BaseMetastoreCatalog` to provide default warehouse locations and instantiate `CustomTableOperations` + +Example: +```java +public class CustomCatalog extends BaseMetastoreCatalog { + + private Configuration configuration; + + public CustomCatalog(Configuration configuration) { +this.configuration = configuration; + } + + @Override + protected TableOperations newTableOps(TableIdentifier tableIdentifier) { +String dbName = tableIdentifier.namespace().level(0); +String tableName = tableIdentifier.name(); +// instantiate the CustomTableOperations +return new CustomTableOperations(configuration, dbName, tableName); + } + + @Override + protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { + +// Can choose to use any other configuration name +String tableLocation = configuration.get("custom.iceberg.warehouse.location"); + +// Can be an s3 or hdfs path +if (tableLocation == null) { + throw new RuntimeException("custom.iceberg.warehouse.location configuration not set!"); +} + +return String.format( +"%s/%s.db/%s", tableLocation, +tableIdentifier.namespace().levels()[0], +tableIdentifier.name()); + } + + @Override + public boolean dropTable(TableIdentifier identifier, boolean purge) { +// Example service to delete table +CustomService.deleteTable(identifier.namepsace().level(0), identifier.name()); + } + + @Override + public void renameTable(TableIdentifier from, TableIdentifier to) { +// TODO implement behavior +throw new RuntimeException("Not yet implemented"); Review comment: As above, could this be a `CustomService` call? ```java Preconditions.checkArgument( from.namespace().level(0).equals(to.namespace().level(0)), "Cannot move table between databases"); CustomService.renameTable(from.namepsace().level(0), from.name(), to.name()); ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache
[GitHub] [incubator-iceberg] rdblue commented on issue #420: Add commit.manifest-merge.enabled table property
rdblue commented on issue #420: Add commit.manifest-merge.enabled table property URL: https://github.com/apache/incubator-iceberg/pull/420#issuecomment-525503667 @bryanck, here's the PR to disable manifest merging. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue opened a new pull request #420: Add commit.manifest-merge.enabled table property
rdblue opened a new pull request #420: Add commit.manifest-merge.enabled table property URL: https://github.com/apache/incubator-iceberg/pull/420 This property can disable manifest merging. This is intended to be used with `RewriteManifests`. Instead of automatic merging, `RewriteManifests` is used to maintain metadata. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #419: Adding common checks in commit()
rdblue commented on issue #419: Adding common checks in commit() URL: https://github.com/apache/incubator-iceberg/pull/419#issuecomment-525498398 Merged. Thanks @moulimukherjee! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #419: Adding common checks in commit()
rdblue merged pull request #419: Adding common checks in commit() URL: https://github.com/apache/incubator-iceberg/pull/419 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #419: Adding common checks in commit()
rdblue commented on issue #419: Adding common checks in commit() URL: https://github.com/apache/incubator-iceberg/pull/419#issuecomment-525481988 Looks good to me. I'll merge once tests are passing. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318295030 ## File path: site/mkdocs.yml ## @@ -52,6 +52,7 @@ nav: - Quickstart: api-quickstart.md - Spark: spark.md - Presto: presto.md +- Custom Catalog: custom-catalog.md Review comment: Done This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318291645 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +// if the metadata is already out of date, reject it +if (base != current()) { Review comment: Added it at https://github.com/apache/incubator-iceberg/pull/419, will remove it from here. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] moulimukherjee opened a new pull request #419: Adding common checks in commit()
moulimukherjee opened a new pull request #419: Adding common checks in commit() URL: https://github.com/apache/incubator-iceberg/pull/419 Adding common checks in commit() From conversation at https://github.com/apache/incubator-iceberg/pull/416/files#r318243051 r? @rdblue @aokolnychyi This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318268122 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +// if the metadata is already out of date, reject it +if (base != current()) { + throw new CommitFailedException("Cannot commit: stale table metadata for %s.%s", dbName, tableName); +} + +// if the metadata is not changed, return early +if (base == metadata) { + return; +} + +String oldMetadataLocation = base.location(); + +// Write new metadata +String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + +// Example custom service which updates the metadata location for the given db and table atomically +CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { +if (fileIO == null) { + fileIO = new HadoopFileIO(conf); +} +return fileIO; + } + + // Optional: this can be overridden to provide custom location provider implementation + @Override + public LocationProvider locationProvider() { +// TODO + } +} +``` + +### Custom table implementation +Extend `BaseMetastoreCatalog` to provide default warehouse locations and instantiate `CustomTableOperations` + +Example: +```java +public class CustomCatalog extends BaseMetastoreCatalog { + + private Configuration configuration; + + public CustomCatalog(Configuration configuration) { +this.configuration = configuration; + } + + @Override + protected TableOperations newTableOps(TableIdentifier tableIdentifier) { +String dbName = tableIdentifier.namespace().level(0); +String tableName = tableIdentifier.name(); +// instantiate the CustomTableOperations +return new CustomTableOperations(configuration, dbName, tableName); + } + + @Override + protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { + +// Can choose to use any other configuration name +String tableLocation = configuration.get("custom.iceberg.warehouse.location"); + +// Can be an s3 or hdfs path +if (tableLocation == null) { + throw new RuntimeException("custom.iceberg.warehouse.location configuration not set!"); +} + +return String.format( +"%s/%s.db/%s", tableLocation, +tableIdentifier.namespace().levels()[0], +tableIdentifier.name()); + } + + @Override + public boolean dropTable(TableIdentifier identifier, boolean purge) { +// TODO implement behavior +throw new RuntimeException("Not yet implemented"); Review comment: Ack This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #349: Remove unreferenced hash computations
rdblue commented on issue #349: Remove unreferenced hash computations URL: https://github.com/apache/incubator-iceberg/pull/349#issuecomment-525455644 +1 Thanks for fixing this @jbapple! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #349: Remove unreferenced hash computations
rdblue merged pull request #349: Remove unreferenced hash computations URL: https://github.com/apache/incubator-iceberg/pull/349 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue edited a comment on issue #280: Add persistent IDs to partition fields
rdblue edited a comment on issue #280: Add persistent IDs to partition fields URL: https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525451795 > And also as TableMetadata knows how many fields are in partition, so can maintain the nextIDValue as well. The next partition field ID is the highest field ID in all of the table's partition specs +1. Once a partition spec is removed, we can reuse the ID. Alternatively, we can keep track of the last assigned ID, like we do for the table schema. > Also the TableMetadata#updatePartitionSpec should also use nextIDValue to pass to PartitionSpec. I think the spec's IDs will be assigned by the time that method is called because the partition spec passed in is already created. > Does modifying/dropping columns also needs to be taken care, I believe not? No. `updateSchema` already checks compatibility: https://github.com/apache/incubator-iceberg/blob/master/core/src/main/java/org/apache/iceberg/TableMetadata.java#L284 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #280: Add persistent IDs to partition fields
rdblue commented on issue #280: Add persistent IDs to partition fields URL: https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525451795 > And also as TableMetadata knows how many fields are in partition, so can maintain the nextIDValue as well. The next partition field ID is the highest field ID in all of the table's partition specs +1. Once a partition spec is removed, we can reuse the ID. Alternatively, we can keep track of the last assigned ID, like we do for the table schema. > Also the TableMetadata#updatePartitionSpec should also use nextIDValue to pass to PartitionSpec. I think the spec's IDs will be assigned by the time that method is called because the partition spec passed in is already created. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table
rdblue commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318254975 ## File path: spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java ## @@ -116,4 +123,28 @@ public void testPartitionScanByFilter() { Dataset partitionDF = SparkTableUtil.partitionDFByFilter(spark, qualifiedTableName, "data = 'a'"); Assert.assertEquals("There should be 1 matching partition", 1, partitionDF.count()); } + + @Test + public void testImportPartitionedTable() throws Exception { Review comment: Can you run these tests to create Hive catalog tables as well? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table
rdblue commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318254975 ## File path: spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java ## @@ -116,4 +123,28 @@ public void testPartitionScanByFilter() { Dataset partitionDF = SparkTableUtil.partitionDFByFilter(spark, qualifiedTableName, "data = 'a'"); Assert.assertEquals("There should be 1 matching partition", 1, partitionDF.count()); } + + @Test + public void testImportPartitionedTable() throws Exception { Review comment: Can you run these tests with Hive tables as well? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318252717 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +// if the metadata is already out of date, reject it +if (base != current()) { + throw new CommitFailedException("Cannot commit: stale table metadata for %s.%s", dbName, tableName); +} + +// if the metadata is not changed, return early +if (base == metadata) { + return; +} + +String oldMetadataLocation = base.location(); + +// Write new metadata +String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + +// Example custom service which updates the metadata location for the given db and table atomically +CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { +if (fileIO == null) { + fileIO = new HadoopFileIO(conf); +} +return fileIO; + } + + // Optional: this can be overridden to provide custom location provider implementation + @Override + public LocationProvider locationProvider() { +// TODO + } +} +``` + +### Custom table implementation +Extend `BaseMetastoreCatalog` to provide default warehouse locations and instantiate `CustomTableOperations` + +Example: +```java +public class CustomCatalog extends BaseMetastoreCatalog { + + private Configuration configuration; + + public CustomCatalog(Configuration configuration) { +this.configuration = configuration; + } + + @Override + protected TableOperations newTableOps(TableIdentifier tableIdentifier) { +String dbName = tableIdentifier.namespace().level(0); +String tableName = tableIdentifier.name(); +// instantiate the CustomTableOperations +return new CustomTableOperations(configuration, dbName, tableName); + } + + @Override + protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { + +// Can choose to use any other configuration name +String tableLocation = configuration.get("custom.iceberg.warehouse.location"); + +// Can be an s3 or hdfs path +if (tableLocation == null) { + throw new RuntimeException("custom.iceberg.warehouse.location configuration not set!"); +} + +return String.format( +"%s/%s.db/%s", tableLocation, +tableIdentifier.namespace().levels()[0], +tableIdentifier.name()); + } + + @Override + public boolean dropTable(TableIdentifier identifier, boolean purge) { +// TODO implement behavior +throw new RuntimeException("Not yet implemented"); Review comment: Maybe `CustomService.deleteTable(identifier.namepsace().level(0), identifier.name())`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additiona
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318244402 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata Review comment: How about "when updating from a metadata file location, call the helper method" This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243982 ## File path: site/mkdocs.yml ## @@ -52,6 +52,7 @@ nav: - Quickstart: api-quickstart.md - Spark: spark.md - Presto: presto.md +- Custom Catalog: custom-catalog.md Review comment: I'd probably move this to the Java drop-down since it shows how to implement a catalog in Java. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243051 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +// if the metadata is already out of date, reject it +if (base != current()) { Review comment: Good point, it does. Same with the base == metadata check. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg
rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243051 ## File path: site/docs/custom-catalog.md ## @@ -0,0 +1,149 @@ +# Custom Catalog Implementation + +It's possible to read an iceberg table either from an hdfs path or from a hive table. It's also possible to use a custom metastore in place of hive. The steps to do that are as follows. + +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-table-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { +this.conf = conf; +this.dbName = dbName; +this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + +// Example custom service which returns the metadata location given a dbName and tableName +String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + +// Use existing method to refresh metadata +refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { +// if the metadata is already out of date, reject it +if (base != current()) { Review comment: Good point, it does. Same with the base == current() check. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] manishmalhotrawork commented on issue #280: Add persistent IDs to partition fields
manishmalhotrawork commented on issue #280: Add persistent IDs to partition fields URL: https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525415789 @rdblue thanks. Let me take a stab on this, may be as you mentioned in description. Passing initial value for the partitionId to PartitionSpec from TableMetadata. And also as TableMetadata knows how many fields are in partition, so can maintain the nextIDValue as well. So, that when `PartitionSpec.partitionType` will be called, it will use the passed value as first ID and then increment. Also the [TableMetadata#updatePartitionSpec](https://github.com/apache/incubator-iceberg/blob/master/core/src/main/java/org/apache/iceberg/TableMetadata.java#L294) should also use nextIDValue to pass to PartitionSpec. Does modifying/dropping columns also needs to be taken care, I believe not? yeah, for my understanding, a sample partitioned table manifest file's schema has this entry for partition. Where field-id is 1000+. ``` "name" : "partition", "type" : { "type" : "record", "name" : "r102", "fields" : [ { "name" : "order_status", "type" : [ "null", "string" ], "default" : null, "field-id" : 1000 }, { "name" : "ship_priority", "type" : [ "null", "int" ], "default" : null, "field-id" : 1001 }, { "name" : "order_key_bucket", "type" : [ "null", "int" ], "default" : null, "field-id" : 1002 } ]``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #411: Ignore unsupported partition transforms for forward compatibility
rdblue commented on issue #411: Ignore unsupported partition transforms for forward compatibility URL: https://github.com/apache/incubator-iceberg/pull/411#issuecomment-525412584 @aokolnychyi, can you review this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table
rdblue commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318200212 ## File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala ## @@ -297,5 +302,81 @@ object SparkTableUtil { ) } } + + private def buildManifest(table: Table, + sparkDataFiles: Seq[SparkDataFile], + partitionSpec: PartitionSpec): ManifestFile = { +val outputFile = table.io + .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + UUID.randomUUID.toString)) Review comment: This can't be local because it is written on executors and read from the driver. If those are on different machines, the driver won't be able to find the location. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #388: Handle rollback in snapshot expiration
rdblue merged pull request #388: Handle rollback in snapshot expiration URL: https://github.com/apache/incubator-iceberg/pull/388 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on issue #398: Push down StringStartsWith in Spark IcebergSource
rdblue commented on issue #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#issuecomment-525398380 +1 There were a couple of minor points, but I think we can fix those in a follow-up. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue merged pull request #398: Push down StringStartsWith in Spark IcebergSource
rdblue merged pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue closed issue #397: Push down StringStartsWith in Spark IcebergSource
rdblue closed issue #397: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/issues/397 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318197341 ## File path: parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java ## @@ -231,6 +236,41 @@ public void testRequiredColumn() { Assert.assertFalse("Should skip: required columns are always non-null", shouldRead); } + @Test + public void testStartsWith() { +boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("non_dict", "re")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertTrue("Should read: no dictionary", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "re")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "req")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "so")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertTrue("Should read: dictionary contains a matching entry", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_stats", UUID.randomUUID().toString())) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertFalse("Should skip: no stats but dictionary is present", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("required", "reqs")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertFalse("Should skip: no match in dictionary", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("some_nulls", "somex")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertFalse("Should skip: no match in dictionary", shouldRead); + +shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, startsWith("no_nulls", "xxx")) +.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE); +Assert.assertFalse("Should skip: no match in dictionary", shouldRead); Review comment: Can you add tests for `all_nulls` and `not_in_file` columns? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318196009 ## File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java ## @@ -339,6 +341,47 @@ public Boolean or(Boolean leftResult, Boolean rightResult) { return ROWS_MIGHT_MATCH; } +@Override +@SuppressWarnings("unchecked") +public Boolean startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + + Long valueCount = valueCounts.get(id); + if (valueCount == null) { +// the column is not present and is all nulls +return ROWS_CANNOT_MATCH; + } + + Statistics colStats = (Statistics) stats.get(id); + if (colStats != null && !colStats.isEmpty()) { +if (!colStats.hasNonNullValue()) { + return ROWS_CANNOT_MATCH; +} + +Binary prefixAsBinary = Binary.fromConstantByteBuffer(lit.toByteBuffer()); + +PrimitiveComparator comparator = PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR; + +Binary lower = colStats.genericGetMin(); +// truncate lower bound so that its length in bytes is not greater than the length of prefix +int lowerLength = Math.min(prefixAsBinary.length(), lower.length()); +int lowerCmp = comparator.compare(lower.slice(0, lowerLength), prefixAsBinary); Review comment: I just looked at Parquet and I think it is going to be cheaper to use `toByteBuffer().slice(...)` instead of `slice(...)` because the slice implementation for most of the `Binary` implementations calls `getBytesUnsafe()` that returns a `byte[]` and usually copies data. Another minor point is that I think it would be better to use Iceberg's comparators instead of Parquet's comparators to ensure we have the same behavior everywhere. So I think this should be updated to convert the binary to a byte buffer, slice that, and then compare the two using Iceberg's `Comparators.unsignedBytes()`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318191642 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java ## @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() { public void testCaseSensitiveIntegerNotEqRewritten() { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 5)), true).eval(FILE); } + + @Test + public void testStringStartsWith() { +boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); +Assert.assertTrue("Should read: no stats", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1s"), true).eval(FILE_3); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1str1x"), true).eval(FILE_3); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "ff"), true).eval(FILE_4); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aB"), true).eval(FILE_2); Review comment: Okay, that works for me. You might add a comment to show your intent in the future, but it's not a big deal here. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318178750 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java ## @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() { public void testCaseSensitiveIntegerNotEqRewritten() { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 5)), true).eval(FILE); } + + @Test + public void testStringStartsWith() { +boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); +Assert.assertTrue("Should read: no stats", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); Review comment: You're right, I think that behavior is correct. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table
chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318176144 ## File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala ## @@ -297,5 +302,81 @@ object SparkTableUtil { ) } } + + private def buildManifest(table: Table, + sparkDataFiles: Seq[SparkDataFile], + partitionSpec: PartitionSpec): ManifestFile = { +val outputFile = table.io + .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + UUID.randomUUID.toString)) +val writer = ManifestWriter.write(partitionSpec, outputFile) +try { + sparkDataFiles.foreach { file => +writer.add(file.toDataFile(partitionSpec)) + } +} finally { + writer.close() +} + +writer.toManifestFile + } + + /** + * Import a spark table to a iceberg table. + * + * The import uses the spark session to get table metadata. It assumes no + * operation is going on original table and target table and thus is not + * thread-safe. + * + * @param source the database name of the table to be import + * @param location the location used to store table metadata + * + * @return table the imported table + */ + def importSparkTable(source: TableIdentifier, location: String): Table = { +val sparkSession = SparkSession.builder().getOrCreate() +import sparkSession.sqlContext.implicits._ + +val dbName = source.database.getOrElse("default") +val tableName = source.table + +if (!sparkSession.catalog.tableExists(dbName, tableName)) { + throw new NoSuchTableException(s"Table $dbName.$tableName does not exist") +} + +val partitionSpec = SparkSchemaUtil.specForTable(sparkSession, s"$dbName.$tableName") +val conf = sparkSession.sparkContext.hadoopConfiguration +val tables = new HadoopTables(conf) +val schema = SparkSchemaUtil.schemaForTable(sparkSession, s"$dbName.$tableName") +val table = tables.create(schema, partitionSpec, ImmutableMap.of(), location) +val appender = table.newAppend() + +if (partitionSpec == PartitionSpec.unpartitioned) { + val tableMetadata = sparkSession.sessionState.catalog.getTableMetadata(source) + val format = tableMetadata.provider.getOrElse("none") + + if (format != "avro" && format != "parquet" && format != "orc") { +throw new UnsupportedOperationException(s"Unsupported format: $format") + } + listPartition(Map.empty[String, String], tableMetadata.location.toString, +format).foreach{f => appender.appendFile(f.toDataFile(PartitionSpec.unpartitioned))} + appender.commit() +} else { + val partitions = partitionDF(sparkSession, s"$dbName.$tableName") + partitions.flatMap { row => +listPartition(row.getMap[String, String](0).toMap, row.getString(1), row.getString(2)) + }.coalesce(1).mapPartitions { Review comment: That's a temporary use. I left comments to discuss a way to serialize manifest info [here](https://github.com/apache/incubator-iceberg/pull/374#issuecomment-522904779). Looks like the case class is a good way to do that, I will copy this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table
chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318174610 ## File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala ## @@ -297,5 +302,81 @@ object SparkTableUtil { ) } } + + private def buildManifest(table: Table, + sparkDataFiles: Seq[SparkDataFile], + partitionSpec: PartitionSpec): ManifestFile = { +val outputFile = table.io + .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + UUID.randomUUID.toString)) +val writer = ManifestWriter.write(partitionSpec, outputFile) +try { + sparkDataFiles.foreach { file => +writer.add(file.toDataFile(partitionSpec)) + } +} finally { + writer.close() +} + +writer.toManifestFile + } + + /** + * Import a spark table to a iceberg table. + * + * The import uses the spark session to get table metadata. It assumes no + * operation is going on original table and target table and thus is not + * thread-safe. + * + * @param source the database name of the table to be import + * @param location the location used to store table metadata + * + * @return table the imported table + */ + def importSparkTable(source: TableIdentifier, location: String): Table = { +val sparkSession = SparkSession.builder().getOrCreate() +import sparkSession.sqlContext.implicits._ + +val dbName = source.database.getOrElse("default") +val tableName = source.table + +if (!sparkSession.catalog.tableExists(dbName, tableName)) { + throw new NoSuchTableException(s"Table $dbName.$tableName does not exist") +} + +val partitionSpec = SparkSchemaUtil.specForTable(sparkSession, s"$dbName.$tableName") +val conf = sparkSession.sparkContext.hadoopConfiguration +val tables = new HadoopTables(conf) +val schema = SparkSchemaUtil.schemaForTable(sparkSession, s"$dbName.$tableName") +val table = tables.create(schema, partitionSpec, ImmutableMap.of(), location) +val appender = table.newAppend() + +if (partitionSpec == PartitionSpec.unpartitioned) { + val tableMetadata = sparkSession.sessionState.catalog.getTableMetadata(source) + val format = tableMetadata.provider.getOrElse("none") + + if (format != "avro" && format != "parquet" && format != "orc") { Review comment: ok, will update. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table
chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318171088 ## File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala ## @@ -297,5 +302,81 @@ object SparkTableUtil { ) } } + + private def buildManifest(table: Table, + sparkDataFiles: Seq[SparkDataFile], + partitionSpec: PartitionSpec): ManifestFile = { +val outputFile = table.io + .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + UUID.randomUUID.toString)) Review comment: Agreed. Just want to confirm why it can't be the local file? It is just a temporary file, I wanted to store it in the tmpfs of localhost to save some IO. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi commented on issue #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi commented on issue #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#issuecomment-525331795 I extended the scope of this PR to also cover filtering of manifests. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi closed pull request #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi closed pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi opened a new pull request #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi opened a new pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398 This PR adds support for pushdown of `StringStartsWith` predicates in Spark and resolves #397. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318107696 ## File path: spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java ## @@ -153,6 +156,11 @@ public static Expression convert(Filter filter) { } return null; } + +case STARTS_WITH: { + StringStartsWith stringStartsWith = (StringStartsWith) filter; + return startsWith(stringStartsWith.attribute(), stringStartsWith.value()); +} Review comment: I've extended `TestFilteredScan` to verify this logic. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r317970914 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java ## @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() { public void testCaseSensitiveIntegerNotEqRewritten() { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 5)), true).eval(FILE); } + + @Test + public void testStringStartsWith() { +boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); +Assert.assertTrue("Should read: no stats", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1s"), true).eval(FILE_3); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "1str1x"), true).eval(FILE_3); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "ff"), true).eval(FILE_4); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aB"), true).eval(FILE_2); Review comment: I just wanted to test case sensitivity as well. I can remove this as other cases are covered by remaining tests. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org
[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource
aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r317964972 ## File path: api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java ## @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() { public void testCaseSensitiveIntegerNotEqRewritten() { boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 5)), true).eval(FILE); } + + @Test + public void testStringStartsWith() { +boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE); +Assert.assertTrue("Should read: no stats", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "a"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aa"), true).eval(FILE_2); +Assert.assertTrue("Should read: range matches", shouldRead); + +shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", "aaa"), true).eval(FILE_2); Review comment: I'll add one. Actually, we don't truncate the prefix. For example, if the upper bound is `3str3` and our prefix is `3str3abc`, then we cannot match, right? Seems safe to me. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services - To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org