date:20190827

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318403061
 
 

 ##
 File path: 
api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java
 ##
 @@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.expressions;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+import org.apache.iceberg.util.CharSequenceWrapper;
+
+public class BoundSetPredicate extends Predicate> {
+  private final LiteralSet literalSet;
+
+  BoundSetPredicate(Operation op, BoundReference ref, Set> lits) 
{
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a set of literals", op);
+this.literalSet = new LiteralSet<>(lits);
+  }
+
+  BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) {
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a literal set", op);
+this.literalSet = lits;
+  }
+
+  @Override
+  public Expression negate() {
+return new BoundSetPredicate<>(op().negate(), ref(), literalSet);
+  }
+
+  public Set literalSet() {
+return literalSet;
+  }
+
+  @Override
+  String literalString() {
+return literalSet.toString();
+  }
+
+  /**
+   * Represents a set of literal values in an IN or NOT_IN predicate
+   * @param  The Java type of the value, which can be wrapped by a {@link 
Literal}
+   */
+  private static class LiteralSet implements Set, Serializable {
+private final Set values;
+
+@SuppressWarnings("unchecked")
+LiteralSet(Set> lits) {
+  Preconditions.checkArgument(lits == null || lits.size() > 1,
+  "The input literal set must include more than 1 element.");
+  values = ImmutableSet.builder().addAll(
+  lits.stream().map(
+  lit -> {
+if (lit instanceof Literals.StringLiteral) {
+  return (T) 
CharSequenceWrapper.wrap(((Literals.StringLiteral) lit).value());
+} else {
+  return lit.value();
+}
+  }
+  ).iterator()).build();
+}
+
+@Override
+public String toString() {
+  return Joiner.on(", ").join(values);
+}
+
+@Override
+public boolean contains(Object object) {
 
 Review comment:
   Thanks for the comments. Updated.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402782
 
 

 ##
 File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java
 ##
 @@ -362,4 +368,197 @@ public void testCharSeqValue() {
 Assert.assertFalse("string(abc) == utf8(abcd) => false",
 evaluator.eval(TestHelpers.Row.of(new Utf8("abcd";
   }
+
+  @Test
+  public void testIn() {
+Assert.assertEquals(3, in("s", 7, 8, 9).literals().size());
+Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size());
+Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size());
+Assert.assertEquals(0, in("s").literals().size());
+Assert.assertEquals(1, in("s", 5).literals().size());
+Assert.assertEquals(1, in("s", 5, 5).literals().size());
+Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size());
+Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size());
+
+Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE));
+Assert.assertTrue("7 in [7, 8] => true", 
evaluator.eval(TestHelpers.Row.of(7, 8, null)));
+Assert.assertFalse("9 in [7, 8]  => false", 
evaluator.eval(TestHelpers.Row.of(9, 8, null)));
+
+Evaluator longEvaluator = new Evaluator(STRUCT,
+in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE));
+Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true",
+longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null)));
+Assert.assertFalse("6 in [Integer.MAX_VALUE]  => false",
+longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null)));
+
+Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1));
+Assert.assertTrue("7.0 in [7, 8, 9.1] => true", 
integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null)));
+Assert.assertTrue("9.1 in [7, 8, 9.1] => true", 
integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null)));
+Assert.assertFalse("6.8 in [7, 8, 9]  => false", 
integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null)));
+
+Evaluator structEvaluator = new Evaluator(STRUCT, in("s1.s2.s3.s4.i", 7, 
8, 9));
+Assert.assertTrue("7 in [7, 8, 9] => true",
+structEvaluator.eval(TestHelpers.Row.of(7, 8, null,
+TestHelpers.Row.of(
+TestHelpers.Row.of(
+TestHelpers.Row.of(
+TestHelpers.Row.of(7)));
+Assert.assertFalse("6 in [7, 8, 9]  => false",
+structEvaluator.eval(TestHelpers.Row.of(6, 8, null,
+TestHelpers.Row.of(
+TestHelpers.Row.of(
+TestHelpers.Row.of(
+TestHelpers.Row.of(6)));
+
+StructType charSeqStruct = StructType.of(required(34, "s", 
Types.StringType.get()));
+Evaluator charSeqEvaluator = new Evaluator(charSeqStruct, in("s", "abc", 
"abd", "abc"));
+Assert.assertTrue("utf8(abc) in [string(abc), string(abd)] => true",
+charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abc";
+Assert.assertFalse("utf8(abcd) in [string(abc), string(abd)] => false",
+charSeqEvaluator.eval(TestHelpers.Row.of(new Utf8("abcd";
+  }
+
+  @Test
+  public void testInExceptions() {
+TestHelpers.assertThrows(
+"Throw exception if value is null",
+NullPointerException.class,
+"Cannot create expression literal from null",
+() -> in("x", (Literal) null));
+
+TestHelpers.assertThrows(
+"Throw exception if value is null",
+NullPointerException.class,
+"Values cannot be null for IN predicate",
+() -> in("x", (Collection) null));
+
+TestHelpers.assertThrows(
+"Throw exception if calling literals() for EQ predicate",
+IllegalArgumentException.class,
+"EQ predicate cannot return a list of literals",
+() -> equal("x", 5).literals());
 
 Review comment:
   👌 Updated accordingly.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402818
 
 

 ##
 File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java
 ##
 @@ -362,4 +368,197 @@ public void testCharSeqValue() {
 Assert.assertFalse("string(abc) == utf8(abcd) => false",
 evaluator.eval(TestHelpers.Row.of(new Utf8("abcd";
   }
+
+  @Test
+  public void testIn() {
+Assert.assertEquals(3, in("s", 7, 8, 9).literals().size());
+Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size());
+Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size());
+Assert.assertEquals(0, in("s").literals().size());
+Assert.assertEquals(1, in("s", 5).literals().size());
+Assert.assertEquals(1, in("s", 5, 5).literals().size());
+Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size());
+Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size());
+
+Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE));
+Assert.assertTrue("7 in [7, 8] => true", 
evaluator.eval(TestHelpers.Row.of(7, 8, null)));
+Assert.assertFalse("9 in [7, 8]  => false", 
evaluator.eval(TestHelpers.Row.of(9, 8, null)));
+
+Evaluator longEvaluator = new Evaluator(STRUCT,
+in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE));
+Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true",
+longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null)));
+Assert.assertFalse("6 in [Integer.MAX_VALUE]  => false",
+longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null)));
+
+Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1));
+Assert.assertTrue("7.0 in [7, 8, 9.1] => true", 
integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null)));
+Assert.assertTrue("9.1 in [7, 8, 9.1] => true", 
integerEvaluator.eval(TestHelpers.Row.of(7, 9.1, null)));
+Assert.assertFalse("6.8 in [7, 8, 9]  => false", 
integerEvaluator.eval(TestHelpers.Row.of(6, 6.8, null)));
 
 Review comment:
   Fixed.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402850
 
 

 ##
 File path: 
parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
 ##
 @@ -19,19 +19,14 @@
 
 package org.apache.iceberg.parquet;
 
-import com.google.common.base.Preconditions;
 import com.google.common.collect.Maps;
 import java.util.Map;
+import java.util.Set;
 import java.util.function.Function;
 import org.apache.iceberg.Schema;
-import org.apache.iceberg.expressions.Binder;
-import org.apache.iceberg.expressions.BoundReference;
-import org.apache.iceberg.expressions.Expression;
-import org.apache.iceberg.expressions.ExpressionVisitors;
+import org.apache.iceberg.expressions.*;
 
 Review comment:
   Done.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402737
 
 

 ##
 File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java
 ##
 @@ -362,4 +368,197 @@ public void testCharSeqValue() {
 Assert.assertFalse("string(abc) == utf8(abcd) => false",
 evaluator.eval(TestHelpers.Row.of(new Utf8("abcd";
   }
+
+  @Test
+  public void testIn() {
+Assert.assertEquals(3, in("s", 7, 8, 9).literals().size());
+Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size());
+Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size());
+Assert.assertEquals(0, in("s").literals().size());
+Assert.assertEquals(1, in("s", 5).literals().size());
+Assert.assertEquals(1, in("s", 5, 5).literals().size());
+Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size());
+Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size());
+
+Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE));
+Assert.assertTrue("7 in [7, 8] => true", 
evaluator.eval(TestHelpers.Row.of(7, 8, null)));
+Assert.assertFalse("9 in [7, 8]  => false", 
evaluator.eval(TestHelpers.Row.of(9, 8, null)));
+
+Evaluator longEvaluator = new Evaluator(STRUCT,
+in("x", Long.MAX_VALUE, Integer.MAX_VALUE, Long.MIN_VALUE));
+Assert.assertTrue("Integer.MAX_VALUE in [Integer.MAX_VALUE] => true",
+longEvaluator.eval(TestHelpers.Row.of(Integer.MAX_VALUE, 7.0, null)));
+Assert.assertFalse("6 in [Integer.MAX_VALUE]  => false",
+longEvaluator.eval(TestHelpers.Row.of(6, 6.8, null)));
+
+Evaluator integerEvaluator = new Evaluator(STRUCT, in("y", 7, 8, 9.1));
+Assert.assertTrue("7.0 in [7, 8, 9.1] => true", 
integerEvaluator.eval(TestHelpers.Row.of(7, 7.0, null)));
 
 Review comment:
   Done.
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402641
 
 

 ##
 File path: 
api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java
 ##
 @@ -125,13 +149,40 @@ public Expression bind(Types.StructType struct, boolean 
caseSensitive) {
 case LT_EQ:
 case EQ:
   return Expressions.alwaysFalse();
-//case IN:
-//  break;
-//case NOT_IN:
-//  break;
   }
 }
 return new BoundPredicate<>(op(), new BoundReference<>(field.fieldId(),
-schema.accessorForField(field.fieldId())), lit);
+schema.accessorForField(field.fieldId())), lit);
+  }
+
+  @SuppressWarnings("unchecked")
+  private Expression bindInOperation(Types.NestedField field, Schema schema) {
+final Set> lits = literals().stream().map(
+l -> {
+  Literal lit = l.to(field.type());
+  if (lit == null) {
+throw new ValidationException(String.format(
+"Invalid value for comparison inclusive type %s: %s (%s)",
+field.type(), l.value(), l.value().getClass().getName()));
+  }
+  return lit;
+})
+.filter(l -> l != Literals.aboveMax() && l != Literals.belowMin())
+.collect(Collectors.toSet());
+
+if (lits.isEmpty()) {
+  return Expressions.alwaysFalse();
+} else if (lits.size() == 1) {
+  return new BoundPredicate<>(Operation.EQ, new 
BoundReference<>(field.fieldId(),
+  schema.accessorForField(field.fieldId())), lits.iterator().next());
 
 Review comment:
   👌 


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402697
 
 

 ##
 File path: api/src/test/java/org/apache/iceberg/expressions/TestEvaluator.java
 ##
 @@ -362,4 +368,197 @@ public void testCharSeqValue() {
 Assert.assertFalse("string(abc) == utf8(abcd) => false",
 evaluator.eval(TestHelpers.Row.of(new Utf8("abcd";
   }
+
+  @Test
+  public void testIn() {
+Assert.assertEquals(3, in("s", 7, 8, 9).literals().size());
+Assert.assertEquals(3, in("s", 7, 8.1, Long.MAX_VALUE).literals().size());
+Assert.assertEquals(2, in("s", "abc", "abd", "abc").literals().size());
+Assert.assertEquals(0, in("s").literals().size());
+Assert.assertEquals(1, in("s", 5).literals().size());
+Assert.assertEquals(1, in("s", 5, 5).literals().size());
+Assert.assertEquals(1, in("s", Arrays.asList(5, 5)).literals().size());
+Assert.assertEquals(0, in("s", Collections.emptyList()).literals().size());
+
+Evaluator evaluator = new Evaluator(STRUCT, in("x", 7, 8, Long.MAX_VALUE));
+Assert.assertTrue("7 in [7, 8] => true", 
evaluator.eval(TestHelpers.Row.of(7, 8, null)));
+Assert.assertFalse("9 in [7, 8]  => false", 
evaluator.eval(TestHelpers.Row.of(9, 8, null)));
+
+Evaluator longEvaluator = new Evaluator(STRUCT,
 
 Review comment:
   Thanks and fixed.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402469
 
 

 ##
 File path: 
api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java
 ##
 @@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.expressions;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+import org.apache.iceberg.util.CharSequenceWrapper;
+
+public class BoundSetPredicate extends Predicate> {
+  private final LiteralSet literalSet;
+
+  BoundSetPredicate(Operation op, BoundReference ref, Set> lits) 
{
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a set of literals", op);
+this.literalSet = new LiteralSet<>(lits);
+  }
+
+  BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) {
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a literal set", op);
+this.literalSet = lits;
+  }
+
+  @Override
+  public Expression negate() {
+return new BoundSetPredicate<>(op().negate(), ref(), literalSet);
+  }
+
+  public Set literalSet() {
+return literalSet;
+  }
+
+  @Override
+  String literalString() {
+return literalSet.toString();
+  }
+
+  /**
+   * Represents a set of literal values in an IN or NOT_IN predicate
+   * @param  The Java type of the value, which can be wrapped by a {@link 
Literal}
+   */
+  private static class LiteralSet implements Set, Serializable {
+private final Set values;
+
+@SuppressWarnings("unchecked")
+LiteralSet(Set> lits) {
+  Preconditions.checkArgument(lits == null || lits.size() > 1,
+  "The input literal set must include more than 1 element.");
+  values = ImmutableSet.builder().addAll(
+  lits.stream().map(
+  lit -> {
+if (lit instanceof Literals.StringLiteral) {
+  return (T) 
CharSequenceWrapper.wrap(((Literals.StringLiteral) lit).value());
+} else {
+  return lit.value();
+}
+  }
+  ).iterator()).build();
+}
+
+@Override
+public String toString() {
+  return Joiner.on(", ").join(values);
+}
+
+@Override
+public boolean contains(Object object) {
+  if (object instanceof CharSequence) {
+return values.contains(CharSequenceWrapper.wrap((CharSequence) 
object));
+  }
+  return values.contains(object);
+}
+
+@Override
+public int size() {
+  return values.size();
+}
+
+@Override
+public boolean isEmpty() {
+  return values.isEmpty();
+}
+
+@Override
+@SuppressWarnings("unchecked")
+public Iterator iterator() {
+  return values.stream().map(
+  val -> {
+if (val instanceof CharSequenceWrapper) {
+  return (T) ((CharSequenceWrapper) val).get();
+} else {
+  return val;
+}
+  }).iterator();
+}
+
+@Override
+public boolean containsAll(Collection c) {
+  throw new UnsupportedOperationException(
+  "LiteralSet currently only supports checking if a single item is 
contained in it.");
+}
+
+@Override
+public Object[] toArray() {
+  throw new UnsupportedOperationException(
+  "Please use iterator() to visit the elements in the set.");
+}
+
+@Override
+public  X[] toArray(X[] a) {
+  throw new UnsupportedOperationException(
+  "Please use iterator() to visit the elements in the set.");
+}
+
+@Override
+public boolean add(T t) {
+  throw new UnsupportedOperationException(
+  "The set is immutable and cannot add an element.");
+}
+
+@Override
+public boolean remove(Object o) {
+  throw new UnsupportedOperationException(
+  "The set is immutable an

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402583
 
 

 ##
 File path: 
api/src/main/java/org/apache/iceberg/expressions/UnboundPredicate.java
 ##
 @@ -125,13 +149,40 @@ public Expression bind(Types.StructType struct, boolean 
caseSensitive) {
 case LT_EQ:
 case EQ:
   return Expressions.alwaysFalse();
-//case IN:
-//  break;
-//case NOT_IN:
-//  break;
   }
 }
 return new BoundPredicate<>(op(), new BoundReference<>(field.fieldId(),
-schema.accessorForField(field.fieldId())), lit);
+schema.accessorForField(field.fieldId())), lit);
 
 Review comment:
   Fixed.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402556
 
 

 ##
 File path: api/src/main/java/org/apache/iceberg/expressions/Predicate.java
 ##
 @@ -39,37 +37,35 @@ public R ref() {
 return ref;
   }
 
-  public Literal literal() {
-return literal;
-  }
+  abstract String literalString();
 
   @Override
   public String toString() {
-switch (op) {
+switch (op()) {
 
 Review comment:
   Done.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] jun-he commented on a change in pull request #357: Add in and not in predicates

2019-08-27 Thread GitBox

jun-he commented on a change in pull request #357: Add in and not in predicates
URL: https://github.com/apache/incubator-iceberg/pull/357#discussion_r318402380
 
 

 ##
 File path: 
api/src/main/java/org/apache/iceberg/expressions/BoundSetPredicate.java
 ##
 @@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.expressions;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Set;
+import org.apache.iceberg.util.CharSequenceWrapper;
+
+public class BoundSetPredicate extends Predicate> {
+  private final LiteralSet literalSet;
+
+  BoundSetPredicate(Operation op, BoundReference ref, Set> lits) 
{
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a set of literals", op);
+this.literalSet = new LiteralSet<>(lits);
+  }
+
+  BoundSetPredicate(Operation op, BoundReference ref, LiteralSet lits) {
+super(op, ref);
+Preconditions.checkArgument(op == Operation.IN || op == Operation.NOT_IN,
+"%s predicate does not support a literal set", op);
+this.literalSet = lits;
+  }
+
+  @Override
+  public Expression negate() {
+return new BoundSetPredicate<>(op().negate(), ref(), literalSet);
+  }
+
+  public Set literalSet() {
+return literalSet;
+  }
+
+  @Override
+  String literalString() {
+return literalSet.toString();
+  }
+
+  /**
+   * Represents a set of literal values in an IN or NOT_IN predicate
+   * @param  The Java type of the value, which can be wrapped by a {@link 
Literal}
+   */
+  private static class LiteralSet implements Set, Serializable {
 
 Review comment:
   👌 


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdsr commented on a change in pull request #421: Add friendly names to catalog tables

2019-08-27 Thread GitBox

rdsr commented on a change in pull request #421: Add friendly names to catalog 
tables
URL: https://github.com/apache/incubator-iceberg/pull/421#discussion_r318396277
 
 

 ##
 File path: hive/src/main/java/org/apache/iceberg/hive/HiveCatalog.java
 ##
 @@ -43,6 +43,11 @@ public HiveCatalog(Configuration conf) {
 this.conf = conf;
   }
 
+  @Override
+  protected String name() {
+return conf.get("hive.metastore.uris");
 
 Review comment:
   This is usually a collection of uris. So I think the catalogName would come 
out as `thrift://host1:port1/,thrift://host2:port2/` .  I don' think its a big 
deal. Just calling it out


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

chenjunjiedada commented on a change in pull request #374: Migrate spark table 
to iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318368167
 
 

 ##
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##
 @@ -297,5 +302,81 @@ object SparkTableUtil {
   )
 }
   }
+
+  private def buildManifest(table: Table,
+  sparkDataFiles: Seq[SparkDataFile],
+  partitionSpec: PartitionSpec): ManifestFile = {
+val outputFile = table.io
+  .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + 
UUID.randomUUID.toString))
 
 Review comment:
   Understood, I was still thinking serialize the manifest file...


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on issue #416: Adding docs on how to use custom catalog with 
iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#issuecomment-525518658
 
 
   Merged. Thanks @moulimukherjee!


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue merged pull request #416: Adding docs on how to use custom catalog with 
iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue opened a new pull request #421: Add friendly names to catalog tables

2019-08-27 Thread GitBox

rdblue opened a new pull request #421: Add friendly names to catalog tables
URL: https://github.com/apache/incubator-iceberg/pull/421
 
 
   This adds an abstract `name` method to `BaseMetastoreCatalog` and a new 
method to generate full table names from a catalog.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #386: Fix copies in partition data and field summaries

2019-08-27 Thread GitBox

rdblue merged pull request #386: Fix copies in partition data and field 
summaries
URL: https://github.com/apache/incubator-iceberg/pull/386
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

moulimukherjee commented on a change in pull request #416: Adding docs on how 
to use custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324317
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,139 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// When updating from a metadata file location, call the helper method
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+String oldMetadataLocation = base.location();
+
+// Write new metadata using helper method
+String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 
1);
+
+// Example custom service which updates the metadata location for the 
given db and table atomically
+CustomService.updateMetadataLocation(dbName, tableName, 
oldMetadataLocation, newMetadataLocation);
+
+  }
+
+  // The io method provides a FileIO which is used to read and write the table 
metadata files
+  @Override
+  public FileIO io() {
+if (fileIO == null) {
+  fileIO = new HadoopFileIO(conf);
+}
+return fileIO;
+  }
+
+  // Optional: this can be overridden to provide custom location provider 
implementation
+  @Override
+  public LocationProvider locationProvider() {
+// TODO
+  }
+}
+```
+
+### Custom table implementation
+Extend `BaseMetastoreCatalog` to provide default warehouse locations and 
instantiate `CustomTableOperations`
+
+Example:
+```java
+public class CustomCatalog extends BaseMetastoreCatalog {
+
+  private Configuration configuration;
+
+  public CustomCatalog(Configuration configuration) {
+this.configuration = configuration;
+  }
+
+  @Override
+  protected TableOperations newTableOps(TableIdentifier tableIdentifier) {
+String dbName = tableIdentifier.namespace().level(0);
+String tableName = tableIdentifier.name();
+// instantiate the CustomTableOperations
+return new CustomTableOperations(configuration, dbName, tableName);
+  }
+
+  @Override
+  protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) {
+
+// Can choose to use any other configuration name
+String tableLocation = 
configuration.get("custom.iceberg.warehouse.location");
+
+// Can be an s3 or hdfs path
+if (tableLocation == null) {
+  throw new RuntimeException("custom.iceberg.warehouse.location 
configuration not set!");
+}
+
+return String.format(
+"%s/%s.db/%s", tableLocation,
+tableIdentifier.namespace().levels()[0],
+tableIdentifier.name());
+  }
+
+  @Override
+  public boolean dropTable(TableIdentifier identifier, boolean purge) {
+// Example service to delete table
+CustomService.deleteTable(identifier.namepsace().level(0), 
identifier.name());
+  }
+
+  @Override
+  public void renameTable(TableIdentifier from, TableIdentifier to) {
+// TODO implement behavior  
+throw new RuntimeException("Not yet implemented");
 
 Review comment:
   Ack 👍 


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324221
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,139 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// When updating from a metadata file location, call the helper method
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+String oldMetadataLocation = base.location();
+
+// Write new metadata using helper method
+String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 
1);
+
+// Example custom service which updates the metadata location for the 
given db and table atomically
+CustomService.updateMetadataLocation(dbName, tableName, 
oldMetadataLocation, newMetadataLocation);
+
+  }
+
+  // The io method provides a FileIO which is used to read and write the table 
metadata files
+  @Override
+  public FileIO io() {
+if (fileIO == null) {
+  fileIO = new HadoopFileIO(conf);
+}
+return fileIO;
+  }
+
+  // Optional: this can be overridden to provide custom location provider 
implementation
+  @Override
+  public LocationProvider locationProvider() {
 
 Review comment:
   Let's remove this for now. We can add it back later if we want to show this 
example.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318324025
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,139 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// When updating from a metadata file location, call the helper method
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+String oldMetadataLocation = base.location();
+
+// Write new metadata using helper method
+String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 
1);
+
+// Example custom service which updates the metadata location for the 
given db and table atomically
+CustomService.updateMetadataLocation(dbName, tableName, 
oldMetadataLocation, newMetadataLocation);
+
+  }
+
+  // The io method provides a FileIO which is used to read and write the table 
metadata files
+  @Override
+  public FileIO io() {
+if (fileIO == null) {
+  fileIO = new HadoopFileIO(conf);
+}
+return fileIO;
+  }
+
+  // Optional: this can be overridden to provide custom location provider 
implementation
+  @Override
+  public LocationProvider locationProvider() {
+// TODO
+  }
+}
+```
+
+### Custom table implementation
+Extend `BaseMetastoreCatalog` to provide default warehouse locations and 
instantiate `CustomTableOperations`
+
+Example:
+```java
+public class CustomCatalog extends BaseMetastoreCatalog {
+
+  private Configuration configuration;
+
+  public CustomCatalog(Configuration configuration) {
+this.configuration = configuration;
+  }
+
+  @Override
+  protected TableOperations newTableOps(TableIdentifier tableIdentifier) {
+String dbName = tableIdentifier.namespace().level(0);
+String tableName = tableIdentifier.name();
+// instantiate the CustomTableOperations
+return new CustomTableOperations(configuration, dbName, tableName);
+  }
+
+  @Override
+  protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) {
+
+// Can choose to use any other configuration name
+String tableLocation = 
configuration.get("custom.iceberg.warehouse.location");
+
+// Can be an s3 or hdfs path
+if (tableLocation == null) {
+  throw new RuntimeException("custom.iceberg.warehouse.location 
configuration not set!");
+}
+
+return String.format(
+"%s/%s.db/%s", tableLocation,
+tableIdentifier.namespace().levels()[0],
+tableIdentifier.name());
+  }
+
+  @Override
+  public boolean dropTable(TableIdentifier identifier, boolean purge) {
+// Example service to delete table
+CustomService.deleteTable(identifier.namepsace().level(0), 
identifier.name());
+  }
+
+  @Override
+  public void renameTable(TableIdentifier from, TableIdentifier to) {
+// TODO implement behavior  
+throw new RuntimeException("Not yet implemented");
 
 Review comment:
   As above, could this be a `CustomService` call?
   
   ```java
   Preconditions.checkArgument(
   from.namespace().level(0).equals(to.namespace().level(0)),
   "Cannot move table between databases");
   CustomService.renameTable(from.namepsace().level(0), from.name(), to.name());
   ```


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache

[GitHub] [incubator-iceberg] rdblue commented on issue #420: Add commit.manifest-merge.enabled table property

2019-08-27 Thread GitBox

rdblue commented on issue #420: Add commit.manifest-merge.enabled table property
URL: https://github.com/apache/incubator-iceberg/pull/420#issuecomment-525503667
 
 
   @bryanck, here's the PR to disable manifest merging.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue opened a new pull request #420: Add commit.manifest-merge.enabled table property

2019-08-27 Thread GitBox

rdblue opened a new pull request #420: Add commit.manifest-merge.enabled table 
property
URL: https://github.com/apache/incubator-iceberg/pull/420
 
 
   This property can disable manifest merging. This is intended to be used with 
`RewriteManifests`. Instead of automatic merging, `RewriteManifests` is used to 
maintain metadata.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #419: Adding common checks in commit()

2019-08-27 Thread GitBox

rdblue commented on issue #419: Adding common checks in commit()
URL: https://github.com/apache/incubator-iceberg/pull/419#issuecomment-525498398
 
 
   Merged. Thanks @moulimukherjee!


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #419: Adding common checks in commit()

2019-08-27 Thread GitBox

rdblue merged pull request #419: Adding common checks in commit()
URL: https://github.com/apache/incubator-iceberg/pull/419
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #419: Adding common checks in commit()

2019-08-27 Thread GitBox

rdblue commented on issue #419: Adding common checks in commit()
URL: https://github.com/apache/incubator-iceberg/pull/419#issuecomment-525481988
 
 
   Looks good to me. I'll merge once tests are passing.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

moulimukherjee commented on a change in pull request #416: Adding docs on how 
to use custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318295030
 
 

 ##
 File path: site/mkdocs.yml
 ##
 @@ -52,6 +52,7 @@ nav:
 - Quickstart: api-quickstart.md
 - Spark: spark.md
 - Presto: presto.md
+- Custom Catalog: custom-catalog.md
 
 Review comment:
   Done


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

moulimukherjee commented on a change in pull request #416: Adding docs on how 
to use custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318291645
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+// if the metadata is already out of date, reject it
+if (base != current()) {
 
 Review comment:
   Added it at https://github.com/apache/incubator-iceberg/pull/419, will 
remove it from here.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] moulimukherjee opened a new pull request #419: Adding common checks in commit()

2019-08-27 Thread GitBox

moulimukherjee opened a new pull request #419: Adding common checks in commit()
URL: https://github.com/apache/incubator-iceberg/pull/419
 
 
   Adding common checks in commit()
   
   From conversation at 
https://github.com/apache/incubator-iceberg/pull/416/files#r318243051
   
   r? @rdblue @aokolnychyi 


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] moulimukherjee commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

moulimukherjee commented on a change in pull request #416: Adding docs on how 
to use custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318268122
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+// if the metadata is already out of date, reject it
+if (base != current()) {
+  throw new CommitFailedException("Cannot commit: stale table metadata for 
%s.%s", dbName, tableName);
+}
+
+// if the metadata is not changed, return early
+if (base == metadata) {
+  return;
+}
+
+String oldMetadataLocation = base.location();
+
+// Write new metadata
+String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 
1);
+
+// Example custom service which updates the metadata location for the 
given db and table atomically
+CustomService.updateMetadataLocation(dbName, tableName, 
oldMetadataLocation, newMetadataLocation);
+
+  }
+
+  // The io method provides a FileIO which is used to read and write the table 
metadata files
+  @Override
+  public FileIO io() {
+if (fileIO == null) {
+  fileIO = new HadoopFileIO(conf);
+}
+return fileIO;
+  }
+
+  // Optional: this can be overridden to provide custom location provider 
implementation
+  @Override
+  public LocationProvider locationProvider() {
+// TODO
+  }
+}
+```
+
+### Custom table implementation
+Extend `BaseMetastoreCatalog` to provide default warehouse locations and 
instantiate `CustomTableOperations`
+
+Example:
+```java
+public class CustomCatalog extends BaseMetastoreCatalog {
+
+  private Configuration configuration;
+
+  public CustomCatalog(Configuration configuration) {
+this.configuration = configuration;
+  }
+
+  @Override
+  protected TableOperations newTableOps(TableIdentifier tableIdentifier) {
+String dbName = tableIdentifier.namespace().level(0);
+String tableName = tableIdentifier.name();
+// instantiate the CustomTableOperations
+return new CustomTableOperations(configuration, dbName, tableName);
+  }
+
+  @Override
+  protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) {
+
+// Can choose to use any other configuration name
+String tableLocation = 
configuration.get("custom.iceberg.warehouse.location");
+
+// Can be an s3 or hdfs path
+if (tableLocation == null) {
+  throw new RuntimeException("custom.iceberg.warehouse.location 
configuration not set!");
+}
+
+return String.format(
+"%s/%s.db/%s", tableLocation,
+tableIdentifier.namespace().levels()[0],
+tableIdentifier.name());
+  }
+
+  @Override
+  public boolean dropTable(TableIdentifier identifier, boolean purge) {
+// TODO implement behavior  
+throw new RuntimeException("Not yet implemented");
 
 Review comment:
   Ack


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #349: Remove unreferenced hash computations

2019-08-27 Thread GitBox

rdblue commented on issue #349: Remove unreferenced hash computations
URL: https://github.com/apache/incubator-iceberg/pull/349#issuecomment-525455644
 
 
   +1
   
   Thanks for fixing this @jbapple!


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #349: Remove unreferenced hash computations

2019-08-27 Thread GitBox

rdblue merged pull request #349: Remove unreferenced hash computations
URL: https://github.com/apache/incubator-iceberg/pull/349
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue edited a comment on issue #280: Add persistent IDs to partition fields

2019-08-27 Thread GitBox

rdblue edited a comment on issue #280: Add persistent IDs to partition fields
URL: 
https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525451795
 
 
   > And also as TableMetadata knows how many fields are in partition, so can 
maintain the nextIDValue as well.
   
   The next partition field ID is the highest field ID in all of the table's 
partition specs +1. Once a partition spec is removed, we can reuse the ID. 
Alternatively, we can keep track of the last assigned ID, like we do for the 
table schema.
   
   > Also the TableMetadata#updatePartitionSpec should also use nextIDValue to 
pass to PartitionSpec.
   
   I think the spec's IDs will be assigned by the time that method is called 
because the partition spec passed in is already created.
   
   > Does modifying/dropping columns also needs to be taken care, I believe not?
   
   No. `updateSchema` already checks compatibility: 
https://github.com/apache/incubator-iceberg/blob/master/core/src/main/java/org/apache/iceberg/TableMetadata.java#L284
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #280: Add persistent IDs to partition fields

2019-08-27 Thread GitBox

rdblue commented on issue #280: Add persistent IDs to partition fields
URL: 
https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525451795
 
 
   > And also as TableMetadata knows how many fields are in partition, so can 
maintain the nextIDValue as well.
   
   The next partition field ID is the highest field ID in all of the table's 
partition specs +1. Once a partition spec is removed, we can reuse the ID. 
Alternatively, we can keep track of the last assigned ID, like we do for the 
table schema.
   
   > Also the TableMetadata#updatePartitionSpec should also use nextIDValue to 
pass to PartitionSpec.
   
   I think the spec's IDs will be assigned by the time that method is called 
because the partition spec passed in is already created.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #374: Migrate spark table to 
iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318254975
 
 

 ##
 File path: 
spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java
 ##
 @@ -116,4 +123,28 @@ public void testPartitionScanByFilter() {
 Dataset partitionDF = SparkTableUtil.partitionDFByFilter(spark, 
qualifiedTableName, "data = 'a'");
 Assert.assertEquals("There should be 1 matching partition", 1, 
partitionDF.count());
   }
+
+  @Test
+  public void testImportPartitionedTable() throws Exception {
 
 Review comment:
   Can you run these tests to create Hive catalog tables as well?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #374: Migrate spark table to 
iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318254975
 
 

 ##
 File path: 
spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTableUtil.java
 ##
 @@ -116,4 +123,28 @@ public void testPartitionScanByFilter() {
 Dataset partitionDF = SparkTableUtil.partitionDFByFilter(spark, 
qualifiedTableName, "data = 'a'");
 Assert.assertEquals("There should be 1 matching partition", 1, 
partitionDF.count());
   }
+
+  @Test
+  public void testImportPartitionedTable() throws Exception {
 
 Review comment:
   Can you run these tests with Hive tables as well?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318252717
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+// if the metadata is already out of date, reject it
+if (base != current()) {
+  throw new CommitFailedException("Cannot commit: stale table metadata for 
%s.%s", dbName, tableName);
+}
+
+// if the metadata is not changed, return early
+if (base == metadata) {
+  return;
+}
+
+String oldMetadataLocation = base.location();
+
+// Write new metadata
+String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 
1);
+
+// Example custom service which updates the metadata location for the 
given db and table atomically
+CustomService.updateMetadataLocation(dbName, tableName, 
oldMetadataLocation, newMetadataLocation);
+
+  }
+
+  // The io method provides a FileIO which is used to read and write the table 
metadata files
+  @Override
+  public FileIO io() {
+if (fileIO == null) {
+  fileIO = new HadoopFileIO(conf);
+}
+return fileIO;
+  }
+
+  // Optional: this can be overridden to provide custom location provider 
implementation
+  @Override
+  public LocationProvider locationProvider() {
+// TODO
+  }
+}
+```
+
+### Custom table implementation
+Extend `BaseMetastoreCatalog` to provide default warehouse locations and 
instantiate `CustomTableOperations`
+
+Example:
+```java
+public class CustomCatalog extends BaseMetastoreCatalog {
+
+  private Configuration configuration;
+
+  public CustomCatalog(Configuration configuration) {
+this.configuration = configuration;
+  }
+
+  @Override
+  protected TableOperations newTableOps(TableIdentifier tableIdentifier) {
+String dbName = tableIdentifier.namespace().level(0);
+String tableName = tableIdentifier.name();
+// instantiate the CustomTableOperations
+return new CustomTableOperations(configuration, dbName, tableName);
+  }
+
+  @Override
+  protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) {
+
+// Can choose to use any other configuration name
+String tableLocation = 
configuration.get("custom.iceberg.warehouse.location");
+
+// Can be an s3 or hdfs path
+if (tableLocation == null) {
+  throw new RuntimeException("custom.iceberg.warehouse.location 
configuration not set!");
+}
+
+return String.format(
+"%s/%s.db/%s", tableLocation,
+tableIdentifier.namespace().levels()[0],
+tableIdentifier.name());
+  }
+
+  @Override
+  public boolean dropTable(TableIdentifier identifier, boolean purge) {
+// TODO implement behavior  
+throw new RuntimeException("Not yet implemented");
 
 Review comment:
   Maybe `CustomService.deleteTable(identifier.namepsace().level(0), 
identifier.name())`?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additiona

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318244402
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
 
 Review comment:
   How about "when updating from a metadata file location, call the helper 
method"


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243982
 
 

 ##
 File path: site/mkdocs.yml
 ##
 @@ -52,6 +52,7 @@ nav:
 - Quickstart: api-quickstart.md
 - Spark: spark.md
 - Presto: presto.md
+- Custom Catalog: custom-catalog.md
 
 Review comment:
   I'd probably move this to the Java drop-down since it shows how to implement 
a catalog in Java.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243051
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+// if the metadata is already out of date, reject it
+if (base != current()) {
 
 Review comment:
   Good point, it does. Same with the base == metadata check.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #416: Adding docs on how to use custom catalog with iceberg

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #416: Adding docs on how to use 
custom catalog with iceberg
URL: https://github.com/apache/incubator-iceberg/pull/416#discussion_r318243051
 
 

 ##
 File path: site/docs/custom-catalog.md
 ##
 @@ -0,0 +1,149 @@
+# Custom Catalog Implementation
+
+It's possible to read an iceberg table either from an hdfs path or from a hive 
table. It's also possible to use a custom metastore in place of hive. The steps 
to do that are as follows.
+
+- [Custom TableOperations](#custom-table-operations-implementation)
+- [Custom Catalog](#custom-table-implementation)
+- [Custom IcebergSource](#custom-icebergsource)
+
+### Custom table operations implementation
+Extend `BaseMetastoreTableOperations` to provide implementation on how to read 
and write metadata
+
+Example:
+```java
+class CustomTableOperations extends BaseMetastoreTableOperations {
+  private String dbName;
+  private String tableName;
+  private Configuration conf;
+  private FileIO fileIO;
+
+  protected CustomTableOperations(Configuration conf, String dbName, String 
tableName) {
+this.conf = conf;
+this.dbName = dbName;
+this.tableName = tableName;
+  }
+
+  // The doRefresh method should provide implementation on how to get the 
metadata location
+  @Override
+  public void doRefresh() {
+
+// Example custom service which returns the metadata location given a 
dbName and tableName
+String metadataLocation = CustomService.getMetadataForTable(conf, dbName, 
tableName);
+
+// Use existing method to refresh metadata
+refreshFromMetadataLocation(metadataLocation);
+
+  }
+
+  // The doCommit method should provide implementation on how to update with 
metadata location atomically
+  @Override
+  public void doCommit(TableMetadata base, TableMetadata metadata) {
+// if the metadata is already out of date, reject it
+if (base != current()) {
 
 Review comment:
   Good point, it does. Same with the base == current() check.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] manishmalhotrawork commented on issue #280: Add persistent IDs to partition fields

2019-08-27 Thread GitBox

manishmalhotrawork commented on issue #280: Add persistent IDs to partition 
fields
URL: 
https://github.com/apache/incubator-iceberg/issues/280#issuecomment-525415789
 
 
   @rdblue thanks.
   
   Let me take a stab on this, may be as you mentioned in description.
   Passing initial value for the partitionId to PartitionSpec from 
TableMetadata. And also as TableMetadata knows how many fields are in 
partition, so can maintain the nextIDValue as well.
   So, that when `PartitionSpec.partitionType` will be called, it will use the 
passed value as first ID and then increment.
   
   Also the 
[TableMetadata#updatePartitionSpec](https://github.com/apache/incubator-iceberg/blob/master/core/src/main/java/org/apache/iceberg/TableMetadata.java#L294)
 should also use nextIDValue to pass to PartitionSpec.
   
   Does modifying/dropping columns also needs to be taken care, I believe not?
   
   yeah, for my understanding, a sample partitioned table manifest file's 
schema has this entry for partition. Where field-id is 1000+.
   
   ```  "name" : "partition",
   "type" : {
 "type" : "record",
 "name" : "r102",
 "fields" : [ {
   "name" : "order_status",
   "type" : [ "null", "string" ],
   "default" : null,
   "field-id" : 1000
 }, {
   "name" : "ship_priority",
   "type" : [ "null", "int" ],
   "default" : null,
   "field-id" : 1001
 }, {
   "name" : "order_key_bucket",
   "type" : [ "null", "int" ],
   "default" : null,
   "field-id" : 1002
 } ]```


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #411: Ignore unsupported partition transforms for forward compatibility

2019-08-27 Thread GitBox

rdblue commented on issue #411: Ignore unsupported partition transforms for 
forward compatibility
URL: https://github.com/apache/incubator-iceberg/pull/411#issuecomment-525412584
 
 
   @aokolnychyi, can you review this?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #374: Migrate spark table to 
iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318200212
 
 

 ##
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##
 @@ -297,5 +302,81 @@ object SparkTableUtil {
   )
 }
   }
+
+  private def buildManifest(table: Table,
+  sparkDataFiles: Seq[SparkDataFile],
+  partitionSpec: PartitionSpec): ManifestFile = {
+val outputFile = table.io
+  .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + 
UUID.randomUUID.toString))
 
 Review comment:
   This can't be local because it is written on executors and read from the 
driver. If those are on different machines, the driver won't be able to find 
the location.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #388: Handle rollback in snapshot expiration

2019-08-27 Thread GitBox

rdblue merged pull request #388: Handle rollback in snapshot expiration
URL: https://github.com/apache/incubator-iceberg/pull/388
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on issue #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue commented on issue #398: Push down StringStartsWith in Spark 
IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#issuecomment-525398380
 
 
   +1
   
   There were a couple of minor points, but I think we can fix those in a 
follow-up.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue merged pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue merged pull request #398: Push down StringStartsWith in Spark 
IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue closed issue #397: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue closed issue #397: Push down StringStartsWith in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/issues/397
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #398: Push down StringStartsWith 
in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318197341
 
 

 ##
 File path: 
parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java
 ##
 @@ -231,6 +236,41 @@ public void testRequiredColumn() {
 Assert.assertFalse("Should skip: required columns are always non-null", 
shouldRead);
   }
 
+  @Test
+  public void testStartsWith() {
+boolean shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("non_dict", "re"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertTrue("Should read: no dictionary", shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("required", "re"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertTrue("Should read: dictionary contains a matching entry", 
shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("required", "req"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertTrue("Should read: dictionary contains a matching entry", 
shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("some_nulls", "so"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertTrue("Should read: dictionary contains a matching entry", 
shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("no_stats", UUID.randomUUID().toString()))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertFalse("Should skip: no stats but dictionary is present", 
shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("required", "reqs"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertFalse("Should skip: no match in dictionary", shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("some_nulls", "somex"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertFalse("Should skip: no match in dictionary", shouldRead);
+
+shouldRead = new ParquetDictionaryRowGroupFilter(SCHEMA, 
startsWith("no_nulls", "xxx"))
+.shouldRead(PARQUET_SCHEMA, ROW_GROUP_METADATA, DICTIONARY_STORE);
+Assert.assertFalse("Should skip: no match in dictionary", shouldRead);
 
 Review comment:
   Can you add tests for `all_nulls` and `not_in_file` columns?


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #398: Push down StringStartsWith 
in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318196009
 
 

 ##
 File path: 
parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java
 ##
 @@ -339,6 +341,47 @@ public Boolean or(Boolean leftResult, Boolean 
rightResult) {
   return ROWS_MIGHT_MATCH;
 }
 
+@Override
+@SuppressWarnings("unchecked")
+public  Boolean startsWith(BoundReference ref, Literal lit) {
+  int id = ref.fieldId();
+
+  Long valueCount = valueCounts.get(id);
+  if (valueCount == null) {
+// the column is not present and is all nulls
+return ROWS_CANNOT_MATCH;
+  }
+
+  Statistics colStats = (Statistics) stats.get(id);
+  if (colStats != null && !colStats.isEmpty()) {
+if (!colStats.hasNonNullValue()) {
+  return ROWS_CANNOT_MATCH;
+}
+
+Binary prefixAsBinary = 
Binary.fromConstantByteBuffer(lit.toByteBuffer());
+
+PrimitiveComparator comparator = 
PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR;
+
+Binary lower = colStats.genericGetMin();
+// truncate lower bound so that its length in bytes is not greater 
than the length of prefix
+int lowerLength = Math.min(prefixAsBinary.length(), lower.length());
+int lowerCmp = comparator.compare(lower.slice(0, lowerLength), 
prefixAsBinary);
 
 Review comment:
   I just looked at Parquet and I think it is going to be cheaper to use 
`toByteBuffer().slice(...)` instead of `slice(...)` because the slice 
implementation for most of the `Binary` implementations calls 
`getBytesUnsafe()` that returns a `byte[]` and usually copies data.
   
   Another minor point is that I think it would be better to use Iceberg's 
comparators instead of Parquet's comparators to ensure we have the same 
behavior everywhere. So I think this should be updated to convert the binary to 
a byte buffer, slice that, and then compare the two using Iceberg's 
`Comparators.unsignedBytes()`.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #398: Push down StringStartsWith 
in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318191642
 
 

 ##
 File path: 
api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
 ##
 @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() {
   public void testCaseSensitiveIntegerNotEqRewritten() {
 boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 
5)), true).eval(FILE);
   }
+
+  @Test
+  public void testStringStartsWith() {
+boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, 
startsWith("required", "a"), true).eval(FILE);
+Assert.assertTrue("Should read: no stats", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"a"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aaa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"1s"), true).eval(FILE_3);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"1str1x"), true).eval(FILE_3);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"ff"), true).eval(FILE_4);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aB"), true).eval(FILE_2);
 
 Review comment:
   Okay, that works for me. You might add a comment to show your intent in the 
future, but it's not a big deal here.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] rdblue commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

rdblue commented on a change in pull request #398: Push down StringStartsWith 
in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318178750
 
 

 ##
 File path: 
api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
 ##
 @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() {
   public void testCaseSensitiveIntegerNotEqRewritten() {
 boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 
5)), true).eval(FILE);
   }
+
+  @Test
+  public void testStringStartsWith() {
+boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, 
startsWith("required", "a"), true).eval(FILE);
+Assert.assertTrue("Should read: no stats", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"a"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aaa"), true).eval(FILE_2);
 
 Review comment:
   You're right, I think that behavior is correct.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

chenjunjiedada commented on a change in pull request #374: Migrate spark table 
to iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318176144
 
 

 ##
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##
 @@ -297,5 +302,81 @@ object SparkTableUtil {
   )
 }
   }
+
+  private def buildManifest(table: Table,
+  sparkDataFiles: Seq[SparkDataFile],
+  partitionSpec: PartitionSpec): ManifestFile = {
+val outputFile = table.io
+  .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + 
UUID.randomUUID.toString))
+val writer = ManifestWriter.write(partitionSpec, outputFile)
+try {
+  sparkDataFiles.foreach { file =>
+writer.add(file.toDataFile(partitionSpec))
+  }
+} finally {
+  writer.close()
+}
+
+writer.toManifestFile
+  }
+
+  /**
+   * Import a spark table to a iceberg table.
+   *
+   * The import uses the spark session to get table metadata. It assumes no
+   * operation is going on original table and target table and thus is not
+   * thread-safe.
+   *
+   * @param source the database name of the table to be import
+   * @param location the location used to store table metadata
+   *
+   * @return table the imported table
+   */
+  def importSparkTable(source: TableIdentifier, location: String): Table = {
+val sparkSession = SparkSession.builder().getOrCreate()
+import sparkSession.sqlContext.implicits._
+
+val dbName = source.database.getOrElse("default")
+val tableName = source.table
+
+if (!sparkSession.catalog.tableExists(dbName, tableName)) {
+  throw new NoSuchTableException(s"Table $dbName.$tableName does not 
exist")
+}
+
+val partitionSpec = SparkSchemaUtil.specForTable(sparkSession, 
s"$dbName.$tableName")
+val conf = sparkSession.sparkContext.hadoopConfiguration
+val tables = new HadoopTables(conf)
+val schema = SparkSchemaUtil.schemaForTable(sparkSession, 
s"$dbName.$tableName")
+val table = tables.create(schema, partitionSpec, ImmutableMap.of(), 
location)
+val appender = table.newAppend()
+
+if (partitionSpec == PartitionSpec.unpartitioned) {
+  val tableMetadata = 
sparkSession.sessionState.catalog.getTableMetadata(source)
+  val format = tableMetadata.provider.getOrElse("none")
+
+  if (format != "avro" && format != "parquet" && format != "orc") {
+throw new UnsupportedOperationException(s"Unsupported format: $format")
+  }
+  listPartition(Map.empty[String, String], tableMetadata.location.toString,
+format).foreach{f => 
appender.appendFile(f.toDataFile(PartitionSpec.unpartitioned))}
+  appender.commit()
+} else {
+  val partitions = partitionDF(sparkSession, s"$dbName.$tableName")
+  partitions.flatMap { row =>
+listPartition(row.getMap[String, String](0).toMap, row.getString(1), 
row.getString(2))
+  }.coalesce(1).mapPartitions {
 
 Review comment:
   That's a temporary use. I left comments to discuss a way to serialize 
manifest info 
[here](https://github.com/apache/incubator-iceberg/pull/374#issuecomment-522904779).
 Looks like the case class is a good way to do that, I will copy this.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

chenjunjiedada commented on a change in pull request #374: Migrate spark table 
to iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318174610
 
 

 ##
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##
 @@ -297,5 +302,81 @@ object SparkTableUtil {
   )
 }
   }
+
+  private def buildManifest(table: Table,
+  sparkDataFiles: Seq[SparkDataFile],
+  partitionSpec: PartitionSpec): ManifestFile = {
+val outputFile = table.io
+  .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + 
UUID.randomUUID.toString))
+val writer = ManifestWriter.write(partitionSpec, outputFile)
+try {
+  sparkDataFiles.foreach { file =>
+writer.add(file.toDataFile(partitionSpec))
+  }
+} finally {
+  writer.close()
+}
+
+writer.toManifestFile
+  }
+
+  /**
+   * Import a spark table to a iceberg table.
+   *
+   * The import uses the spark session to get table metadata. It assumes no
+   * operation is going on original table and target table and thus is not
+   * thread-safe.
+   *
+   * @param source the database name of the table to be import
+   * @param location the location used to store table metadata
+   *
+   * @return table the imported table
+   */
+  def importSparkTable(source: TableIdentifier, location: String): Table = {
+val sparkSession = SparkSession.builder().getOrCreate()
+import sparkSession.sqlContext.implicits._
+
+val dbName = source.database.getOrElse("default")
+val tableName = source.table
+
+if (!sparkSession.catalog.tableExists(dbName, tableName)) {
+  throw new NoSuchTableException(s"Table $dbName.$tableName does not 
exist")
+}
+
+val partitionSpec = SparkSchemaUtil.specForTable(sparkSession, 
s"$dbName.$tableName")
+val conf = sparkSession.sparkContext.hadoopConfiguration
+val tables = new HadoopTables(conf)
+val schema = SparkSchemaUtil.schemaForTable(sparkSession, 
s"$dbName.$tableName")
+val table = tables.create(schema, partitionSpec, ImmutableMap.of(), 
location)
+val appender = table.newAppend()
+
+if (partitionSpec == PartitionSpec.unpartitioned) {
+  val tableMetadata = 
sparkSession.sessionState.catalog.getTableMetadata(source)
+  val format = tableMetadata.provider.getOrElse("none")
+
+  if (format != "avro" && format != "parquet" && format != "orc") {
 
 Review comment:
   ok, will update.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] chenjunjiedada commented on a change in pull request #374: Migrate spark table to iceberg table

2019-08-27 Thread GitBox

chenjunjiedada commented on a change in pull request #374: Migrate spark table 
to iceberg table
URL: https://github.com/apache/incubator-iceberg/pull/374#discussion_r318171088
 
 

 ##
 File path: spark/src/main/scala/org/apache/iceberg/spark/SparkTableUtil.scala
 ##
 @@ -297,5 +302,81 @@ object SparkTableUtil {
   )
 }
   }
+
+  private def buildManifest(table: Table,
+  sparkDataFiles: Seq[SparkDataFile],
+  partitionSpec: PartitionSpec): ManifestFile = {
+val outputFile = table.io
+  .newOutputFile(FileFormat.AVRO.addExtension("/tmp/" + 
UUID.randomUUID.toString))
 
 Review comment:
   Agreed. Just want to confirm why it can't be the local file? It is just a 
temporary file, I wanted to store it in the tmpfs of localhost to save some IO.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi commented on issue #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi commented on issue #398: Push down StringStartsWith in Spark 
IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#issuecomment-525331795
 
 
   I extended the scope of this PR to also cover filtering of manifests.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi closed pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi closed pull request #398: Push down StringStartsWith in Spark 
IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398
 
 
   


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi opened a new pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi opened a new pull request #398: Push down StringStartsWith in Spark 
IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398
 
 
   This PR adds support for pushdown of `StringStartsWith` predicates in Spark 
and resolves #397.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi commented on a change in pull request #398: Push down 
StringStartsWith in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r318107696
 
 

 ##
 File path: spark/src/main/java/org/apache/iceberg/spark/SparkFilters.java
 ##
 @@ -153,6 +156,11 @@ public static Expression convert(Filter filter) {
   }
   return null;
 }
+
+case STARTS_WITH: {
+  StringStartsWith stringStartsWith = (StringStartsWith) filter;
+  return startsWith(stringStartsWith.attribute(), 
stringStartsWith.value());
+}
 
 Review comment:
   I've extended `TestFilteredScan` to verify this logic.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi commented on a change in pull request #398: Push down 
StringStartsWith in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r317970914
 
 

 ##
 File path: 
api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
 ##
 @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() {
   public void testCaseSensitiveIntegerNotEqRewritten() {
 boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 
5)), true).eval(FILE);
   }
+
+  @Test
+  public void testStringStartsWith() {
+boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, 
startsWith("required", "a"), true).eval(FILE);
+Assert.assertTrue("Should read: no stats", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"a"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aaa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"1s"), true).eval(FILE_3);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"1str1x"), true).eval(FILE_3);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"ff"), true).eval(FILE_4);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aB"), true).eval(FILE_2);
 
 Review comment:
   I just wanted to test case sensitivity as well. I can remove this as other 
cases are covered by remaining tests.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

[GitHub] [incubator-iceberg] aokolnychyi commented on a change in pull request #398: Push down StringStartsWith in Spark IcebergSource

2019-08-27 Thread GitBox

aokolnychyi commented on a change in pull request #398: Push down 
StringStartsWith in Spark IcebergSource
URL: https://github.com/apache/incubator-iceberg/pull/398#discussion_r317964972
 
 

 ##
 File path: 
api/src/test/java/org/apache/iceberg/expressions/TestInclusiveMetricsEvaluator.java
 ##
 @@ -355,4 +390,40 @@ public void testCaseInsensitiveIntegerNotEqRewritten() {
   public void testCaseSensitiveIntegerNotEqRewritten() {
 boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, not(equal("ID", 
5)), true).eval(FILE);
   }
+
+  @Test
+  public void testStringStartsWith() {
+boolean shouldRead = new InclusiveMetricsEvaluator(SCHEMA, 
startsWith("required", "a"), true).eval(FILE);
+Assert.assertTrue("Should read: no stats", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"a"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aa"), true).eval(FILE_2);
+Assert.assertTrue("Should read: range matches", shouldRead);
+
+shouldRead = new InclusiveMetricsEvaluator(SCHEMA, startsWith("required", 
"aaa"), true).eval(FILE_2);
 
 Review comment:
   I'll add one. Actually, we don't truncate the prefix. For example, if the 
upper bound is `3str3` and our prefix is `3str3abc`, then we cannot match, 
right? Seems safe to me.


This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

-
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

60 matches

Mail list logo