ORC-245. Create framework for data masking. Fixes #174
Signed-off-by: Owen O'Malley <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/orc/repo Commit: http://git-wip-us.apache.org/repos/asf/orc/commit/d5a6c49b Tree: http://git-wip-us.apache.org/repos/asf/orc/tree/d5a6c49b Diff: http://git-wip-us.apache.org/repos/asf/orc/diff/d5a6c49b Branch: refs/heads/master Commit: d5a6c49bf994ee9b07bad699fdcb2af8b7dd4afc Parents: aae40f1 Author: Owen O'Malley <[email protected]> Authored: Tue Oct 10 14:58:33 2017 -0700 Committer: Owen O'Malley <[email protected]> Committed: Wed Oct 18 15:15:14 2017 -0700 ---------------------------------------------------------------------- java/core/pom.xml | 5 + java/core/src/java/org/apache/orc/DataMask.java | 124 +++ .../apache/orc/impl/mask/DecimalIdentity.java | 55 ++ .../apache/orc/impl/mask/DoubleIdentity.java | 50 ++ .../org/apache/orc/impl/mask/ListIdentity.java | 68 ++ .../org/apache/orc/impl/mask/LongIdentity.java | 50 ++ .../org/apache/orc/impl/mask/MapIdentity.java | 75 ++ .../org/apache/orc/impl/mask/MaskFactory.java | 101 +++ .../org/apache/orc/impl/mask/MaskProvider.java | 39 + .../org/apache/orc/impl/mask/NullifyMask.java | 35 + .../apache/orc/impl/mask/RedactMaskFactory.java | 857 +++++++++++++++++++ .../apache/orc/impl/mask/StructIdentity.java | 74 ++ .../apache/orc/impl/mask/TimestampIdentity.java | 52 ++ .../org/apache/orc/impl/mask/UnionIdentity.java | 66 ++ .../services/org.apache.orc.DataMask$Provider | 15 + .../org/apache/orc/impl/mask/TestDataMask.java | 177 ++++ .../apache/orc/impl/mask/TestRedactMask.java | 247 ++++++ 17 files changed, 2090 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/pom.xml ---------------------------------------------------------------------- diff --git a/java/core/pom.xml b/java/core/pom.xml index 32b2c15..e2973eb 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -95,6 +95,11 @@ <build> 
<sourceDirectory>${basedir}/src/java</sourceDirectory> <testSourceDirectory>${basedir}/src/test</testSourceDirectory> + <resources> + <resource> + <directory>${basedir}/src/resources</directory> + </resource> + </resources> <testResources> <testResource> <directory>${basedir}/src/test/resources</directory> http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/DataMask.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/DataMask.java b/java/core/src/java/org/apache/orc/DataMask.java new file mode 100644 index 0000000..df6ae67 --- /dev/null +++ b/java/core/src/java/org/apache/orc/DataMask.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; + +import java.util.ServiceLoader; + +/** + * The API for masking data during column encryption for ORC. + * + * They apply to an individual column (via ColumnVector) instead of a + * VectorRowBatch. + * + */ +public interface DataMask { + + /** + * The standard DataMasks can be created using this short cut. 
+ * + * For example, DataMask.Standard.NULLIFY.build(schema) will build a + * nullify DataMask. + */ + enum Standard { + NULLIFY("nullify"), + REDACT("redact"); + + Standard(String name) { + this.name = name; + } + + private final String name; + + public String getName() { + return name; + } + + public DataMask build(TypeDescription schema, String... params) { + return Factory.build(name, schema, params); + } + } + + /** + * Mask the given range of values + * @param original the original input data + * @param masked the masked output data + * @param start the first data element to mask + * @param length the number of data elements to mask + */ + void maskData(ColumnVector original, ColumnVector masked, + int start, int length); + + + /** + * Providers can provide one or more kinds of data masks. + * Because they are discovered using a service loader, they may be added + * by third party jars. + */ + interface Provider { + /** + * Build a mask with the given parameters. + * @param name the kind of masking + * @param schema the type of the field + * @param params the mask-specific parameters (the name is not included) + * @return the new data mask or null if this name is unknown + */ + DataMask build(String name, TypeDescription schema, String... params); + } + + /** + * To create a DataMask, the users should come through this API. + * + * It supports extension via additional DataMask.Provider implementations + * that are accessed through Java's ServiceLoader API. + */ + class Factory { + private static final ServiceLoader<Provider> LOADER = + ServiceLoader.load(Provider.class); + + /** + * Build a new DataMask instance. 
+ * @param name the name of the mask + * @param schema the type of the field + * @param params a list of parameters to the mask + * @return a new DataMask + * @throws IllegalArgumentException if no such kind of data mask was found + * + * @see org.apache.orc.impl.mask.MaskProvider for the standard provider + */ + public static DataMask build(String name, + TypeDescription schema, + String... params) { + for(Provider provider: LOADER) { + DataMask result = provider.build(name, schema, params); + if (result != null) { + return result; + } + } + StringBuilder msg = new StringBuilder(); + msg.append("Can't find data mask - "); + msg.append(name); + for(int i=0; i < params.length; ++i) { + msg.append(", "); + msg.append(params[i]); + } + throw new IllegalArgumentException(msg.toString()); + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/DecimalIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/DecimalIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/DecimalIdentity.java new file mode 100644 index 0000000..31f1ffd --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/DecimalIdentity.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.orc.DataMask; + +/** + * An identity data mask for decimal types. + */ +public class DecimalIdentity implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + DecimalColumnVector target = (DecimalColumnVector) masked; + DecimalColumnVector source = (DecimalColumnVector) original; + target.scale = source.scale; + target.precision = source.precision; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (source.isRepeating) { + target.vector[0].set(source.vector[0]); + target.isNull[0] = source.isNull[0]; + } else if (source.noNulls) { + for(int r = start; r < start + length; ++r) { + target.vector[r].set(source.vector[r]); + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if (!target.isNull[r]) { + target.vector[r].set(source.vector[r]); + } + } + } + } +} + http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/DoubleIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/DoubleIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/DoubleIdentity.java new file mode 100644 index 0000000..2660a6c --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/DoubleIdentity.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.orc.DataMask; + +/** + * An identity data mask for floating point types. + */ +public class DoubleIdentity implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + DoubleColumnVector target = (DoubleColumnVector) masked; + DoubleColumnVector source = (DoubleColumnVector) original; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (source.isRepeating) { + target.vector[0] = source.vector[0]; + target.isNull[0] = source.isNull[0]; + } else if (source.noNulls) { + for(int r = start; r < start + length; ++r) { + target.vector[r] = source.vector[r]; + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + target.vector[r] = source.vector[r]; + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/ListIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/ListIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/ListIdentity.java new file mode 100644 index 0000000..7592802 --- /dev/null +++ 
b/java/core/src/java/org/apache/orc/impl/mask/ListIdentity.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector; +import org.apache.orc.DataMask; + +/** + * A data mask for list types that applies the given masks to its + * children, but doesn't mask at this level. 
+ */ +public class ListIdentity implements DataMask { + private final DataMask child; + + ListIdentity(DataMask[] child) { + this.child = child[0]; + } + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, int length) { + ListColumnVector source = (ListColumnVector) original; + ListColumnVector target = (ListColumnVector) masked; + target.noNulls = source.noNulls; + target.isRepeating = source.isRepeating; + if (source.isRepeating) { + if (!source.noNulls && source.isNull[0]) { + target.isNull[0] = true; + } else { + target.lengths[0] = source.lengths[0]; + child.maskData(source.child, target.child, (int) source.offsets[0], + (int) source.lengths[0]); + } + } else if (source.noNulls) { + for(int r=start; r < start+length; ++r) { + target.offsets[r] = source.offsets[r]; + target.lengths[r] = source.lengths[r]; + child.maskData(source.child, target.child, (int) target.offsets[r], + (int) target.lengths[r]); + } + } else { + for(int r=start; r < start+length; ++r) { + target.isNull[r] = source.isNull[r]; + if (!source.isNull[r]) { + target.offsets[r] = source.offsets[r]; + target.lengths[r] = source.lengths[r]; + child.maskData(source.child, target.child, (int) target.offsets[r], + (int) target.lengths[r]); + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/LongIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/LongIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/LongIdentity.java new file mode 100644 index 0000000..69a0718 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/LongIdentity.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.orc.DataMask; + +/** + * An identity data mask for integer types. + */ +public class LongIdentity implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + LongColumnVector target = (LongColumnVector) masked; + LongColumnVector source = (LongColumnVector) original; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (original.isRepeating) { + target.vector[0] = source.vector[0]; + target.isNull[0] = source.isNull[0]; + } else if (source.noNulls) { + for(int r = start; r < start + length; ++r) { + target.vector[r] = source.vector[r]; + } + } else { + for(int r = start; r < start + length; ++r) { + target.vector[r] = source.vector[r]; + target.isNull[r] = source.isNull[r]; + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/MapIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/MapIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/MapIdentity.java new file mode 100644 index 0000000..43a1aea --- /dev/null +++ 
b/java/core/src/java/org/apache/orc/impl/mask/MapIdentity.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector; +import org.apache.orc.DataMask; + +/** + * A data mask for map types that applies the given masks to its + * children, but doesn't mask at this level. 
+ */ +public class MapIdentity implements DataMask { + private final DataMask keyMask; + private final DataMask valueMask; + + MapIdentity(DataMask[] children) { + this.keyMask = children[0]; + this.valueMask = children[1]; + } + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + MapColumnVector source = (MapColumnVector) original; + MapColumnVector target = (MapColumnVector) masked; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (source.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (source.noNulls || !source.isNull[0]) { + target.lengths[0] = source.lengths[0]; + keyMask.maskData(source.keys, target.keys, (int) source.offsets[0], + (int) source.lengths[0]); + valueMask.maskData(source.values, target.values, (int) source.offsets[0], + (int) source.lengths[0]); } + } else if (source.noNulls) { + for(int r=start; r < start+length; ++r) { + target.offsets[r] = source.offsets[r]; + target.lengths[r] = source.lengths[r]; + keyMask.maskData(source.keys, target.keys, (int) target.offsets[r], + (int) target.lengths[r]); + valueMask.maskData(source.values, target.values, (int) target.offsets[r], + (int) target.lengths[r]); + } + } else { + for(int r=start; r < start+length; ++r) { + target.isNull[r] = source.isNull[r]; + if (!source.isNull[r]) { + target.offsets[r] = source.offsets[r]; + target.lengths[r] = source.lengths[r]; + keyMask.maskData(source.keys, target.keys, (int) target.offsets[r], + (int) target.lengths[r]); + valueMask.maskData(source.values, target.values, (int) target.offsets[r], + (int) target.lengths[r]); + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/MaskFactory.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/MaskFactory.java b/java/core/src/java/org/apache/orc/impl/mask/MaskFactory.java new 
file mode 100644 index 0000000..9a77c4a --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/MaskFactory.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.orc.DataMask; +import org.apache.orc.TypeDescription; + +import java.util.List; + +/** + * A mask factory framework that automatically builds a recursive mask. + * The subclass defines how to mask the primitive types and the factory + * builds a recursive tree of data masks that matches the schema tree. 
+ */ +public abstract class MaskFactory { + + protected abstract DataMask buildBooleanMask(TypeDescription schema); + protected abstract DataMask buildLongMask(TypeDescription schema); + protected abstract DataMask buildDecimalMask(TypeDescription schema); + protected abstract DataMask buildDoubleMask(TypeDescription schema); + protected abstract DataMask buildStringMask(TypeDescription schema); + protected abstract DataMask buildDateMask(TypeDescription schema); + protected abstract DataMask buildTimestampMask(TypeDescription schema); + protected abstract DataMask buildBinaryMask(TypeDescription schema); + + public DataMask build(TypeDescription schema) { + switch(schema.getCategory()) { + case BOOLEAN: + return buildBooleanMask(schema); + case BYTE: + case SHORT: + case INT: + case LONG: + return buildLongMask(schema); + case FLOAT: + case DOUBLE: + return buildDoubleMask(schema); + case DECIMAL: + return buildDecimalMask(schema); + case STRING: + case CHAR: + case VARCHAR: + return buildStringMask(schema); + case TIMESTAMP: + return buildTimestampMask(schema); + case DATE: + return buildDateMask(schema); + case BINARY: + return buildBinaryMask(schema); + case UNION: + return buildUnionMask(schema); + case STRUCT: + return buildStructMask(schema); + case LIST: + return buildListMask(schema); + case MAP: + return buildMapMask(schema); + default: + throw new IllegalArgumentException("Unhandled type " + schema); + } + } + + protected DataMask[] buildChildren(List<TypeDescription> children) { + DataMask[] result = new DataMask[children.size()]; + for(int i = 0; i < result.length; ++i) { + result[i] = build(children.get(i)); + } + return result; + } + + protected DataMask buildStructMask(TypeDescription schema) { + return new StructIdentity(buildChildren(schema.getChildren())); + } + + DataMask buildListMask(TypeDescription schema) { + return new ListIdentity(buildChildren(schema.getChildren())); + } + + DataMask buildMapMask(TypeDescription schema) { + return new 
MapIdentity(buildChildren(schema.getChildren())); + } + + DataMask buildUnionMask(TypeDescription schema) { + return new UnionIdentity(buildChildren(schema.getChildren())); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/MaskProvider.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/MaskProvider.java b/java/core/src/java/org/apache/orc/impl/mask/MaskProvider.java new file mode 100644 index 0000000..58f7b38 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/MaskProvider.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.orc.DataMask; +import org.apache.orc.TypeDescription; + +/** + * The Provider for all of the built-in data masks. + */ +public class MaskProvider implements DataMask.Provider { + + @Override + public DataMask build(String name, + TypeDescription schema, + String... 
params) { + if (name.equals(DataMask.Standard.NULLIFY.getName())) { + return new NullifyMask(); + } else if (name.equals(DataMask.Standard.REDACT.getName())) { + return new RedactMaskFactory(params).build(schema); + } + return null; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/NullifyMask.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/NullifyMask.java b/java/core/src/java/org/apache/orc/impl/mask/NullifyMask.java new file mode 100644 index 0000000..1ff8bc1 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/NullifyMask.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.orc.DataMask; + +/** + * Masking routine that converts every value to NULL. 
+ */ +public class NullifyMask implements DataMask { + + @Override + public void maskData(ColumnVector original, ColumnVector masked, + int start, int length) { + masked.noNulls = false; + masked.isRepeating = true; + masked.isNull[0] = true; + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java new file mode 100644 index 0000000..37a5ba4 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/RedactMaskFactory.java @@ -0,0 +1,857 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; +import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; +import org.apache.hadoop.io.Text; +import org.apache.orc.TypeDescription; +import org.apache.orc.DataMask; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Calendar; +import java.util.TimeZone; +import java.util.concurrent.TimeUnit; +import java.util.regex.Pattern; + +/** + * Masking strategy that hides most string and numeric values based on unicode + * character categories. + * + * Masking Parameters: + * character replacements: string of 10 characters one per group below + * letter, upper case (default X) + * letter, lower case (default x) + * number, digit (default 9) + * symbol (default $) + * punctuation (default .) + * separator (default no masking) + * letter, other (default ª) + * mark (default ः, U+0903) + * number, other (default ²) + * other (default ۝, U+06DD) + * + * time replacements: string of 6 numbers or _ one per field below + * year (0 to 4000, default no masking) + * month (1 to 12, default 1) + * date (1 to 31, default 1) + * hour (0 to 23, default 0) + * minute (0 to 59, default 0) + * second (0 to 59, default 0) + * + * Parameters use "_" for preserve original. + */ +public class RedactMaskFactory extends MaskFactory { + + /** + * The value to indicate that the value should be preserved. + */ + private static final int UNMASKED_CHAR = "_".codePointAt(0); + private static final int UNMASKED_DATE = -1; + + // The default replacements for each character category. 
+ // I picked a character in the same category so that the masking is + // idempotent. For non-ascii characters, I mostly picked the first example. + private static final int DEFAULT_LETTER_UPPER = "X".codePointAt(0); + private static final int DEFAULT_LETTER_LOWER = "x".codePointAt(0); + private static final int DEFAULT_NUMBER_DIGIT = 9; + private static final int DEFAULT_NUMBER_DIGIT_CP = + Integer.toString(DEFAULT_NUMBER_DIGIT).codePointAt(0); + private static final int DEFAULT_SYMBOL = "$".codePointAt(0); + private static final int DEFAULT_PUNCTUATION = ".".codePointAt(0); + private static final int DEFAULT_SEPARATOR = UNMASKED_CHAR; + private static final int DEFAULT_LETTER_OTHER = "\u00AA".codePointAt(0); + private static final int DEFAULT_MARK = "\u0903".codePointAt(0); + private static final int DEFAULT_NUMBER_OTHER = "\u00B2".codePointAt(0); + private static final int DEFAULT_OTHER = "\u06DD".codePointAt(0); + + // The replacement codepoint for each character category. We use codepoints + // here so that we don't have to worry about handling long UTF characters + // as special cases. + private final int UPPPER_REPLACEMENT; + private final int LOWER_REPLACEMENT; + private final int OTHER_LETTER_REPLACEMENT; + private final int MARK_REPLACEMENT; + private final int DIGIT_CP_REPLACEMENT; + private final int OTHER_NUMBER_REPLACEMENT; + private final int SYMBOL_REPLACEMENT; + private final int PUNCTUATION_REPLACEMENT; + private final int SEPARATOR_REPLACEMENT; + private final int OTHER_REPLACEMENT; + + // numeric replacement + private final int DIGIT_REPLACEMENT; + + // time replacement + private final int YEAR_REPLACEMENT; + private final int MONTH_REPLACEMENT; + private final int DATE_REPLACEMENT; + private final int HOUR_REPLACEMENT; + private final int MINUTE_REPLACEMENT; + private final int SECOND_REPLACEMENT; + private final boolean maskDate; + private final boolean maskTimestamp; + + public RedactMaskFactory(String... 
params) { + ByteBuffer param = params.length < 1 ? ByteBuffer.allocate(0) : + ByteBuffer.wrap(params[0].getBytes(StandardCharsets.UTF_8)); + UPPPER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_UPPER); + LOWER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_LOWER); + DIGIT_CP_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_DIGIT_CP); + DIGIT_REPLACEMENT = getReplacementDigit(DIGIT_CP_REPLACEMENT); + SYMBOL_REPLACEMENT = getNextCodepoint(param, DEFAULT_SYMBOL); + PUNCTUATION_REPLACEMENT = getNextCodepoint(param, DEFAULT_PUNCTUATION); + SEPARATOR_REPLACEMENT = getNextCodepoint(param, DEFAULT_SEPARATOR); + OTHER_LETTER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_OTHER); + MARK_REPLACEMENT = getNextCodepoint(param, DEFAULT_MARK); + OTHER_NUMBER_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_OTHER); + OTHER_REPLACEMENT = getNextCodepoint(param, DEFAULT_OTHER); + String[] timeParams; + if (params.length < 2) { + timeParams = null; + } else { + timeParams = params[1].split("\\W+"); + } + YEAR_REPLACEMENT = getDateParam(timeParams, 0, UNMASKED_DATE, 4000); + MONTH_REPLACEMENT = getDateParam(timeParams, 1, 1, 12); + DATE_REPLACEMENT = getDateParam(timeParams, 2, 1, 31); + HOUR_REPLACEMENT = getDateParam(timeParams, 3, 0, 23); + MINUTE_REPLACEMENT = getDateParam(timeParams, 4, 0, 59); + SECOND_REPLACEMENT = getDateParam(timeParams, 5, 0, 59); + maskDate = (YEAR_REPLACEMENT != UNMASKED_DATE) || + (MONTH_REPLACEMENT != UNMASKED_DATE) || + (DATE_REPLACEMENT != UNMASKED_DATE); + maskTimestamp = maskDate || (HOUR_REPLACEMENT != UNMASKED_DATE) || + (MINUTE_REPLACEMENT != UNMASKED_DATE) || + (SECOND_REPLACEMENT != UNMASKED_DATE); + } + + @Override + protected DataMask buildBooleanMask(TypeDescription schema) { + if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { + return new LongIdentity(); + } else { + return new BooleanRedactConverter(); + } + } + + @Override + protected DataMask buildLongMask(TypeDescription schema) { + if (DIGIT_CP_REPLACEMENT == 
UNMASKED_CHAR) { + return new LongIdentity(); + } else { + return new LongRedactConverter(schema.getCategory()); + } + } + + @Override + protected DataMask buildDecimalMask(TypeDescription schema) { + if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { + return new DecimalIdentity(); + } else { + return new DecimalRedactConverter(); + } + } + + @Override + protected DataMask buildDoubleMask(TypeDescription schema) { + if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) { + return new DoubleIdentity(); + } else { + return new DoubleRedactConverter(); + } + } + + @Override + protected DataMask buildStringMask(TypeDescription schema) { + return new StringConverter(); + } + + @Override + protected DataMask buildDateMask(TypeDescription schema) { + if (maskDate) { + return new DateRedactConverter(); + } else { + return new LongIdentity(); + } + } + + @Override + protected DataMask buildTimestampMask(TypeDescription schema) { + if (maskTimestamp) { + return new TimestampRedactConverter(); + } else { + return new TimestampIdentity(); + } + } + + @Override + protected DataMask buildBinaryMask(TypeDescription schema) { + return new NullifyMask(); + } + + class LongRedactConverter implements DataMask { + final long mask; + + LongRedactConverter(TypeDescription.Category category) { + switch (category) { + case BYTE: + mask = 0xff; + break; + case SHORT: + mask = 0xffff; + break; + case INT: + mask = 0xffff_ffff; + break; + default: + case LONG: + mask = -1; + break; + } + } + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + LongColumnVector target = (LongColumnVector) masked; + LongColumnVector source = (LongColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.vector[0] = maskLong(source.vector[0]) & mask; + target.isNull[0] = source.isNull[0]; + } else { + for(int r = start; r < start + length; ++r) { + target.vector[r] = 
maskLong(source.vector[r]) & mask; + target.isNull[r] = source.isNull[r]; + } + } + } + } + + class BooleanRedactConverter implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + LongColumnVector target = (LongColumnVector) masked; + LongColumnVector source = (LongColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.vector[0] = DIGIT_REPLACEMENT == 0 ? 0 : 1; + target.isNull[0] = source.isNull[0]; + } else { + for(int r = start; r < start + length; ++r) { + target.vector[r] = DIGIT_REPLACEMENT == 0 ? 0 : 1; + target.isNull[r] = source.isNull[r]; + } + } + } + } + + class DoubleRedactConverter implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + DoubleColumnVector target = (DoubleColumnVector) masked; + DoubleColumnVector source = (DoubleColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.vector[0] = maskDouble(source.vector[0]); + target.isNull[0] = source.isNull[0]; + } else { + for(int r = start; r < start + length; ++r) { + target.vector[r] = maskDouble(source.vector[r]); + target.isNull[r] = source.isNull[r]; + } + } + } + } + + class StringConverter implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + BytesColumnVector target = (BytesColumnVector) masked; + BytesColumnVector source = (BytesColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (target.noNulls || !target.isNull[0]) { + maskString(source, 0, target); + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if 
(target.noNulls || !target.isNull[r]) { + maskString(source, r, target); + } + } + } + } + } + + class DecimalRedactConverter implements DataMask { + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + DecimalColumnVector target = (DecimalColumnVector) masked; + DecimalColumnVector source = (DecimalColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + target.scale = source.scale; + target.precision = source.precision; + if (original.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (target.noNulls || !target.isNull[0]) { + target.vector[0].set(maskDecimal(source.vector[0])); + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if (target.noNulls || !target.isNull[r]) { + target.vector[r].set(source.vector[r]); + } + } + } + } + } + + class TimestampRedactConverter implements DataMask { + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + TimestampColumnVector target = (TimestampColumnVector) masked; + TimestampColumnVector source = (TimestampColumnVector) original; + target.noNulls = original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (target.noNulls || !target.isNull[0]) { + target.time[0] = maskTime(source.time[0]); + target.nanos[0] = 0; + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if (target.noNulls || !target.isNull[r]) { + target.time[r] = maskTime(source.time[r]); + target.nanos[r] = 0; + } + } + } + } + } + + class DateRedactConverter implements DataMask { + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + LongColumnVector target = (LongColumnVector) masked; + LongColumnVector source = (LongColumnVector) original; + target.noNulls = 
original.noNulls; + target.isRepeating = original.isRepeating; + if (original.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (target.noNulls || !target.isNull[0]) { + target.vector[0] = maskDate((int) source.vector[0]); + } + } else { + for(int r = start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if (target.noNulls || !target.isNull[r]) { + target.vector[r] = maskDate((int) source.vector[r]); + } + } + } + } + } + + /** + * Get the next code point from the ByteBuffer. Moves the position in the + * ByteBuffer forward to the next code point. + * @param param the source of bytes + * @param defaultValue if there are no bytes left, use this value + * @return the code point that was found at the front of the buffer. + */ + static int getNextCodepoint(ByteBuffer param, int defaultValue) { + if (param.remaining() == 0) { + return defaultValue; + } else { + return Text.bytesToCodePoint(param); + } + } + + /** + * Get the replacement digit. This routine supports non-ASCII values for the + * replacement. For example, if the user gives one of "7", "à¥", "ã§" or "á¯" + * the value is 7. 
+ * @param digitCodePoint the code point that is replacing digits + * @return the number from 0 to 9 to use as the numeric replacement + */ + static int getReplacementDigit(int digitCodePoint) { + int dig = Character.getNumericValue(digitCodePoint); + if (dig >= 0 && dig <= 9) { + return dig; + } else { + return DEFAULT_NUMBER_DIGIT; + } + } + + static int getDateParam(String[] dateParams, int posn, + int myDefault, int max) { + if (dateParams != null && posn < dateParams.length) { + if (dateParams[posn].codePointAt(0) == UNMASKED_CHAR) { + return UNMASKED_DATE; + } else { + int result = Integer.parseInt(dateParams[posn]); + if (result >= -1 && result <= max) { + return result; + } else { + throw new IllegalArgumentException("Invalid date parameter " + posn + + " of " + dateParams[posn] + " greater than " + max); + } + } + } else { + return myDefault; + } + } + + /** + * Replace each digit in value with DIGIT_REPLACEMENT scaled to the matching + * number of digits. + * @param value the number to mask + * @return the masked value + */ + public long maskLong(long value) { + long base; + if (DIGIT_REPLACEMENT == 0) { + return 0; + } else if (value >= 0) { + base = 1; + } else { + base = -1; + // make sure Long.MIN_VALUE doesn't overflow + if (value == Long.MIN_VALUE) { + value = Long.MAX_VALUE; + } else { + value = -value; + } + } + if (value < 100_000_000L) { + if (value < 10_000L) { + if (value < 100L) { + if (value < 10L) { + base *= 1; + } else { + base *= 11; + } + } else if (value < 1_000L) { + base *= 111; + } else { + base *= 1_111; + } + } else if (value < 1_000_000L) { + if (value < 100_000L) { + base *= 11_111; + } else { + base *= 111_111; + } + } else if (value < 10_000_000L) { + base *= 1_111_111; + } else { + base *= 11_111_111; + } + } else if (value < 10_000_000_000_000_000L) { + if (value < 1_000_000_000_000L) { + if (value < 10_000_000_000L) { + if (value < 1_000_000_000L) { + base *= 111_111_111; + } else { + base *= 1_111_111_111; + } + } else if 
(value < 100_000_000_000L) { + base *= 11_111_111_111L; + } else { + base *= 111_111_111_111L; + } + } else if (value < 100_000_000_000_000L) { + if (value < 10_000_000_000_000L) { + base *= 1_111_111_111_111L; + } else { + base *= 11_111_111_111_111L; + } + } else if (value < 1_000_000_000_000_000L) { + base *= 111_111_111_111_111L; + } else { + base *= 1_111_111_111_111_111L; + } + } else if (value < 100_000_000_000_000_000L) { + base *= 11_111_111_111_111_111L; + // If the digit is 9, it would overflow at 19 digits, so use 18. + } else if (value < 1_000_000_000_000_000_000L || DIGIT_REPLACEMENT == 9) { + base *= 111_111_111_111_111_111L; + } else { + base *= 1_111_111_111_111_111_111L; + } + return DIGIT_REPLACEMENT * base; + } + + private static final double[] DOUBLE_POWER_10 = new double[]{ + 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300, + 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291, + 1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282, + 1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273, + 1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264, + 1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255, + 1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246, + 1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237, + 1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228, + 1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219, + 1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210, + 1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201, + 1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192, + 1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183, + 1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174, + 1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165, + 
1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156, + 1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147, + 1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138, + 1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129, + 1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120, + 1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111, + 1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102, + 1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93, + 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84, + 1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, 1e-75, + 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66, + 1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57, + 1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48, + 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39, + 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30, + 1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21, + 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, + 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, + 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, + 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, + 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, + 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33, + 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, + 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, + 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, + 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, + 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, + 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, + 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, + 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, + 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114, + 1e115, 1e116, 1e117, 1e118, 1e119, 
1e120, 1e121, 1e122, 1e123, + 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132, + 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141, + 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, + 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, + 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, + 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, + 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, + 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, + 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204, + 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213, + 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222, + 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231, + 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, + 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, + 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, + 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, + 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, + 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, + 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, + 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, + 1e304, 1e305, 1e306, 1e307}; + + /** + * Replace each digit in value with digit. + * @param value the number to mask + * @return the + */ + public double maskDouble(double value) { + double base; + // It seems better to mask 0 to 9.99999 rather than 9.99999e-308. 
+ if (value == 0 || DIGIT_REPLACEMENT == 0) { + return DIGIT_REPLACEMENT * 1.11111; + } else if (value > 0) { + base = 1.11111; + } else { + base = -1.11111; + value = -value; + } + int posn = Arrays.binarySearch(DOUBLE_POWER_10, value); + if (posn < -DOUBLE_POWER_10.length - 2) { + posn = DOUBLE_POWER_10.length - 1; + } else if (posn == -1) { + posn = 0; + } else if (posn < 0) { + posn = -posn -2; + } + return DIGIT_REPLACEMENT * base * DOUBLE_POWER_10[posn]; + } + + private final Calendar scratch = Calendar.getInstance(); + + /** + * Given the requested masking parameters, redact the given time + * @param millis the original time + * @return the millis after it has been masked + */ + long maskTime(long millis) { + scratch.setTimeInMillis(millis); + if (YEAR_REPLACEMENT != UNMASKED_DATE) { + scratch.set(Calendar.YEAR, YEAR_REPLACEMENT); + } + if (MONTH_REPLACEMENT != UNMASKED_DATE) { + scratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1); + } + if (DATE_REPLACEMENT != UNMASKED_DATE) { + scratch.set(Calendar.DATE, DATE_REPLACEMENT); + } + if (HOUR_REPLACEMENT != UNMASKED_DATE) { + if (HOUR_REPLACEMENT >= 12) { + scratch.set(Calendar.HOUR, HOUR_REPLACEMENT - 12); + scratch.set(Calendar.AM_PM, Calendar.PM); + } else { + scratch.set(Calendar.HOUR, HOUR_REPLACEMENT); + scratch.set(Calendar.AM_PM, Calendar.AM); + } + } + if (MINUTE_REPLACEMENT != UNMASKED_DATE) { + scratch.set(Calendar.MINUTE, MINUTE_REPLACEMENT); + } + if (SECOND_REPLACEMENT != UNMASKED_DATE) { + scratch.set(Calendar.SECOND, SECOND_REPLACEMENT); + scratch.set(Calendar.MILLISECOND, 0); + } + return scratch.getTimeInMillis(); + } + + private static final long MILLIS_PER_DAY = TimeUnit.DAYS.toMillis(1); + + private final Calendar utcScratch = + Calendar.getInstance(TimeZone.getTimeZone("UTC")); + + /** + * Given a date as the number of days since epoch (1 Jan 1970), + * mask the date given the parameters. 
+ * @param daysSinceEpoch the number of days after epoch + * @return the number of days after epoch when masked + */ + int maskDate(int daysSinceEpoch) { + utcScratch.setTimeInMillis(daysSinceEpoch * MILLIS_PER_DAY); + if (YEAR_REPLACEMENT != UNMASKED_DATE) { + utcScratch.set(Calendar.YEAR, YEAR_REPLACEMENT); + } + if (MONTH_REPLACEMENT != UNMASKED_DATE) { + utcScratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1); + } + if (DATE_REPLACEMENT != UNMASKED_DATE) { + utcScratch.set(Calendar.DATE, DATE_REPLACEMENT); + } + return (int) (utcScratch.getTimeInMillis() / MILLIS_PER_DAY); + } + + private static final Pattern DIGIT_PATTERN = Pattern.compile("[0-9]"); + + /** + * Mask a decimal. + * This is painfully slow because it converts to a string and then back to + * a decimal. Until HiveDecimalWritable gives us more access, this is + * the best tradeoff between developer time, functionality, and run time. + * @param source the value to mask + * @return the masked value. + */ + HiveDecimalWritable maskDecimal(HiveDecimalWritable source) { + String str = DIGIT_PATTERN.matcher(source.toString()). 
+ replaceAll(Integer.toString(DIGIT_REPLACEMENT)); + return new HiveDecimalWritable(str); + } + + /** + * Given a UTF code point, find the replacement codepoint + * @param codepoint a UTF character + * @return the replacement codepoint + */ + int getReplacement(int codepoint) { + switch (Character.getType(codepoint)) { + case Character.UPPERCASE_LETTER: + return UPPPER_REPLACEMENT; + case Character.LOWERCASE_LETTER: + return LOWER_REPLACEMENT; + case Character.TITLECASE_LETTER: + case Character.MODIFIER_LETTER: + case Character.OTHER_LETTER: + return OTHER_LETTER_REPLACEMENT; + case Character.NON_SPACING_MARK: + case Character.ENCLOSING_MARK: + case Character.COMBINING_SPACING_MARK: + return MARK_REPLACEMENT; + case Character.DECIMAL_DIGIT_NUMBER: + return DIGIT_CP_REPLACEMENT; + case Character.LETTER_NUMBER: + case Character.OTHER_NUMBER: + return OTHER_NUMBER_REPLACEMENT; + case Character.SPACE_SEPARATOR: + case Character.LINE_SEPARATOR: + case Character.PARAGRAPH_SEPARATOR: + return SEPARATOR_REPLACEMENT; + case Character.MATH_SYMBOL: + case Character.CURRENCY_SYMBOL: + case Character.MODIFIER_SYMBOL: + case Character.OTHER_SYMBOL: + return SYMBOL_REPLACEMENT; + case Character.DASH_PUNCTUATION: + case Character.START_PUNCTUATION: + case Character.END_PUNCTUATION: + case Character.CONNECTOR_PUNCTUATION: + case Character.OTHER_PUNCTUATION: + return PUNCTUATION_REPLACEMENT; + default: + return OTHER_REPLACEMENT; + } + } + + /** + * Get the number of bytes for each codepoint + * @param codepoint the codepoint to check + * @return the number of bytes + */ + static int getCodepointLength(int codepoint) { + if (codepoint < 0) { + throw new IllegalArgumentException("Illegal codepoint " + codepoint); + } else if (codepoint < 0x80) { + return 1; + } else if (codepoint < 0x7ff) { + return 2; + } else if (codepoint < 0xffff) { + return 3; + } else if (codepoint < 0x10FFFF) { + return 4; + } else { + throw new IllegalArgumentException("Illegal codepoint " + codepoint); + } + 
} + + /** + * Write the give codepoint to the buffer. + * @param codepoint the codepoint to write + * @param buffer the buffer to write into + * @param offset the first offset to use + * @param length the number of bytes that will be used + */ + static void writeCodepoint(int codepoint, byte[] buffer, int offset, + int length) { + switch (length) { + case 1: + buffer[offset] = (byte) codepoint; + break; + case 2: + buffer[offset] = (byte)(0xC0 | codepoint >> 6); + buffer[offset+1] = (byte)(0x80 | (codepoint & 0x3f)); + break; + case 3: + buffer[offset] = (byte)(0xE0 | codepoint >> 12); + buffer[offset+1] = (byte)(0x80 | ((codepoint >> 6) & 0x3f)); + buffer[offset+2] = (byte)(0x80 | (codepoint & 0x3f)); + break; + case 4: + buffer[offset] = (byte)(0xF0 | codepoint >> 18); + buffer[offset+1] = (byte)(0x80 | ((codepoint >> 12) & 0x3f)); + buffer[offset+2] = (byte)(0x80 | ((codepoint >> 6) & 0x3f)); + buffer[offset+3] = (byte)(0x80 | (codepoint & 0x3f)); + break; + default: + throw new IllegalArgumentException("Invalid length for codepoint " + + codepoint + " = " + length); + } + } + + /** + * Mask a string by finding the character category of each character + * and replacing it with the matching literal. + * @param source the source column vector + * @param row the value index + * @param target the target column vector + */ + void maskString(BytesColumnVector source, int row, BytesColumnVector target) { + int expectedBytes = source.length[row]; + ByteBuffer sourceBytes = ByteBuffer.wrap(source.vector[row], + source.start[row], source.length[row]); + // ensure we have enough space, if the masked data is the same size + target.ensureValPreallocated(expectedBytes); + byte[] outputBuffer = target.getValPreallocatedBytes(); + int outputOffset = target.getValPreallocatedStart(); + int outputStart = outputOffset; + while (sourceBytes.remaining() > 0) { + int cp = Text.bytesToCodePoint(sourceBytes); + + // Find the replacement for the current character. 
+ int replacement = getReplacement(cp); + if (replacement == UNMASKED_CHAR) { + replacement = cp; + } + int len = getCodepointLength(replacement); + + // If the translation will overflow the buffer, we need to resize. + // This will only happen when the masked size is larger than the original. + if (len + outputOffset > outputBuffer.length) { + // Revise estimate how much we are going to need now. We are maximally + // pesamistic here so that we don't have to expand again for this value. + int currentOutputStart = outputStart; + int currentOutputLength = outputOffset - currentOutputStart; + expectedBytes = currentOutputLength + len + sourceBytes.remaining() * 4; + + // Expand the buffer to fit the new estimate + target.ensureValPreallocated(expectedBytes); + + // Copy over the bytes we've already written for this value and move + // the pointers to the new output buffer. + byte[] oldBuffer = outputBuffer; + outputBuffer = target.getValPreallocatedBytes(); + outputOffset = target.getValPreallocatedStart(); + outputStart = outputOffset; + System.arraycopy(oldBuffer, currentOutputStart, outputBuffer, + outputOffset, currentOutputLength); + outputOffset += currentOutputLength; + } + + // finally copy the bytes + writeCodepoint(replacement, outputBuffer, outputOffset, len); + outputOffset += len; + } + target.setValPreallocated(row, outputOffset - outputStart); + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/StructIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/StructIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/StructIdentity.java new file mode 100644 index 0000000..92cfb4e --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/StructIdentity.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; +import org.apache.orc.DataMask; + +/** + * A data mask for struct types that applies the given masks to its + * children, but doesn't mask at this level. 
+ */ +public class StructIdentity implements DataMask { + private final DataMask[] children; + + StructIdentity(DataMask[] children) { + this.children = children; + } + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + StructColumnVector source = (StructColumnVector) original; + StructColumnVector target = (StructColumnVector) masked; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (source.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (source.noNulls || !source.isNull[0]) { + for (int c = 0; c < children.length; ++c) { + children[c].maskData(source.fields[c], target.fields[c], 0, 1); + } + } + } else if (source.noNulls) { + for (int c = 0; c < children.length; ++c) { + children[c].maskData(source.fields[c], target.fields[c], start, length); + } + } else { + // process the children in runs of non-null values + int batchStart = start; + while (batchStart < start + length) { + int r = batchStart; + while (r < start + length && !source.isNull[r]) { + r += 1; + } + if (r != batchStart) { + for(int c=0; c < children.length; ++c) { + children[c].maskData(source.fields[c], target.fields[c], + batchStart, r - batchStart); + } + } + batchStart = r; + while (batchStart < start + length && source.isNull[batchStart]) { + batchStart += 1; + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/TimestampIdentity.java ---------------------------------------------------------------------- diff --git a/java/core/src/java/org/apache/orc/impl/mask/TimestampIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/TimestampIdentity.java new file mode 100644 index 0000000..b0b2684 --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/TimestampIdentity.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.impl.mask;

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.orc.DataMask;

/**
 * A mask for timestamp columns that copies the values without changing them.
 */
class TimestampIdentity implements DataMask {
  @Override
  public void maskData(ColumnVector original, ColumnVector masked, int start,
                       int length) {
    TimestampColumnVector dst = (TimestampColumnVector) masked;
    TimestampColumnVector src = (TimestampColumnVector) original;
    dst.noNulls = src.noNulls;
    dst.isRepeating = src.isRepeating;
    if (original.isRepeating) {
      // a repeating vector only carries its first row
      dst.time[0] = src.time[0];
      dst.nanos[0] = src.nanos[0];
      dst.isNull[0] = src.isNull[0];
      return;
    }
    // the null flags are only copied when the source may contain nulls
    final boolean copyNulls = !src.noNulls;
    final int end = start + length;
    for (int row = start; row < end; ++row) {
      dst.time[row] = src.time[row];
      dst.nanos[row] = src.nanos[row];
      if (copyNulls) {
        dst.isNull[row] = src.isNull[row];
      }
    }
  }
}


http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/java/org/apache/orc/impl/mask/UnionIdentity.java
----------------------------------------------------------------------
diff --git
a/java/core/src/java/org/apache/orc/impl/mask/UnionIdentity.java b/java/core/src/java/org/apache/orc/impl/mask/UnionIdentity.java new file mode 100644 index 0000000..27ae46c --- /dev/null +++ b/java/core/src/java/org/apache/orc/impl/mask/UnionIdentity.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.orc.impl.mask; + +import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector; +import org.apache.orc.DataMask; + +/** + * A data mask for union types that applies the given masks to its + * children, but doesn't mask at this level. 
+ */ +public class UnionIdentity implements DataMask { + private final DataMask[] children; + + UnionIdentity(DataMask[] children) { + this.children = children; + } + + @Override + public void maskData(ColumnVector original, ColumnVector masked, int start, + int length) { + UnionColumnVector source = (UnionColumnVector) original; + UnionColumnVector target = (UnionColumnVector) masked; + target.isRepeating = source.isRepeating; + target.noNulls = source.noNulls; + if (source.isRepeating) { + target.isNull[0] = source.isNull[0]; + if (source.noNulls || !source.isNull[0]) { + int tag = source.tags[0]; + target.tags[0] = tag; + children[tag].maskData(source.fields[tag], target.fields[tag], 0, 1); + } + } else if (source.noNulls) { + for (int r = start; r < start + length; ++r) { + int tag = source.tags[r]; + target.tags[r] = tag; + children[tag].maskData(source.fields[tag], target.fields[tag], r, 1); + } + } else { + for(int r= start; r < start + length; ++r) { + target.isNull[r] = source.isNull[r]; + if (!source.isNull[r]) { + int tag = source.tags[r]; + target.tags[r] = tag; + children[tag].maskData(source.fields[tag], target.fields[tag], r, 1); + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/resources/META-INF/services/org.apache.orc.DataMask$Provider ---------------------------------------------------------------------- diff --git a/java/core/src/resources/META-INF/services/org.apache.orc.DataMask$Provider b/java/core/src/resources/META-INF/services/org.apache.orc.DataMask$Provider new file mode 100644 index 0000000..6daf1b0 --- /dev/null +++ b/java/core/src/resources/META-INF/services/org.apache.orc.DataMask$Provider @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +org.apache.orc.impl.mask.MaskProvider \ No newline at end of file http://git-wip-us.apache.org/repos/asf/orc/blob/d5a6c49b/java/core/src/test/org/apache/orc/impl/mask/TestDataMask.java ---------------------------------------------------------------------- diff --git a/java/core/src/test/org/apache/orc/impl/mask/TestDataMask.java b/java/core/src/test/org/apache/orc/impl/mask/TestDataMask.java new file mode 100644 index 0000000..bbf313c --- /dev/null +++ b/java/core/src/test/org/apache/orc/impl/mask/TestDataMask.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.orc.impl.mask;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
+import org.apache.orc.DataMask;
+import org.apache.orc.TypeDescription;
+import org.junit.Test;
+
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests for the standard NULLIFY and REDACT data masks.
+ */
+public class TestDataMask {
+
+  /** Encode a string as UTF-8 bytes. */
+  private static byte[] utf8(String text) {
+    return text.getBytes(StandardCharsets.UTF_8);
+  }
+
+  /** Create a new row batch for the schema and return its first column. */
+  private static ColumnVector firstColumn(TypeDescription schema) {
+    return schema.createRowBatch().cols[0];
+  }
+
+  /**
+   * The NULLIFY mask is built as a NullifyMask and marks the masked vector
+   * as a repeating null.
+   */
+  @Test
+  public void testNullFactory() throws Exception {
+    TypeDescription rowType = TypeDescription.fromString("struct<x:int>");
+    // build the mask for the type of the first column
+    DataMask nullifier =
+        DataMask.Standard.NULLIFY.build(rowType.findSubtype(1));
+    assertEquals(NullifyMask.class.toString(),
+        nullifier.getClass().toString());
+
+    LongColumnVector input = (LongColumnVector) firstColumn(rowType);
+    LongColumnVector output = (LongColumnVector) firstColumn(rowType);
+    input.vector[0] = 10;
+    input.vector[1] = 20;
+
+    nullifier.maskData(input, output, 0, 2);
+
+    assertEquals(true, output.isRepeating);
+    assertEquals(false, output.noNulls);
+    assertEquals(true, output.isNull[0]);
+  }
+
+  /**
+   * The default REDACT mask for a struct is built as a StructIdentity and
+   * redacts the struct's int and string fields to the expected values.
+   */
+  @Test
+  public void testRedactFactory() throws Exception {
+    TypeDescription rowType =
+        TypeDescription.fromString("struct<s:struct<x:int,y:string>>");
+    DataMask redactor = DataMask.Standard.REDACT.build(rowType.findSubtype(1));
+    assertEquals(StructIdentity.class.toString(),
+        redactor.getClass().toString());
+
+    StructColumnVector input = (StructColumnVector) firstColumn(rowType);
+    StructColumnVector output = (StructColumnVector) firstColumn(rowType);
+
+    // fill in two rows of input
+    LongColumnVector inputX = (LongColumnVector) input.fields[0];
+    BytesColumnVector inputY = (BytesColumnVector) input.fields[1];
+    inputX.vector[0] = 123;
+    inputX.vector[1] = 456789;
+    inputY.setVal(0, utf8("Owen"));
+    inputY.setVal(1, utf8("ORC"));
+
+    redactor.maskData(input, output, 0, 2);
+
+    // read the results back out of the masked struct
+    LongColumnVector outputX = (LongColumnVector) output.fields[0];
+    BytesColumnVector outputY = (BytesColumnVector) output.fields[1];
+    assertEquals(999, outputX.vector[0]);
+    assertEquals(999999, outputX.vector[1]);
+    assertEquals("Xxxx", outputY.toString(0));
+    assertEquals("XXX", outputY.toString(1));
+  }
+
+  /**
+   * Runs a struct holding a decimal, double, list, map, union and string
+   * first through NULLIFY and then through a REDACT mask that is
+   * parameterized to pass values through, and checks that the second pass
+   * reproduces the input (except for the binary variant of the union).
+   */
+  @Test
+  public void testIdentityRedact() throws Exception {
+    TypeDescription rowType =
+        TypeDescription.fromString("struct<s:struct<a:decimal(18,6),b:double," +
+            "c:array<int>,d:map<timestamp,date>,e:uniontype<int,binary>,f:string>>");
+    DataMask nullify = DataMask.Standard.NULLIFY.build(rowType.findSubtype(1));
+    // create a redact mask that passes everything through
+    DataMask identity = DataMask.Standard.REDACT.build(rowType.findSubtype(1),
+        "__________", "_ _ _ _ _ _");
+
+    StructColumnVector input = (StructColumnVector) firstColumn(rowType);
+    StructColumnVector output = (StructColumnVector) firstColumn(rowType);
+
+    // accessors for the input fields
+    DecimalColumnVector inA = (DecimalColumnVector) input.fields[0];
+    DoubleColumnVector inB = (DoubleColumnVector) input.fields[1];
+    ListColumnVector inC = (ListColumnVector) input.fields[2];
+    LongColumnVector inCElements = (LongColumnVector) inC.child;
+    MapColumnVector inD = (MapColumnVector) input.fields[3];
+    TimestampColumnVector inDKeys = (TimestampColumnVector) inD.keys;
+    LongColumnVector inDValues = (LongColumnVector) inD.values;
+    UnionColumnVector inE = (UnionColumnVector) input.fields[4];
+    LongColumnVector inEInt = (LongColumnVector) inE.fields[0];
+    BytesColumnVector inEBinary = (BytesColumnVector) inE.fields[1];
+    BytesColumnVector inF = (BytesColumnVector) input.fields[5];
+
+    // fill in three rows of input
+    for (int row = 0; row < 3; ++row) {
+      int ordinal = row + 1;
+      inA.set(row, new HiveDecimalWritable(ordinal + "." + ordinal));
+      inB.vector[row] = 1.25 * ordinal;
+      // the list is laid out front to back, each row after the previous one
+      if (row == 0) {
+        inC.offsets[row] = 0;
+      } else {
+        inC.offsets[row] = inC.offsets[row - 1] + inC.lengths[row - 1];
+      }
+      inC.lengths[row] = 2 * row;
+      // the map is laid out back to front (offsets 4, 2, 0)
+      inD.offsets[row] = 2 * (2 - row);
+      inD.lengths[row] = 2;
+      // alternate between the int and binary variants of the union
+      inE.tags[row] = row % 2;
+      inEInt.vector[row] = row * 10;
+      inF.setVal(row, utf8(Integer.toHexString(0x123 * row)));
+    }
+    inEBinary.setVal(1, utf8("Foobar"));
+
+    // fill in the six child values of the list and the map
+    for (int child = 0; child < 6; ++child) {
+      inCElements.vector[child] = child;
+      inDKeys.time[child] = 1111 * child;
+      inDKeys.nanos[child] = 0;
+      inDValues.vector[child] = child * 11;
+    }
+
+    // first pass: the nullify mask
+    nullify.maskData(input, output, 0, 3);
+    assertEquals(false, output.noNulls);
+    assertEquals(true, output.isRepeating);
+    assertEquals(true, output.isNull[0]);
+
+    // second pass into the same output vector: the identity mask
+    identity.maskData(input, output, 0 , 3);
+    assertEquals(true, output.noNulls);
+    assertEquals(false, output.isRepeating);
+
+    // accessors for the masked fields
+    DecimalColumnVector outA = (DecimalColumnVector) output.fields[0];
+    DoubleColumnVector outB = (DoubleColumnVector) output.fields[1];
+    ListColumnVector outC = (ListColumnVector) output.fields[2];
+    LongColumnVector outCElements = (LongColumnVector) outC.child;
+    MapColumnVector outD = (MapColumnVector) output.fields[3];
+    TimestampColumnVector outDKeys = (TimestampColumnVector) outD.keys;
+    LongColumnVector outDValues = (LongColumnVector) outD.values;
+    UnionColumnVector outE = (UnionColumnVector) output.fields[4];
+    LongColumnVector outEInt = (LongColumnVector) outE.fields[0];
+    BytesColumnVector outEBinary = (BytesColumnVector) outE.fields[1];
+    BytesColumnVector outF = (BytesColumnVector) output.fields[5];
+
+    // check the per-row outputs
+    for (int row = 0; row < 3; ++row) {
+      String label = "iter " + row;
+      int ordinal = row + 1;
+      long expectedListOffset =
+          row == 0 ? 0 : outC.offsets[row - 1] + outC.lengths[row - 1];
+      long expectedMapOffset =
+          row == 0 ? 4 : outD.offsets[row - 1] - outD.lengths[row];
+
+      assertEquals(label, ordinal + "." + ordinal, outA.vector[row].toString());
+      assertEquals(label, 1.25 * ordinal, outB.vector[row], 0.0001);
+      assertEquals(label, expectedListOffset, outC.offsets[row]);
+      assertEquals(label, 2 * row, outC.lengths[row]);
+      assertEquals(label, expectedMapOffset, outD.offsets[row]);
+      assertEquals(label, 2, outD.lengths[row]);
+      assertEquals(label, row % 2, outE.tags[row]);
+      assertEquals(label, Integer.toHexString(0x123 * row), outF.toString(row));
+    }
+
+    // check the child values of the list and the map
+    for (int child = 0; child < 6; ++child) {
+      String label = "iter " + child;
+      assertEquals(label, child, outCElements.vector[child]);
+      assertEquals(label, child * 1111, outDKeys.time[child]);
+      assertEquals(label, child * 11, outDValues.vector[child]);
+    }
+
+    // rows 0 and 2 hold the int variant of the union
+    assertEquals(0, outEInt.vector[0]);
+    assertEquals(20, outEInt.vector[2]);
+    // the redact mask always replaces binary with null
+    assertEquals(false, outEBinary.noNulls);
+    assertEquals(true, outEBinary.isRepeating);
+    assertEquals(true, outEBinary.isNull[0]);
+  }
+
+}
