Github user omalley commented on a diff in the pull request:
https://github.com/apache/orc/pull/208#discussion_r162110964
--- Diff:
java/core/src/java/org/apache/orc/impl/mask/SHA256MaskFactory.java ---
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.orc.impl.mask;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.orc.DataMask;
+import org.apache.orc.TypeDescription;
+
+import javax.xml.bind.DatatypeConverter;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+
+/**
+ * Masking strategy that masks String, Varchar, Char and Binary types
+ * as SHA 256 hash.
+ * <p>
+ * <b>For String type:</b><br/>
+ * All string type of any length will be converted to 64 length SHA256
hash.<br/><br/>
+ * <p>
+ * <b>For Varchar type:</b><br/>
+ * For Varchar type, max-length property will be honored i.e.
+ * if the length is less than max-length then the SHA256 hash will be
truncated
+ * to max-length. If max-length is greater than 64 then the output is the
sha256
+ * length, which is 64.<br/><br/>
+ * <p>
+ * <b>For Char type:</b><br/>
+ * For Char type, the length of mask will always be equal to specified
max-length.
+ * If the given length (max-length) is less than SHA256 hash length (64)
+ * the mask will be truncated.
+ * If the given length (max-length) is greater than SHA256 hash length (64)
+ * then the mask will be padded by blank spaces.<br/><br/>
+ * <p>
+ * <b>For Binary type:</b><br/>
+ * All Binary type of any length will be converted to 64 length SHA256
hash.<br/>
+ */
+public class SHA256MaskFactory extends MaskFactory {
+
+ final MessageDigest md;
+
+ public SHA256MaskFactory(final String... params) {
+ super();
+ try {
+ md = MessageDigest.getInstance("SHA-256");
+ } catch (NoSuchAlgorithmException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Mask a string by finding the character category of each character
+ * and replacing it with the matching literal.
+ *
+ * @param source the source column vector
+ * @param row the value index
+ * @param target the target column vector
+ * @param schema schema
+ */
+ void maskString(final BytesColumnVector source, final int row,
+ final BytesColumnVector target, final TypeDescription schema) {
+ final ByteBuffer sourceBytes = ByteBuffer
+ .wrap(source.vector[row], source.start[row], source.length[row]);
+
+ // take SHA-256 Hash and convert to HEX
+ byte[] hash = DatatypeConverter
+ .printHexBinary(md.digest(sourceBytes.array()))
+ .getBytes(StandardCharsets.UTF_8);
+ int targetLength = hash.length;
+
+ /* For type varchar */
+ if (schema.getCategory() == TypeDescription.Category.VARCHAR) {
+
+ /* truncate the hash if max length for varchar is less than hash
length
+ * on the other hand if if the max length is more than hash length
(64 bytes)
+ * we use the hash length (64 bytes) always.
+ */
+ if (schema.getMaxLength() < hash.length) {
+ targetLength = schema.getMaxLength();
+ }
+
+ }
+
+ /* For type char */
+ if (schema.getCategory() == TypeDescription.Category.CHAR) {
+ /* for char the length is always constant */
+ targetLength = schema.getMaxLength();
+ }
+
+ // ensure we have enough space, if the masked data is the same size
+ target.ensureValPreallocated(targetLength);
+ byte[] outputBuffer = target.getValPreallocatedBytes();
+ int outputOffset = target.getValPreallocatedStart();
+
+ if (targetLength > hash.length) {
+
+ System.arraycopy(hash, 0, outputBuffer, 0, hash.length);
--- End diff --
You don't need to copy the output bytes; just assign the hash array directly
to the target by setting:
* target.vector[row] = hash
* target.start[row] = 0
* target.length[row] = targetLength
---