Github user takuti commented on a diff in the pull request:
https://github.com/apache/incubator-hivemall/pull/118#discussion_r142296940
--- Diff: core/src/main/java/hivemall/tools/text/NgramsUDF.java ---
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.tools.text;
+
+import hivemall.utils.lang.StringUtils;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.io.Text;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+
+import java.util.ArrayList;
+import java.util.List;
+
+@Description(name = "to_ngrams", value = "_FUNC_(array<string> words, int
minSize, int maxSize])"
+ + " - Returns list of n-grams for given words, where `minSize <= n
<= maxSize`")
+@UDFType(deterministic = true, stateful = false)
+public final class NgramsUDF extends UDF {
+
+ public List<Text> evaluate(final List<Text> words, final int minSize,
final int maxSize)
+ throws HiveException {
+ if (words == null) {
+ return null;
+ }
+ if (minSize <= 0) {
+ throw new UDFArgumentException("`minSize` must be greater than
zero: " + minSize);
+ }
+ if (minSize > maxSize) {
+ throw new UDFArgumentException("`maxSize` must be greater than
or equal to `minSize`: "
+ + maxSize);
+ }
+ return getNgrams(words, minSize, maxSize);
+ }
+
+ @Nonnull
+ private List<Text> getNgrams(@Nonnull final List<Text> words,
@Nonnegative final int minSize,
+ @Nonnegative final int maxSize) {
+ final List<Text> ngrams = new ArrayList<Text>();
+ for (int i = 0, numWords = words.size(); i < numWords; i++) {
+ for (int ngramSize = minSize; ngramSize <= maxSize;
ngramSize++) {
+ if (i + ngramSize > numWords) { // exceeds the final
element
+ continue;
+ }
+
+ final List<String> ngram = new ArrayList<String>();
+ for (int j = i; j < i + ngramSize; j++) {
+ ngram.add(words.get(j).toString());
+ }
+ ngrams.add(new Text(StringUtils.join(ngram, " ")));
--- End diff --
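`ngram` will never be empty: `evaluate` rejects `minSize <= 0`, so
`ngramSize >= 1`, and thanks to `if (i + ngramSize > numWords) continue;`
every index read is within bounds. Notice that the for-loop creating `ngram`
increments `j` from `i` to `i + ngramSize - 1`, so it always adds exactly
`ngramSize` words.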
---