Repository: incubator-hivemall Updated Branches: refs/heads/master 8639810d3 -> bedbd39ca
Close #110: [HIVEMALL-142] Implement SingularizeUDF Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5e1d0d07 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5e1d0d07 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5e1d0d07 Branch: refs/heads/master Commit: 5e1d0d0703d5d6d9e217c6e8a8345138b78e6843 Parents: 8639810 Author: Takuya Kitazawa <[email protected]> Authored: Wed Sep 13 21:35:56 2017 +0900 Committer: Makoto Yui <[email protected]> Committed: Wed Sep 13 21:35:56 2017 +0900 ---------------------------------------------------------------------- .../hivemall/tools/text/SingularizeUDF.java | 173 +++++++++++++++++++ .../java/hivemall/utils/lang/StringUtils.java | 38 +++- .../hivemall/tools/text/SingularizeUDFTest.java | 71 ++++++++ docs/gitbook/misc/generic_funcs.md | 8 + resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql | 1 + 8 files changed, 292 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/core/src/main/java/hivemall/tools/text/SingularizeUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java new file mode 100644 index 0000000..390e3f2 --- /dev/null +++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.text; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.udf.UDFType; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import javax.annotation.Nullable; + +import hivemall.utils.lang.StringUtils; + +/** + * @link + * https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src + * /main/java/io/sundr/codegen/functions/Singularize.java + * @link https://github.com/clips/pattern/blob/ + * 3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623 + */ +@Description(name = "singularize", + value = "_FUNC_(string word) - Returns singular form of a given English word") +@UDFType(deterministic = true, stateful = false) +public final class SingularizeUDF extends UDF { + + // sorted by an ascending (i.e., alphabetical) order for binary search + // plural preposition to detect compound words like "plural-preposition-something" + private static final String[] prepositions = new String[] {"about", "above", "across", "after", + "among", "around", "at", "athwart", "before", "behind", "below", "beneath", "beside", + "besides", "between", "betwixt", "beyond", "but", "by", "during", "except", 
"for", + "from", "in", "into", "near", "of", "off", "on", "onto", "out", "over", "since", + "till", "to", "under", "until", "unto", "upon", "with"}; + // uninflected or uncountable words + private static final String[] unchanged = new String[] {"advice", "bison", "bread", "bream", + "breeches", "britches", "butter", "carp", "chassis", "cheese", "christmas", "clippers", + "cod", "contretemps", "corps", "debris", "diabetes", "djinn", "eland", "electricity", + "elk", "equipment", "flounder", "fruit", "furniture", "gallows", "garbage", "georgia", + "graffiti", "gravel", "happiness", "headquarters", "herpes", "high-jinks", "homework", + "information", "innings", "jackanapes", "ketchup", "knowledge", "love", "luggage", + "mackerel", "mathematics", "mayonnaise", "measles", "meat", "mews", "mumps", "mustard", + "news", "news", "pincers", "pliers", "proceedings", "progress", "rabies", "research", + "rice", "salmon", "sand", "scissors", "series", "shears", "software", "species", + "swine", "swiss", "trout", "tuna", "understanding", "water", "whiting", "wildebeest"}; + + private static final Map<String, String> irregular = new HashMap<String, String>(); + static { + irregular.put("atlantes", "atlas"); + irregular.put("atlases", "atlas"); + irregular.put("axes", "axe"); + irregular.put("beeves", "beef"); + irregular.put("brethren", "brother"); + irregular.put("children", "child"); + irregular.put("corpora", "corpus"); + irregular.put("corpuses", "corpus"); + irregular.put("ephemerides", "ephemeris"); + irregular.put("feet", "foot"); + irregular.put("ganglia", "ganglion"); + irregular.put("geese", "goose"); + irregular.put("genera", "genus"); + irregular.put("genii", "genie"); + irregular.put("graffiti", "graffito"); + irregular.put("helves", "helve"); + irregular.put("kine", "cow"); + irregular.put("leaves", "leaf"); + irregular.put("loaves", "loaf"); + irregular.put("men", "man"); + irregular.put("mongooses", "mongoose"); + irregular.put("monies", "money"); + 
irregular.put("moves", "move"); + irregular.put("mythoi", "mythos"); + irregular.put("numena", "numen"); + irregular.put("occipita", "occiput"); + irregular.put("octopodes", "octopus"); + irregular.put("opera", "opus"); + irregular.put("opuses", "opus"); + irregular.put("our", "my"); + irregular.put("oxen", "ox"); + irregular.put("penes", "penis"); + irregular.put("penises", "penis"); + irregular.put("people", "person"); + irregular.put("sexes", "sex"); + irregular.put("soliloquies", "soliloquy"); + irregular.put("teeth", "tooth"); + irregular.put("testes", "testis"); + irregular.put("trilbys", "trilby"); + irregular.put("turves", "turf"); + irregular.put("zoa", "zoon"); + } + + private static final List<String> rules = Arrays.asList( + // regexp1, replacement1, regexp2, replacement2, ... + "(quiz)zes$", "$1", "(matr)ices$", "$1ix", "(vert|ind)ices$", "$1ex", "^(ox)en", "$1", + "(alias|status)$", "$1", "(alias|status)es$", "$1", "(octop|vir)us$", "$1us", + "(octop|vir)i$", "$1us", "(cris|ax|test)es$", "$1is", "(cris|ax|test)is$", "$1is", + "(shoe)s$", "$1", "(o)es$", "$1", "(bus)es$", "$1", "([m|l])ice$", "$1ouse", + "(x|ch|ss|sh)es$", "$1", "(m)ovies$", "$1ovie", "(s)eries$", "$1eries", + "([^aeiouy]|qu)ies$", "$1y", "([lr])ves$", "$1f", "(tive)s$", "$1", "(hive)s$", "$1", + "([^f])ves$", "$1fe", "(^analy)sis$", "$1sis", "(^analy)ses$", "$1sis", + "((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$", "$1$2sis", "([ti])a$", + "$1um", "(n)ews$", "$1ews", "(s|si|u)s$", "$1s", "s$", ""); + + @Nullable + public String evaluate(@Nullable String word) { + return singularize(word); + } + + @Nullable + private String singularize(@Nullable String word) { + if (word == null) { + return null; + } + + if (word.isEmpty()) { + return word; + } + + if (Arrays.binarySearch(unchanged, word) >= 0) { + return word; + } + + if (word.contains("-")) { // compound words (e.g., mothers-in-law) + final List<String> chunks = new ArrayList<>(); + 
chunks.addAll(Arrays.asList(word.split("-"))); + + if ((chunks.size() > 1) && (Arrays.binarySearch(prepositions, chunks.get(1)) >= 0)) { + String head = chunks.remove(0); + return singularize(head) + "-" + StringUtils.join(chunks, "-"); + } + } + + if (word.endsWith("'")) { // dogs' => dog's + return singularize(word.substring(0, word.length() - 1)) + "'s"; + } + + if (irregular.containsKey(word)) { + return irregular.get(word); + } + + for (int i = 0, n = rules.size(); i < n; i += 2) { + Pattern pattern = Pattern.compile(rules.get(i), Pattern.CASE_INSENSITIVE); + Matcher matcher = pattern.matcher(word); + if (matcher.find()) { + return matcher.replaceAll(rules.get(i + 1)); + } + } + + return word; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/core/src/main/java/hivemall/utils/lang/StringUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/lang/StringUtils.java b/core/src/main/java/hivemall/utils/lang/StringUtils.java index 5b66dd1..3652ebd 100644 --- a/core/src/main/java/hivemall/utils/lang/StringUtils.java +++ b/core/src/main/java/hivemall/utils/lang/StringUtils.java @@ -28,7 +28,8 @@ public final class StringUtils { private StringUtils() {} - public static byte[] getBytes(final String s) { + @Nonnull + public static byte[] getBytes(@Nonnull final String s) { final int len = s.length(); final byte[] b = new byte[len * 2]; for (int i = 0; i < len; i++) { @@ -37,11 +38,13 @@ public final class StringUtils { return b; } - public static String toString(byte[] b) { + @Nonnull + public static String toString(@Nonnull final byte[] b) { return toString(b, 0, b.length); } - public static String toString(byte[] b, int off, int len) { + @Nonnull + public static String toString(@Nonnull final byte[] b, final int off, final int len) { final int clen = len >>> 1; final char[] c = new char[clen]; for (int i = 0; i < clen; i++) { @@ -53,11 +56,11 @@ public final class 
StringUtils { /** * Checks whether the String a valid Java number. this code is ported from jakarta commons lang. - * + * * @link http://jakarta.apache.org/commons/lang/apidocs/org/apache/commons/lang * /math/NumberUtils.html */ - public static boolean isNumber(final String str) { + public static boolean isNumber(@Nullable final String str) { if (str == null || str.length() == 0) { return false; } @@ -97,7 +100,7 @@ public final class StringUtils { } else if (chars[i] == '.') { if (hasDecPoint || hasExp) { - // two decimal points or dec in exponent + // two decimal points or dec in exponent return false; } hasDecPoint = true; @@ -170,6 +173,7 @@ public final class StringUtils { buf.setLength(0); } + @Nonnull public static String concat(@Nonnull final List<String> list, @Nonnull final String sep) { final StringBuilder buf = new StringBuilder(128); for (String s : list) { @@ -182,11 +186,29 @@ public final class StringUtils { return buf.toString(); } - public static String[] split(final String str, final char separatorChar) { + @Nonnull + public static String join(@Nonnull final List<String> list, @Nonnull final String sep) { + final StringBuilder buf = new StringBuilder(128); + for (int i = 0, size = list.size(); i < size; i++) { + if (i > 0) { // append separator before each element, except for the head element + buf.append(sep); + } + + final String s = list.get(i); + if (s != null) { + buf.append(s); + } + } + return buf.toString(); + } + + @Nullable + public static String[] split(@Nullable final String str, final char separatorChar) { return split(str, separatorChar, false); } - public static String[] split(final String str, final char separatorChar, + @Nullable + public static String[] split(@Nullable final String str, final char separatorChar, final boolean preserveAllTokens) { if (str == null) { return null; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java 
---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java new file mode 100644 index 0000000..6ea9cc3 --- /dev/null +++ b/core/src/test/java/hivemall/tools/text/SingularizeUDFTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.text; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class SingularizeUDFTest { + + private SingularizeUDF udf; + + @Before + public void setUp() { + this.udf = new SingularizeUDF(); + } + + @Test + public void testNull() { + Assert.assertEquals(null, udf.evaluate(null)); + } + + @Test + public void testEmpty() { + Assert.assertEquals("", udf.evaluate("")); + } + + @Test + public void testUnchanged() { + Assert.assertEquals("christmas", udf.evaluate("christmas")); + } + + @Test + public void testCompound() { + Assert.assertEquals("mother-in-law", udf.evaluate("mothers-in-law")); + } + + @Test + public void testTailSingleQuote() { + Assert.assertEquals("dog's", udf.evaluate("dogs'")); + } + + @Test + public void testIrregular() { + Assert.assertEquals("child", udf.evaluate("children")); + } + + @Test + public void testRule() { + Assert.assertEquals("apple", udf.evaluate("apples")); + Assert.assertEquals("bus", udf.evaluate("buses")); + Assert.assertEquals("candy", udf.evaluate("candies")); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/docs/gitbook/misc/generic_funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index 03e1ef3..9775439 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -239,6 +239,14 @@ The compression level must be in range [-1,9] - `is_stopword(string word)` - Returns whether English stopword or not +- `singularize(string word)` - Returns singular form of a given English word + + ```sql + select singularize(lower("Apples")); + + > "apple" + ``` + - `tokenize(string englishText [, boolean toLowerCase])` - Returns words in array<string> - `tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])` - returns tokenized strings in 
array<string>. Refer [this article](../misc/tokenizer.html) for detail. http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index b1c0075..100fe22 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -538,6 +538,9 @@ CREATE FUNCTION tokenize as 'hivemall.tools.text.TokenizeUDF' USING JAR '${hivem DROP FUNCTION IF EXISTS is_stopword; CREATE FUNCTION is_stopword as 'hivemall.tools.text.StopwordUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS singularize; +CREATE FUNCTION singularize as 'hivemall.tools.text.SingularizeUDF' USING JAR '${hivemall_jar}'; + DROP FUNCTION IF EXISTS split_words; CREATE FUNCTION split_words as 'hivemall.tools.text.SplitWordsUDF' USING JAR '${hivemall_jar}'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index e1933b4..6fb34ca 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -530,6 +530,9 @@ create temporary function tokenize as 'hivemall.tools.text.TokenizeUDF'; drop temporary function if exists is_stopword; create temporary function is_stopword as 'hivemall.tools.text.StopwordUDF'; +drop temporary function if exists singularize; +create temporary function singularize as 'hivemall.tools.text.SingularizeUDF'; + drop temporary function if exists split_words; create temporary function split_words as 'hivemall.tools.text.SplitWordsUDF'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git 
a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index db29d85..d0a1084 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -514,6 +514,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION tokenize AS 'hivemall.tools.text.Token sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS is_stopword") sqlContext.sql("CREATE TEMPORARY FUNCTION is_stopword AS 'hivemall.tools.text.StopwordUDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS singularize") +sqlContext.sql("CREATE TEMPORARY FUNCTION singularize AS 'hivemall.tools.text.SingularizeUDF'") + sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS split_words") sqlContext.sql("CREATE TEMPORARY FUNCTION split_words AS 'hivemall.tools.text.SplitWordsUDF'") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5e1d0d07/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 7c9bfc7..d90cb3c 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -178,6 +178,7 @@ create temporary function train_ffm as 'hivemall.fm.FieldAwareFactorizationMachi create temporary function ffm_predict as 'hivemall.fm.FFMPredictGenericUDAF'; create temporary function add_field_indicies as 'hivemall.ftvec.trans.AddFieldIndicesUDF'; create temporary function to_ordered_list as 'hivemall.tools.list.UDAFToOrderedList'; +create temporary function singularize as 'hivemall.tools.text.SingularizeUDF'; -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
