[systemds] branch master updated: [SYSTEMDS-3036] Stemming function PorterStemmer Functionality added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")

ssiddiqi Thu, 01 Jul 2021 06:17:31 -0700

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new 18f4611  [SYSTEMDS-3036] Stemming function PorterStemmer Functionality 
added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")
18f4611 is described below

commit 18f46113e0eb04720e390a1df9aead6ab8b373bc
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Thu Jul 1 15:00:35 2021 +0200

    [SYSTEMDS-3036] Stemming function PorterStemmer
    Functionality added in map() call.
    Syntax
    output = map(input, "x -> PorterStemmer.stem(x)")
    
    Closes #1319.
---
 .../sysds/runtime/matrix/data/FrameBlock.java      |   1 +
 .../apache/sysds/runtime/util/PorterStemmer.java   | 340 +++++++++++++++++++++
 .../builtin/BuiltinPorterStemmerTest.java          | 101 ++++++
 .../resources/datasets/stemming/dictionary.csv     | 110 +++++++
 .../functions/builtin/porterStemmerTest.dml        |  29 ++
 5 files changed, 581 insertions(+)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 0f77d53..322cfad 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -2212,6 +2212,7 @@ public class FrameBlock implements CacheBlock, 
Externalizable  {
 
                // construct class code
                sb.append("import 
org.apache.sysds.runtime.util.UtilFunctions;\n");
+               sb.append("import 
org.apache.sysds.runtime.util.PorterStemmer;\n");
                sb.append("import 
org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n");
                sb.append("public class " + cname + " extends FrameMapFunction 
{\n");
                if(varname.length == 1) {
diff --git a/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java 
b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
new file mode 100644
index 0000000..bc084fa
--- /dev/null
+++ b/src/main/java/org/apache/sysds/runtime/util/PorterStemmer.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysds.runtime.util;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Stemmer, implementing the Porter Stemming Algorithm
+ *
+ * The PorterStemmer class transforms a word into its root form. The input
+ * word is passed as a whole to the static stem(String) method, which
+ * returns the stemmed word.
+ */
+
+public class PorterStemmer
+{
+       /**
+        * Computes the measure m of a word: the number of consonant sequences
+        * that directly follow a vowel sequence. With c a consonant sequence,
+        * v a vowel sequence and [..] marking optional presence:
+        *
+        *    [c][v]       gives 0
+        *    [c]vc[v]     gives 1
+        *    [c]vcvc[v]   gives 2
+        *    [c]vcvcvc[v] gives 3
+        *
+        * @param word the (partial) word to measure
+        * @return the number of vowel-to-consonant transitions in the word
+        */
+       private static int calcM(String word)
+       {
+               int measure = 0;
+               // a leading consonant never counts, hence "no vowel seen yet"
+               boolean previousWasVowel = false;
+               for(int pos = 0; pos < word.length(); pos++) {
+                       boolean isVowel = !cons(word, pos);
+                       if(!isVowel && previousWasVowel)
+                               measure++;
+                       previousWasVowel = isVowel;
+               }
+               return measure;
+       }
+
+       /**
+        * Checks whether the word ends in a double consonant, i.e. its last
+        * two characters are identical and are consonants.
+        *
+        * @param word the word to inspect
+        * @return true if the last two characters form a double consonant
+        */
+       private static boolean doublec(String word)
+       {
+               int last = word.length() - 1;
+               return last >= 1
+                       && word.charAt(last) == word.charAt(last - 1)
+                       && cons(word, last);
+       }
+
+       /**
+        * Checks whether the word ends in consonant - vowel - consonant where
+        * the final consonant is not w, x or y. This is used when trying to
+        * restore an e at the end of a short word, e.g.
+        *
+        *    cav(e), lov(e), hop(e), crim(e), but
+        *    snow, box, tray.
+        *
+        * @param word the word to inspect
+        * @return true if the last three characters match the cvc pattern
+        */
+       private static boolean cvc(String word)
+       {
+               int n = word.length();
+               if(n < 3)
+                       return false;
+               boolean endsInCvc = cons(word, n - 3) && !cons(word, n - 2) && cons(word, n - 1);
+               if(!endsInCvc)
+                       return false;
+               char last = word.charAt(n - 1);
+               return last != 'w' && last != 'x' && last != 'y';
+       }
+
+       /**
+        * Checks whether the stem, i.e. the word without the given suffix,
+        * contains at least one vowel.
+        *
+        * @param word   the full word
+        * @param suffix the suffix whose length is excluded from the scan
+        * @return true if a vowel occurs before the suffix
+        */
+       private static boolean vowelinStem(String word, String suffix) {
+               int stemEnd = word.length() - suffix.length();
+               int pos = 0;
+               // skip leading consonants; stopping early means a vowel was found
+               while(pos < stemEnd && cons(word, pos))
+                       pos++;
+               return pos < stemEnd;
+       }
+
+       /**
+        * Checks whether the character at position i is a consonant. The
+        * letters a, e, i, o, u are vowels; y is a consonant at position 0 and
+        * otherwise exactly when the preceding character is not a consonant.
+        *
+        * @param stem the word to inspect
+        * @param i    the position of the character
+        * @return true if the character at position i is a consonant
+        */
+       private static boolean cons(String stem, int i)
+       {
+               switch(stem.charAt(i)) {
+                       case 'a':
+                       case 'e':
+                       case 'i':
+                       case 'o':
+                       case 'u':
+                               return false;
+                       case 'y':
+                               return i == 0 || !cons(stem, i - 1);
+                       default:
+                               return true;
+               }
+       }
+       /**
+        * Applies the rule whose suffix is the longest one the word ends with.
+        *
+        * The rule table may contain suffixes that are themselves suffixes of
+        * other entries (e.g. "ational" and "tional"), and the iteration order
+        * of a hash map is unspecified. The longest match is therefore chosen
+        * explicitly instead of taking the first match encountered. The map is
+        * only read, never modified.
+        *
+        * @param word         the word to transform
+        * @param suffixAndfix maps each suffix to its replacement
+        * @param mCount       the measure the remaining stem has to exceed
+        * @return the result of replacer() for the longest matching suffix, or
+        *         null if no suffix in the map matches
+        */
+       private static String processMatched(String word, Map<String, String> suffixAndfix, int mCount)
+       {
+               String longest = null;
+               for(String suffix : suffixAndfix.keySet()) {
+                       if(word.endsWith(suffix) && (longest == null || suffix.length() > longest.length()))
+                               longest = suffix;
+               }
+               if(longest == null)
+                       return null;
+               return replacer(word, longest, suffixAndfix.get(longest), mCount);
+       }
+
+       /**
+        * Replaces the suffix of a word with the suggested replacement if the
+        * remaining stem is long enough.
+        *
+        * @param word    the word to transform
+        * @param orig    the suffix to look for
+        * @param replace the replacement for the suffix
+        * @param mCount  the measure the remaining stem has to exceed
+        * @return null if the word does not end with the suffix; the stem plus
+        *         replacement if the stem's measure exceeds mCount; otherwise
+        *         the unchanged word
+        */
+       private static String replacer(String word, String orig, String replace, int mCount)
+       {
+               if(!word.endsWith(orig))
+                       return null;
+               String stem = word.substring(0, word.length() - orig.length());
+               return (calcM(stem) > mCount) ? stem.concat(replace) : word;
+       }
+
+       /**
+        * Step 1: gets rid of plurals and -ed or -ing, i.e.
+        * condition and suffix -> replacement
+        *
+        *    SSES -> SS
+        *    IES  -> I
+        *    SS   -> SS
+        *    S    -> ""
+        *    (m > 0) EED -> EE
+        *    (vowel in stem) ED  -> ""
+        *    (vowel in stem) ING -> ""
+        *
+        * After -ed or -ing was removed:
+        *
+        *    ending in at, bl, iz -> add e
+        *    double consonant other than l, s, z -> remove the last letter
+        *    (m == 1 and cvc) -> add e
+        *
+        * Finally a terminal y is turned into i when there is another vowel in
+        * the stem.
+        *
+        * @param word the word to transform
+        * @return the word after step 1
+        */
+       private static String step1(String word)
+       {
+               // plurals; "sses" and "ies" are tested before the plain "s"
+               if(word.endsWith("sses"))
+                       word = StringUtils.removeEnd(word, "es");
+               else if(word.endsWith("ies"))
+                       word = StringUtils.removeEnd(word, "ies").concat("i");
+               else if(word.endsWith("s") && !word.endsWith("ss"))
+                       word = StringUtils.removeEnd(word, "s");
+
+               // -eed, -ed, -ing
+               boolean suffixStripped = false;
+               if(word.endsWith("eed")) {
+                       // measured on the whole word, hence the threshold of 1
+                       if(calcM(word) > 1)
+                               word = StringUtils.removeEnd(word, "d");
+               }
+               else if(word.endsWith("ed") && vowelinStem(word, "ed")) {
+                       word = StringUtils.removeEnd(word, "ed");
+                       suffixStripped = true;
+               }
+               else if(word.endsWith("ing") && vowelinStem(word, "ing")) {
+                       word = StringUtils.removeEnd(word, "ing");
+                       suffixStripped = true;
+               }
+
+               // repair the stem after -ed / -ing was removed
+               if(suffixStripped) {
+                       boolean needsE = word.endsWith("at") || word.endsWith("bl") || word.endsWith("iz");
+                       if(needsE)
+                               word = word.concat("e");
+                       int m = calcM(word);
+                       char last = word.charAt(word.length() - 1);
+                       boolean isLsz = last == 'l' || last == 's' || last == 'z';
+                       if(doublec(word) && !isLsz)
+                               word = word.substring(0, word.length() - 1);
+                       else if(m == 1 && cvc(word))
+                               word = word.concat("e");
+               }
+
+               // terminal y -> i
+               if(word.endsWith("y") && vowelinStem(word, "y"))
+                       word = StringUtils.removeEnd(word, "y").concat("i");
+
+               return word;
+       }
+
+       /**
+        * Step 2: maps double suffixes to single ones, e.g. -ization becomes
+        * -ize, provided the stem before the suffix has a measure m > 0.
+        *
+        * The rule for -logi is "logi -> log" (e.g. "archaeologi" becomes
+        * "archaeolog"); the reverse mapping would append an "i" to every
+        * word ending in -log. The -icate, -ative, -alize, -iciti and -ical
+        * rules are not listed here because step3() handles them.
+        *
+        * @param word the word to transform
+        * @return the word after step 2
+        */
+       private static String step2(String word) {
+               if(word.isEmpty())
+                       return word;
+               // a fresh map per call, since processMatched() receives it as an argument
+               HashMap<String, String> suffixAndfix = new HashMap<String, String>();
+               suffixAndfix.put("ational", "ate");
+               suffixAndfix.put("tional", "tion");
+               suffixAndfix.put("enci", "ence");
+               suffixAndfix.put("anci", "ance");
+               suffixAndfix.put("izer", "ize");
+               suffixAndfix.put("bli", "ble");
+               suffixAndfix.put("alli", "al");
+               suffixAndfix.put("entli", "ent");
+               suffixAndfix.put("eli", "e");
+               suffixAndfix.put("ousli", "ous");
+               suffixAndfix.put("ization", "ize");
+               suffixAndfix.put("ation", "ate");
+               suffixAndfix.put("ator", "ate");
+               suffixAndfix.put("alism", "al");
+               suffixAndfix.put("iveness", "ive");
+               suffixAndfix.put("fulness", "ful");
+               suffixAndfix.put("ousness", "ous");
+               suffixAndfix.put("aliti", "al");
+               suffixAndfix.put("iviti", "ive");
+               suffixAndfix.put("biliti", "ble");
+               suffixAndfix.put("logi", "log");
+
+               String stemmed = processMatched(word, suffixAndfix, 0);
+               return (stemmed != null) ? stemmed : word;
+       }
+       /**
+        * Step 3: handles -ic-, -ful, -ness etc. A suffix is only replaced if
+        * the stem before it has a measure m > 0.
+        *
+        * @param word the word to transform
+        * @return the word after step 3
+        */
+       private static String step3(String word) {
+               if(word.isEmpty())
+                       return word;
+
+               HashMap<String, String> rules = new HashMap<String, String>();
+               rules.put("icate", "ic");
+               rules.put("ative", "");
+               rules.put("alize", "al");
+               rules.put("iciti", "ic");
+               rules.put("ical", "ic");
+               rules.put("ful", "");
+               rules.put("ness", "");
+
+               String result = processMatched(word, rules, 0);
+               if(result == null)
+                       return word;
+               return result;
+       }
+
+       /**
+        * Step 4: takes off -ant, -ence etc. when the stem before the suffix
+        * has a measure m > 1. The suffixes are tried in a fixed order and the
+        * first one the word ends with decides the outcome. The suffix -ion is
+        * only considered when it is preceded by s or t.
+        *
+        * @param word the word to transform
+        * @return the word after step 4
+        */
+       private static String step4(String word)
+       {
+               String result = null;
+
+               // first group of suffixes
+               for(String candidate : new String[] {"al", "ance", "ence", "er", "ic", "able", "ible", "ant",
+                       "ement", "ment", "ent"}) {
+                       result = replacer(word, candidate, "", 1);
+                       if(result != null)
+                               break;
+               }
+
+               // -ion requires a preceding s or t
+               if(result == null && word.length() > 4) {
+                       char before = word.charAt(word.length() - 4);
+                       if(before == 's' || before == 't')
+                               result = replacer(word, "ion", "", 1);
+               }
+
+               // remaining suffixes
+               if(result == null) {
+                       for(String candidate : new String[] {"ou", "ism", "ate", "iti", "ous", "ive", "ize"}) {
+                               result = replacer(word, candidate, "", 1);
+                               if(result != null)
+                                       break;
+                       }
+               }
+
+               return (result == null) ? word : result;
+       }
+       /**
+        * Step 5: handles the last e and l. A final e is removed if the
+        * measure is greater than 1, or equal to 1 while the stem without the
+        * e does not end in cvc. A final double l is reduced to a single l if
+        * the measure is greater than 1.
+        *
+        * @param word the word to transform
+        * @return the word after step 5
+        */
+       private static String step5(String word)
+       {
+               if(word.endsWith("e")) {
+                       String withoutE = StringUtils.removeEnd(word, "e");
+                       // a trailing e is a vowel and does not change the measure,
+                       // so one measurement covers both removal conditions
+                       int m = calcM(word);
+                       if(m > 1 || (m == 1 && !cvc(withoutE)))
+                               word = withoutE;
+               }
+
+               boolean endsInDoubleL = word.endsWith("l") && doublec(word);
+               if(endsInDoubleL && calcM(word) > 1)
+                       word = word.substring(0, word.length() - 1);
+
+               return word;
+       }
+       /**
+        * Stems a word by running steps 1 to 5 of the Porter algorithm.
+        *
+        * A null input and words shorter than three characters are returned
+        * unchanged.
+        *
+        * @param word the word to stem, may be null
+        * @return the stemmed word, or the input itself if it is null or
+        *         shorter than three characters
+        */
+       public static String stem (String word)
+       {
+               // guard against missing values instead of failing with a NullPointerException
+               if(word == null || word.length() < 3)
+                       return word;
+
+               word = step1(word);
+               word = step2(word);
+               word = step3(word);
+               word = step4(word);
+               // step5 is skipped for an empty word
+               if(!word.isEmpty())
+                       word = step5(word);
+               return word;
+       }
+}
\ No newline at end of file
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
new file mode 100644
index 0000000..ac13190
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinPorterStemmerTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.runtime.matrix.data.FrameBlock;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.IOException;
+
+/**
+ * Runs the porterStemmerTest DML script on the stemming dictionary and
+ * compares the stemmer output with the expected stems of the dictionary.
+ */
+public class BuiltinPorterStemmerTest extends AutomatedTestBase {
+
+       private final static String TEST_NAME = "porterStemmerTest";
+       private final static String TEST_DIR = "functions/builtin/";
+       private static final String TEST_CLASS_DIR = BuiltinPorterStemmerTest.class.getSimpleName() + "/";
+       private static final String INPUT = DATASET_DIR + "stemming/dictionary.csv";
+
+       @BeforeClass
+       public static void init() {
+               TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+       }
+
+       @AfterClass
+       public static void cleanUp() {
+               if (TEST_CACHE_ENABLED) {
+                       TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+               }
+       }
+
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[] {"D"}));
+               if (TEST_CACHE_ENABLED) {
+                       setOutAndExpectedDeletionDisabled(true);
+               }
+       }
+       @Test
+       public void testStemmerCP() {
+               runStemmerTest(Types.ExecMode.SINGLE_NODE);
+       }
+
+       @Test
+       public void testStemmerSpark() {
+               runStemmerTest(Types.ExecMode.SPARK);
+       }
+
+       /**
+        * Executes the script in the given execution mode and counts how many
+        * stemmed words equal their expected stem.
+        *
+        * @param et the execution mode to run the script in
+        */
+       private void runStemmerTest(Types.ExecMode et)
+       {
+               Types.ExecMode modeOld = setExecMode(et);
+               try {
+                       loadTestConfiguration(getTestConfiguration(TEST_NAME));
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+
+                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                       programArgs = new String[] {"-args", INPUT, output("S"), output("E")};
+
+                       runTest(true, EXCEPTION_NOT_EXPECTED, null, -1);
+                       // "S" holds the stemmer output, "E" the expected stems
+                       FrameBlock stemmedFrame = readDMLFrameFromHDFS("S", Types.FileFormat.CSV);
+                       FrameBlock expectedFrame = readDMLFrameFromHDFS("E", Types.FileFormat.CSV);
+                       String[] stemmed = (String[])stemmedFrame.getColumnData(0);
+                       String[] expected = (String[])expectedFrame.getColumnData(0);
+                       //expected vs stemmer output
+                       int count = 0;
+                       for(int i = 0; i<expected.length; i++) {
+                               if(expected[i].equals(stemmed[i]))
+                                       count++;
+                       }
+                       // the delta of 10 tolerates a few words the stemmer handles differently
+                       Assert.assertEquals(110, count, 10);
+               }
+               catch(IOException e) {
+                       // fail the test instead of passing silently without any assertion
+                       throw new AssertionError("Failed to read the stemmer test results", e);
+               }
+               finally {
+                       resetExecMode(modeOld);
+               }
+       }
+}
diff --git a/src/test/resources/datasets/stemming/dictionary.csv 
b/src/test/resources/datasets/stemming/dictionary.csv
new file mode 100644
index 0000000..ec79e4e
--- /dev/null
+++ b/src/test/resources/datasets/stemming/dictionary.csv
@@ -0,0 +1,110 @@
+Words,Stemmed Equivalents
+caresses,caress
+ponies,poni
+ties,ti
+caress,caress
+cats,cat
+feed,feed
+agreed,agre
+disabled,disabl
+matting,mat
+mating,mate
+meeting,meet
+milling,mill
+messing,mess
+meetings,meet
+abbess,abbess
+abbey,abbei
+abbeys,abbei
+abbominable,abbomin
+abbot,abbot
+abbots,abbot
+abbreviated,abbrevi
+abdication,abdic
+abduction,abduct
+abed,ab
+abel,abel
+aberga,aberga
+abergavenny,abergavenni
+abet,abet
+abetting,abet
+abhominable,abhomin
+abhor,abhor
+abhorr,abhorr
+abhorred,abhor
+abhorrence,abhorr
+abhorring,abhor
+abhors,abhor
+abhorson,abhorson
+abide,abid
+abides,abid
+abilities,abil
+ability,abil
+abingdon,abingdon
+abject,abject
+abjectly,abjectli
+abjects,abject
+abjur,abjur
+abjure,abjur
+abler,abler
+ablest,ablest
+aboard,aboard
+accomplishment,accomplish
+boiling,boil
+boils,boil
+booths,booth
+booties,booti
+boundary,boundari
+bounded,bound
+cuts,cut
+cytherea,cytherea
+dawdling,dawdl
+deception,decept
+matting,mat
+management,manag
+manager,manag
+managing,manag
+manakin,manakin
+manasseh,manasseh
+manchen,manchen
+manchester,manchest
+manchus,manchu
+mandate,mandat
+mandragora,mandragora
+mandrake,mandrak
+mandrakes,mandrak
+mane,mane
+manent,manent
+manes,mane
+manet,manet
+manfully,manfulli
+mangelwurzel,mangelwurzel
+mangle,mangl
+mangled,mangl
+mangles,mangl
+mangling,mangl
+mangnall,mangnal
+mango,mango
+mangoes,mango
+mangy,mangi
+manhood,manhood
+manhoods,manhood
+mania,mania
+manifest,manifest
+manifested,manifest
+wived,wive
+wonderfully,wonderfulli
+wondering,wonder
+wonders,wonder
+worms,worm
+wormwood,wormwood
+wormy,wormi
+worn,worn
+worret,worret
+worried,worri
+worries,worri
+worry,worri
+worrying,worri
+worser,worser
+worship,worship
+worshipful,worship
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/porterStemmerTest.dml 
b/src/test/scripts/functions/builtin/porterStemmerTest.dml
new file mode 100644
index 0000000..70fb877
--- /dev/null
+++ b/src/test/scripts/functions/builtin/porterStemmerTest.dml
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# the input frame contains two columns: the actual words and their stemmed equivalents
+words = read($1, data_type = "frame", header = TRUE, format = "csv")
+stemmed = map(words[, 1], "x -> PorterStemmer.stem(x)")
+# write the stemmer output for the first column (the actual words)
+write(stemmed, $2, format = "csv")
+# write the second column (the dictionary stems) so the Java test can compare both
+equ = words[, 2]
+write(equ, $3, format = "csv")
\ No newline at end of file

[systemds] branch master updated: [SYSTEMDS-3036] Stemming function PorterStemmer Functionality added in map() call. Syntax output = map(input, "x -> x.PorterStemmer.stem(x)")

Reply via email to