Hi Giulio, yes, it is implemented in 100% pure Vala!
(I listed the code in plain text in my second posting on this subject) Serge. On Tue, Sep 17, 2013 at 3:31 PM, Giulio Paci <[email protected]> wrote: > Hi Serge, > thank you for sharing your code. May I ask why you implemented it in > pure Vala? > > Just for the sake of completeness, if anybody is interested in this kind > of tools for Vala, at > https://github.com/nemequ/vala-extra-vapis/blob/master/libstemmer.vapi > there are also the Vala bindings for the snowball stemmer library > (http://snowball.tartarus.org/). > > Bests, > Giulio. > > On 17/09/2013 11:33, Serge Hulne wrote: >> A more detailed description: >> >> http://tartarus.org/martin/PorterStemmer/ >> >> http://tartarus.org/martin/PorterStemmer/def.txt >> >> >> Serge Hulne. >> >> >> PS: The code as plain text: >> >> ------------------------ >> using GLib; >> using Posix; >> >> >> class Stemmer { >> private char[] b; >> private int i; /* offset into b */ >> private int i_end; /* offset to end of stemmed word */ >> private int j; >> private int k; >> private static int INC = 50; >> /* unit of size whereby b is increased */ >> >> >> public Stemmer() { >> b = new char[INC]; >> i = 0; >> i_end = 0; >> } >> >> /** >> * Add a character to the word being stemmed. When you are finished >> * adding characters, you can call stem(void) to stem the word. >> */ >> >> public void add(char ch) { >> if (i == b.length) { >> char[] new_b = new char[i+INC]; >> for (int c = 0; c < i; c++) >> new_b[c] = b[c]; >> b = new_b; >> } >> b[i++] = ch; >> } >> >> >> /** Adds wLen characters to the word being stemmed contained in a >> portion >> * of a char[] array. This is like repeated calls of add(char ch), but >> * faster. 
>> */ >> >> public void add2(char[] w, int wLen) { >> if (i+wLen >= b.length) { >> char[] new_b = new char[i+wLen+INC]; >> for (int c = 0; c < i; c++) >> new_b[c] = b[c]; >> b = new_b; >> } >> for (int c = 0; c < wLen; c++) >> b[i++] = w[c]; >> } >> >> /** >> * After a word has been stemmed, it can be retrieved by toString(), >> * or a reference to the internal buffer can be retrieved by >> getResultBuffer >> * and getResultLength (which is generally more efficient.) >> */ >> public string ToString() { >> var s = (string) b; >> //return new string(b,0,i_end); >> return s.slice(0,i_end); >> } >> >> /** >> * Returns the length of the word resulting from the stemming process. >> */ >> public int getResultLength() { >> return i_end; >> } >> >> /** >> * Returns a reference to a character buffer containing the results of >> * the stemming process. You also need to consult getResultLength() >> * to determine the length of the result. >> */ >> public char[] getResultBuffer() { >> return b; >> } >> >> /* cons(i) is true <=> b[i] is a consonant. */ >> private bool cons(int i) { >> switch (b[i]) { >> case 'a': >> case 'e': >> case 'i': >> case 'o': >> case 'u': >> return false; >> case 'y': >> return (i==0) ? true : !cons(i-1); >> default: >> return true; >> } >> } >> >> /* m() measures the number of consonant sequences between 0 and j. if c >> is >> a consonant sequence and v a vowel sequence, and <..> indicates >> arbitrary >> presence, >> >> <c><v> gives 0 >> <c>vc<v> gives 1 >> <c>vcvc<v> gives 2 >> <c>vcvcvc<v> gives 3 >> .... >> */ >> private int m() { >> int n = 0; >> int i = 0; >> while(true) { >> if (i > j) return n; >> if (! cons(i)) break; >> i++; >> } >> i++; >> while(true) { >> while(true) { >> if (i > j) return n; >> if (cons(i)) break; >> i++; >> } >> i++; >> n++; >> while(true) { >> if (i > j) return n; >> if (! 
cons(i)) break; >> i++; >> } >> i++; >> } >> } >> >> /* vowelinstem() is true <=> 0,...j contains a vowel */ >> private bool vowelinstem() { >> int i; >> for (i = 0; i <= j; i++) >> if (! cons(i)) >> return true; >> return false; >> } >> >> /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ >> private bool doublec(int j) { >> if (j < 1) >> return false; >> if (b[j] != b[j-1]) >> return false; >> return cons(j); >> } >> >> /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - >> consonant >> and also if the second c is not w,x or y. this is used when trying to >> restore an e at the end of a short word. e.g. >> >> cav(e), lov(e), hop(e), crim(e), but >> snow, box, tray. >> >> */ >> private bool cvc(int i) { >> if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) >> return false; >> int ch = b[i]; >> if (ch == 'w' || ch == 'x' || ch == 'y') >> return false; >> return true; >> } >> >> private bool ends(string s) { >> int l = s.length; >> int o = k-l+1; >> if (o < 0) >> return false; >> char[] sc = (char[]) s.data; >> for (int i = 0; i < l; i++) >> if (b[o+i] != sc[i]) >> return false; >> j = k-l; >> return true; >> } >> >> /* setto(s) sets (j+1),...k to the characters in the string s, >> readjusting >> k. */ >> private void setto(string s) { >> int l = s.length; >> int o = j+1; >> char[] sc = (char[]) s.data; >> for (int i = 0; i < l; i++) >> b[o+i] = sc[i]; >> k = j+l; >> } >> >> /* r(s) is used further down. */ >> private void r(string s) { >> if (m() > 0) >> setto(s); >> } >> >> /* step1() gets rid of plurals and -ed or -ing. e.g. 
>> caresses -> caress >> ponies -> poni >> ties -> ti >> caress -> caress >> cats -> cat >> >> feed -> feed >> agreed -> agree >> disabled -> disable >> >> matting -> mat >> mating -> mate >> meeting -> meet >> milling -> mill >> messing -> mess >> >> meetings -> meet >> >> */ >> >> private void step1() { >> //print("->step1()\n"); >> if (b[k] == 's') { >> if (ends("sses")) >> k -= 2; >> else if (ends("ies")) >> //setto(""); >> setto(""); >> else if (b[k-1] != 's') >> k--; >> } >> if (ends("eed")) { >> if (m() > 0) >> k--; >> } else if ((ends("ed") || ends("ing")) && vowelinstem()) { >> k = j; >> if (ends("at")) >> setto("ate"); >> else if (ends("bl")) >> setto("ble"); >> else if (ends("iz")) >> setto("ize"); >> else if (doublec(k)) { >> k--; >> int ch = b[k]; >> if (ch == 'l' || ch == 's' || ch == 'z') >> k++; >> } >> else if (m() == 1 && cvc(k)) setto("e"); >> } >> } >> >> /* step2() turns terminal y to i when there is another vowel in the >> stem. */ >> private void step2() { >> //print("->step2()\n"); >> if (ends("y") && vowelinstem()) >> b[k] = 'i'; >> } >> >> /* step3() maps double suffices to single ones. so -ization ( = -ize >> plus >> -ation) maps to -ize etc. note that the string before the suffix >> must give >> m() > 0. 
*/ >> private void step3() { >> //print("->step3()\n"); >> if (k == 0) >> return; >> >> /* For Bug 1 */ >> switch (b[k-1]) { >> case 'a': >> if (ends("ational")) { >> r("ate"); >> break; >> } >> if (ends("tional")) { >> r("tion"); >> break; >> } >> break; >> case 'c': >> if (ends("enci")) { >> r("ence"); >> break; >> } >> if (ends("anci")) { >> r("ance"); >> break; >> } >> break; >> case 'e': >> if (ends("izer")) { >> r("ize"); >> break; >> } >> break; >> case 'l': >> if (ends("bli")) { >> r("ble"); >> break; >> } >> if (ends("alli")) { >> r("al"); >> break; >> } >> if (ends("entli")) { >> r("ent"); >> break; >> } >> if (ends("eli")) { >> r("e"); >> break; >> } >> if (ends("ousli")) { >> r("ous"); >> break; >> } >> break; >> case 'o': >> if (ends("ization")) { >> r("ize"); >> break; >> } >> if (ends("ation")) { >> r("ate"); >> break; >> } >> if (ends("ator")) { >> r("ate"); >> break; >> } >> break; >> case 's': >> if (ends("alism")) { >> r("al"); >> break; >> } >> if (ends("iveness")) { >> r("ive"); >> break; >> } >> if (ends("fulness")) { >> r("ful"); >> break; >> } >> if (ends("ousness")) { >> r("ous"); >> break; >> } >> break; >> case 't': >> if (ends("aliti")) { >> r("al"); >> break; >> } >> if (ends("iviti")) { >> r("ive"); >> break; >> } >> if (ends("biliti")) { >> r("ble"); >> break; >> } >> break; >> case 'g': >> if (ends("logi")) { >> r("log"); >> break; >> } >> break; >> default : >> break; >> } >> } >> >> /* step4() deals with -ic-, -full, -ness etc. similar strategy to >> step3. 
*/ >> private void step4() { >> //print("->step4()\n"); >> switch (b[k]) { >> case 'e': >> if (ends("icate")) { >> r("ic"); >> break; >> } >> if (ends("ative")) { >> r(""); >> break; >> } >> if (ends("alize")) { >> r("al"); >> break; >> } >> break; >> case 'i': >> if (ends("iciti")) { >> r("ic"); >> break; >> } >> break; >> case 'l': >> if (ends("ical")) { >> r("ic"); >> break; >> } >> if (ends("ful")) { >> r(""); >> break; >> } >> break; >> case 's': >> if (ends("ness")) { >> r(""); >> break; >> } >> break; >> } >> } >> >> /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ >> private void step5() { >> //print("->step5()\n"); >> if (k == 0) >> return; >> >> /* for Bug 1 */ >> switch ( b[k-1] ) { >> case 'a': >> if (ends("al")) break; >> return; >> case 'c': >> if (ends("ance")) break; >> if (ends("ence")) break; >> return; >> case 'e': >> if (ends("er")) break; >> return; >> case 'i': >> if (ends("ic")) break; >> return; >> case 'l': >> if (ends("able")) break; >> if (ends("ible")) break; >> return; >> case 'n': >> if (ends("ant")) break; >> if (ends("ement")) break; >> if (ends("ment")) break; >> /* element etc. not stripped before the m */ >> if (ends("ent")) break; >> return; >> case 'o': >> if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) >> break; >> /* j >= 0 fixes Bug 2 */ >> if (ends("ou")) break; >> return; >> /* takes care of -ous */ >> case 's': >> if (ends("ism")) break; >> return; >> case 't': >> if (ends("ate")) break; >> if (ends("iti")) break; >> return; >> case 'u': >> if (ends("ous")) break; >> return; >> case 'v': >> if (ends("ive")) break; >> return; >> case 'z': >> if (ends("ize")) break; >> return; >> default: >> return; >> } >> if (m() > 1) >> k = j; >> } >> >> /* step6() removes a final -e if m() > 1. 
*/ >> private void step6() { >> //print("->step6()\n"); >> j = k; >> >> if (b[k] == 'e') { >> int a = m(); >> if (a > 1 || a == 1 && !cvc(k-1)) >> k--; >> } >> if (b[k] == 'l' && doublec(k) && m() > 1) >> k--; >> } >> >> /** Stem the word placed into the Stemmer buffer through calls to add(). >> * Returns true if the stemming process resulted in a word different >> * from the input. You can retrieve the result with >> * getResultLength()/getResultBuffer() or toString(). >> */ >> public void stem() { >> k = i - 1; >> if (k > 1) { >> step1(); >> step2(); >> step3(); >> step4(); >> step5(); >> step6(); >> } >> i_end = k+1; >> i = 0; >> } >> } >> >> >> /** Test program for demonstrating the Stemmer. It reads text from a >> * a list of files, stems each word, and writes the result to standard >> * output. Note that the word stemmed is expected to be in lower case: >> * forcing lower case must be done outside the Stemmer class. >> * Usage: Stemmer file-name file-name ... >> */ >> >> >> /* >> int main(string[] args) { >> //print("Hello\n"); >> var s = new Stemmer(); >> >> >> >> if (args.length>1){ >> var w = args[1]; >> //print("w = %s\n", w); >> s.add2((char[])w.data, w.length); >> s.stem(); >> //print("%s\n", s.ToString()); >> } >> >> return 0; >> } >> */ >> ------------------------ >> _______________________________________________ >> vala-list mailing list >> [email protected] >> https://mail.gnome.org/mailman/listinfo/vala-list > > _______________________________________________ > vala-list mailing list > [email protected] > https://mail.gnome.org/mailman/listinfo/vala-list _______________________________________________ vala-list mailing list [email protected] https://mail.gnome.org/mailman/listinfo/vala-list
