Re: [Vala] Port of the Porter stemmer to Vala (more detailed description)

Giulio Paci Tue, 17 Sep 2013 06:32:02 -0700

Hi Serge,
        thank you for sharing your code. May I ask why you implemented it in
pure Vala?


Just for the sake of completeness, if anybody is interested in this kind
of tools for Vala, at
https://github.com/nemequ/vala-extra-vapis/blob/master/libstemmer.vapi
there are also the Vala bindings for the snowball stemmer library
(http://snowball.tartarus.org/).

Bests,
        Giulio.

On 17/09/2013 11:33, Serge Hulne wrote:
> A more detailed description:
> 
> http://tartarus.org/martin/PorterStemmer/
> 
> http://tartarus.org/martin/PorterStemmer/def.txt
> 
> 
> Serge Hulne.
> 
> 
> PS: The code as plain text:
> 
> ------------------------
> using GLib;
> using Posix;
> 
> 
> class Stemmer {
>     private char[] b;
>     private int i;                /* offset into b */
>     private int    i_end;          /* offset to end of stemmed word */
>     private int    j;
>     private int k;
>     private static int INC = 50;
>     /* unit of size whereby b is increased */
> 
> 
>     public Stemmer() {
>         b = new char[INC];
>         i = 0;
>         i_end = 0;
>     }
> 
>     /**
>      * Add a character to the word being stemmed.  When you are finished
>      * adding characters, you can call stem(void) to stem the word.
>      */
> 
>     public void add(char ch) {
>         if (i == b.length) {
>             char[] new_b = new char[i+INC];
>             for (int c = 0; c < i; c++)
>                 new_b[c] = b[c];
>             b = new_b;
>         }
>         b[i++] = ch;
>     }
> 
> 
>     /** Adds wLen characters to the word being stemmed contained in a
> portion
>      * of a char[] array. This is like repeated calls of add(char ch), but
>      * faster.
>      */
> 
>     public void add2(char[] w, int wLen) {
>         if (i+wLen >= b.length) {
>             char[] new_b = new char[i+wLen+INC];
>             for (int c = 0; c < i; c++)
>                 new_b[c] = b[c];
>             b = new_b;
>         }
>         for (int c = 0; c < wLen; c++)
>             b[i++] = w[c];
>     }
> 
>     /**
>      * After a word has been stemmed, it can be retrieved by toString(),
>      * or a reference to the internal buffer can be retrieved by
> getResultBuffer
>      * and getResultLength (which is generally more efficient.)
>      */
>     public string ToString() {
>         var s = (string) b;
>         //return new string(b,0,i_end);
>         return s.slice(0,i_end);
>     }
> 
>     /**
>      * Returns the length of the word resulting from the stemming process.
>      */
>     public int getResultLength() {
>         return i_end;
>     }
> 
>     /**
>      * Returns a reference to a character buffer containing the results of
>      * the stemming process.  You also need to consult getResultLength()
>      * to determine the length of the result.
>      */
>     public char[] getResultBuffer() {
>         return b;
>     }
> 
>     /* cons(i) is true <=> b[i] is a consonant. */
>     private bool cons(int i) {
>         switch (b[i]) {
>         case 'a':
>         case 'e':
>         case 'i':
>         case 'o':
>         case 'u':
>             return false;
>         case 'y':
>             return (i==0) ? true : !cons(i-1);
>         default:
>             return true;
>         }
>     }
> 
>     /* m() measures the number of consonant sequences between 0 and j. if c
> is
>        a consonant sequence and v a vowel sequence, and <..> indicates
> arbitrary
>        presence,
> 
>           <c><v>       gives 0
>           <c>vc<v>     gives 1
>           <c>vcvc<v>   gives 2
>           <c>vcvcvc<v> gives 3
>           ....
>     */
>     private int m() {
>         int n = 0;
>         int i = 0;
>         while(true) {
>             if (i > j) return n;
>             if (! cons(i)) break;
>             i++;
>         }
>         i++;
>         while(true) {
>             while(true) {
>                 if (i > j) return n;
>                 if (cons(i)) break;
>                 i++;
>             }
>             i++;
>             n++;
>             while(true) {
>                 if (i > j) return n;
>                 if (! cons(i)) break;
>                 i++;
>             }
>             i++;
>         }
>     }
> 
>     /* vowelinstem() is true <=> 0,...j contains a vowel */
>     private bool vowelinstem() {
>         int i;
>         for (i = 0; i <= j; i++)
>             if (! cons(i))
>                 return true;
>         return false;
>     }
> 
>     /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
>     private bool doublec(int j) {
>         if (j < 1)
>             return false;
>         if (b[j] != b[j-1])
>             return false;
>         return cons(j);
>     }
> 
>     /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel -
> consonant
>        and also if the second c is not w,x or y. this is used when trying to
>        restore an e at the end of a short word. e.g.
> 
>           cav(e), lov(e), hop(e), crim(e), but
>           snow, box, tray.
> 
>     */
>     private bool cvc(int i) {
>         if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2))
>             return false;
>         int ch = b[i];
>         if (ch == 'w' || ch == 'x' || ch == 'y')
>             return false;
>         return true;
>     }
> 
>     private bool ends(string s) {
>         int l = s.length;
>         int o = k-l+1;
>         if (o < 0)
>             return false;
>         char[] sc = (char[]) s.data;
>         for (int i = 0; i < l; i++)
>             if (b[o+i] != sc[i])
>                 return false;
>         j = k-l;
>         return true;
>     }
> 
>     /* setto(s) sets (j+1),...k to the characters in the string s,
> readjusting
>        k. */
>     private void setto(string s) {
>         int l = s.length;
>         int o = j+1;
>         char[] sc = (char[]) s.data;
>         for (int i = 0; i < l; i++)
>             b[o+i] = sc[i];
>         k = j+l;
>     }
> 
>     /* r(s) is used further down. */
>     private void r(string s) {
>         if (m() > 0)
>             setto(s);
>     }
> 
>     /* step1() gets rid of plurals and -ed or -ing. e.g.
>            caresses  ->  caress
>            ponies    ->  poni
>            ties      ->  ti
>            caress    ->  caress
>            cats      ->  cat
> 
>            feed      ->  feed
>            agreed    ->  agree
>            disabled  ->  disable
> 
>            matting   ->  mat
>            mating    ->  mate
>            meeting   ->  meet
>            milling   ->  mill
>            messing   ->  mess
> 
>            meetings  ->  meet
> 
>     */
> 
>     private void step1() {
>         //print("->step1()\n");
>         if (b[k] == 's') {
>             if (ends("sses"))
>                 k -= 2;
>             else if (ends("ies"))
>                 //setto("");
>                 setto("");
>             else if (b[k-1] != 's')
>                 k--;
>         }
>         if (ends("eed")) {
>             if (m() > 0)
>                 k--;
>         } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
>             k = j;
>             if (ends("at"))
>                 setto("ate");
>             else if (ends("bl"))
>                 setto("ble");
>             else if (ends("iz"))
>                 setto("ize");
>             else if (doublec(k)) {
>                 k--;
>                 int ch = b[k];
>                 if (ch == 'l' || ch == 's' || ch == 'z')
>                     k++;
>             }
>             else if (m() == 1 && cvc(k)) setto("e");
>         }
>     }
> 
>     /* step2() turns terminal y to i when there is another vowel in the
> stem. */
>     private void step2() {
>         //print("->step2()\n");
>         if (ends("y") && vowelinstem())
>             b[k] = 'i';
>     }
> 
>     /* step3() maps double suffices to single ones. so -ization ( = -ize
> plus
>        -ation) maps to -ize etc. note that the string before the suffix
> must give
>        m() > 0. */
>     private void step3() {
>         //print("->step3()\n");
>         if (k == 0)
>             return;
> 
>         /* For Bug 1 */
>         switch (b[k-1]) {
>         case 'a':
>             if (ends("ational")) {
>                 r("ate");
>                 break;
>             }
>             if (ends("tional")) {
>                 r("tion");
>                 break;
>             }
>             break;
>         case 'c':
>             if (ends("enci")) {
>                 r("ence");
>                 break;
>             }
>             if (ends("anci")) {
>                 r("ance");
>                 break;
>             }
>             break;
>         case 'e':
>             if (ends("izer")) {
>                 r("ize");
>                 break;
>             }
>             break;
>         case 'l':
>             if (ends("bli")) {
>                 r("ble");
>                 break;
>             }
>             if (ends("alli")) {
>                 r("al");
>                 break;
>             }
>             if (ends("entli")) {
>                 r("ent");
>                 break;
>             }
>             if (ends("eli")) {
>                 r("e");
>                 break;
>             }
>             if (ends("ousli")) {
>                 r("ous");
>                 break;
>             }
>             break;
>         case 'o':
>             if (ends("ization")) {
>                 r("ize");
>                 break;
>             }
>             if (ends("ation")) {
>                 r("ate");
>                 break;
>             }
>             if (ends("ator")) {
>                 r("ate");
>                 break;
>             }
>             break;
>         case 's':
>             if (ends("alism")) {
>                 r("al");
>                 break;
>             }
>             if (ends("iveness")) {
>                 r("ive");
>                 break;
>             }
>             if (ends("fulness")) {
>                 r("ful");
>                 break;
>             }
>             if (ends("ousness")) {
>                 r("ous");
>                 break;
>             }
>             break;
>         case 't':
>             if (ends("aliti")) {
>                 r("al");
>                 break;
>             }
>             if (ends("iviti")) {
>                 r("ive");
>                 break;
>             }
>             if (ends("biliti")) {
>                 r("ble");
>                 break;
>             }
>             break;
>         case 'g':
>             if (ends("logi")) {
>                 r("log");
>                 break;
>             }
>             break;
>         default :
>             break;
>         }
>     }
> 
>     /* step4() deals with -ic-, -full, -ness etc. similar strategy to
> step3. */
>     private void step4() {
>         //print("->step4()\n");
>         switch (b[k]) {
>         case 'e':
>             if (ends("icate")) {
>                 r("ic");
>                 break;
>             }
>             if (ends("ative")) {
>                 r("");
>                 break;
>             }
>             if (ends("alize")) {
>                 r("al");
>                 break;
>             }
>             break;
>         case 'i':
>             if (ends("iciti")) {
>                 r("ic");
>                 break;
>             }
>             break;
>         case 'l':
>             if (ends("ical")) {
>                 r("ic");
>                 break;
>             }
>             if (ends("ful")) {
>                 r("");
>                 break;
>             }
>             break;
>         case 's':
>             if (ends("ness")) {
>                 r("");
>                 break;
>             }
>             break;
>         }
>     }
> 
>     /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
>     private void step5() {
>         //print("->step5()\n");
>         if (k == 0)
>             return;
> 
>         /* for Bug 1 */
>         switch ( b[k-1] ) {
>         case 'a':
>             if (ends("al")) break;
>             return;
>         case 'c':
>             if (ends("ance")) break;
>             if (ends("ence")) break;
>             return;
>         case 'e':
>             if (ends("er")) break;
>             return;
>         case 'i':
>             if (ends("ic")) break;
>             return;
>         case 'l':
>             if (ends("able")) break;
>             if (ends("ible")) break;
>             return;
>         case 'n':
>             if (ends("ant")) break;
>             if (ends("ement")) break;
>             if (ends("ment")) break;
>             /* element etc. not stripped before the m */
>             if (ends("ent")) break;
>             return;
>         case 'o':
>             if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
> break;
>             /* j >= 0 fixes Bug 2 */
>             if (ends("ou")) break;
>             return;
>             /* takes care of -ous */
>         case 's':
>             if (ends("ism")) break;
>             return;
>         case 't':
>             if (ends("ate")) break;
>             if (ends("iti")) break;
>             return;
>         case 'u':
>             if (ends("ous")) break;
>             return;
>         case 'v':
>             if (ends("ive")) break;
>             return;
>         case 'z':
>             if (ends("ize")) break;
>             return;
>         default:
>             return;
>         }
>         if (m() > 1)
>             k = j;
>     }
> 
>     /* step6() removes a final -e if m() > 1. */
>     private void step6() {
>         //print("->step6()\n");
>         j = k;
> 
>         if (b[k] == 'e') {
>             int a = m();
>             if (a > 1 || a == 1 && !cvc(k-1))
>                 k--;
>         }
>         if (b[k] == 'l' && doublec(k) && m() > 1)
>             k--;
>     }
> 
>     /** Stem the word placed into the Stemmer buffer through calls to add().
>      * Returns true if the stemming process resulted in a word different
>      * from the input.  You can retrieve the result with
>      * getResultLength()/getResultBuffer() or toString().
>      */
>     public void stem() {
>         k = i - 1;
>         if (k > 1) {
>             step1();
>             step2();
>             step3();
>             step4();
>             step5();
>             step6();
>         }
>         i_end = k+1;
>         i = 0;
>     }
> }
> 
> 
> /** Test program for demonstrating the Stemmer.  It reads text from a
>  * a list of files, stems each word, and writes the result to standard
>  * output. Note that the word stemmed is expected to be in lower case:
>  * forcing lower case must be done outside the Stemmer class.
>  * Usage: Stemmer file-name file-name ...
>  */
> 
> 
> /*
> int main(string[] args) {
>     //print("Hello\n");
>     var s = new Stemmer();
> 
> 
> 
>     if (args.length>1){
>         var w = args[1];
>         //print("w = %s\n", w);
>         s.add2((char[])w.data, w.length);
>         s.stem();
>         //print("%s\n", s.ToString());
>     }
> 
>     return 0;
> }
> */
> ------------------------
> _______________________________________________
> vala-list mailing list
> [email protected]
> https://mail.gnome.org/mailman/listinfo/vala-list

_______________________________________________
vala-list mailing list
[email protected]
https://mail.gnome.org/mailman/listinfo/vala-list

Re: [Vala] Port of the Porter stemmer to Vala (more detailed description)

Reply via email to