Re: [Vala] Port of the Porter stemmer to Vala (more detailed description)

Serge Hulne Tue, 17 Sep 2013 07:27:53 -0700

Hi, Giulio

Yes, it is implemented in 100% pure Vala!


(I listed the code in plain text in my second posting on this subject)

Serge.

On Tue, Sep 17, 2013 at 3:31 PM, Giulio Paci <[email protected]> wrote:
> Hi Serge,
>         thank you for sharing your code. May I ask why you implemented it in
> pure Vala?
>
> Just for the sake of completeness, if anybody is interested in this kind
> of tool for Vala, at
> https://github.com/nemequ/vala-extra-vapis/blob/master/libstemmer.vapi
> there are also the Vala bindings for the snowball stemmer library
> (http://snowball.tartarus.org/).
>
> Bests,
>         Giulio.
>
> On 17/09/2013 11:33, Serge Hulne wrote:
>> A more detailed description:
>>
>> http://tartarus.org/martin/PorterStemmer/
>>
>> http://tartarus.org/martin/PorterStemmer/def.txt
>>
>>
>> Serge Hulne.
>>
>>
>> PS: The code as plain text:
>>
>> ------------------------
>> using GLib;
>> using Posix;
>>
>>
>> class Stemmer {
>>     private char[] b;             /* buffer holding the word being stemmed */
>>     private int i;                /* offset into b */
>>     private int    i_end;          /* offset to end of stemmed word */
>>     private int    j;              /* general offset; ends() sets it to the index just before a matched suffix */
>>     private int k;                /* index in b of the last character of the current word */
>>     private static int INC = 50;
>>     /* unit of size whereby b is increased */
>>
>>
>>     /**
>>      * Creates a stemmer with an empty word and a buffer of INC characters.
>>      */
>>     public Stemmer() {
>>         i_end = 0;
>>         i = 0;
>>         b = new char[INC];
>>     }
>>
>>     /**
>>      * Appends one character to the word being stemmed, enlarging the
>>      * buffer by INC characters when it is full.  Once every character
>>      * has been added, call stem() to stem the word.
>>      */
>>     public void add(char ch) {
>>         if (i == b.length) {
>>             /* Buffer full: move the existing characters to a larger one. */
>>             char[] grown = new char[b.length + INC];
>>             int pos = 0;
>>             while (pos < i) {
>>                 grown[pos] = b[pos];
>>                 pos++;
>>             }
>>             b = grown;
>>         }
>>         b[i] = ch;
>>         i++;
>>     }
>>
>>
>>     /**
>>      * Appends the first wLen characters of w to the word being stemmed.
>>      * This is like repeated calls of add(char ch), but faster because the
>>      * buffer is enlarged at most once.
>>      *
>>      * @param w    characters to append
>>      * @param wLen number of characters to take from w; values larger than
>>      *             w.length are clamped to w.length, and values below 1
>>      *             append nothing
>>      */
>>     public void add2(char[] w, int wLen) {
>>         /* Never read past the end of w, whatever length the caller claims. */
>>         if (wLen > w.length)
>>             wLen = w.length;
>>         if (wLen <= 0)
>>             return;
>>         if (i+wLen >= b.length) {
>>             /* Grow once, leaving INC characters of headroom. */
>>             char[] new_b = new char[i+wLen+INC];
>>             for (int c = 0; c < i; c++)
>>                 new_b[c] = b[c];
>>             b = new_b;
>>         }
>>         for (int c = 0; c < wLen; c++)
>>             b[i++] = w[c];
>>     }
>>
>>     /**
>>      * Returns the stemmed word as a new string.
>>      *
>>      * After a word has been stemmed it can be retrieved with ToString(),
>>      * or the internal buffer can be read through getResultBuffer() and
>>      * getResultLength().
>>      *
>>      * @return the first getResultLength() characters of the buffer
>>      */
>>     public string ToString() {
>>         /* b is a plain character buffer that add() may fill completely,
>>            so it is not guaranteed to be NUL-terminated.  Borrow it instead
>>            of copying it as a string, and let substring() look at no more
>>            than i_end characters. */
>>         unowned string view = (string) b;
>>         return view.substring(0, i_end);
>>     }
>>
>>     /**
>>      * Gives the number of characters in the word produced by the last
>>      * call to stem().
>>      */
>>     public int getResultLength() {
>>         return this.i_end;
>>     }
>>
>>     /**
>>      * Gives the character buffer that holds the result of the stemming
>>      * process.  Only the first getResultLength() characters belong to
>>      * the stemmed word.
>>      */
>>     public char[] getResultBuffer() {
>>         return this.b;
>>     }
>>
>>     /* cons(i) reports whether b[i] is a consonant: anything other than
>>        a, e, i, o, u.  A 'y' counts as a consonant at index 0 or when the
>>        character before it is not a consonant. */
>>     private bool cons(int i) {
>>         char ch = b[i];
>>         if (ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u')
>>             return false;
>>         if (ch == 'y')
>>             return i == 0 || !cons(i-1);
>>         return true;
>>     }
>>
>>     /* m() counts the vowel-consonant sequences in b[0..j].  With c a
>>        consonant sequence, v a vowel sequence and <..> marking an optional
>>        part:
>>
>>           <c><v>       gives 0
>>           <c>vc<v>     gives 1
>>           <c>vcvc<v>   gives 2
>>           <c>vcvcvc<v> gives 3
>>           ....
>>     */
>>     private int m() {
>>         int count = 0;
>>         int pos = 0;
>>         /* Skip the optional leading consonant sequence. */
>>         while (pos <= j && cons(pos))
>>             pos++;
>>         if (pos > j)
>>             return count;
>>         pos++;
>>         while (true) {
>>             /* Run through the rest of the vowel sequence. */
>>             while (pos <= j && !cons(pos))
>>                 pos++;
>>             if (pos > j)
>>                 return count;
>>             pos++;
>>             /* A consonant followed the vowels: one more vc pair. */
>>             count++;
>>             /* Run through the rest of the consonant sequence. */
>>             while (pos <= j && cons(pos))
>>                 pos++;
>>             if (pos > j)
>>                 return count;
>>             pos++;
>>         }
>>     }
>>
>>     /* vowelinstem() reports whether b[0..j] contains at least one vowel. */
>>     private bool vowelinstem() {
>>         int pos = 0;
>>         while (pos <= j) {
>>             if (!cons(pos))
>>                 return true;
>>             pos++;
>>         }
>>         return false;
>>     }
>>
>>     /* doublec(j) reports whether b[j-1] and b[j] are the same consonant. */
>>     private bool doublec(int j) {
>>         return j >= 1 && b[j] == b[j-1] && cons(j);
>>     }
>>
>>     /* cvc(i) reports whether b[i-2], b[i-1], b[i] have the form
>>        consonant - vowel - consonant with the final consonant not being
>>        w, x or y.  It is used when deciding whether to restore an e at the
>>        end of a short word, e.g.
>>
>>           cav(e), lov(e), hop(e), crim(e), but
>>           snow, box, tray.
>>
>>     */
>>     private bool cvc(int i) {
>>         if (i < 2)
>>             return false;
>>         bool shape_ok = cons(i) && !cons(i-1) && cons(i-2);
>>         if (!shape_ok)
>>             return false;
>>         switch (b[i]) {
>>         case 'w':
>>         case 'x':
>>         case 'y':
>>             return false;
>>         default:
>>             return true;
>>         }
>>     }
>>
>>     /* ends(s) reports whether the word b[0..k] ends with the string s.
>>        On a match, j is set to the index of the character just before
>>        the suffix; otherwise j is left alone. */
>>     private bool ends(string s) {
>>         int len = s.length;
>>         int start = k - len + 1;
>>         if (start < 0)
>>             return false;
>>         char[] suffix = (char[]) s.data;
>>         int idx = 0;
>>         while (idx < len) {
>>             if (b[start+idx] != suffix[idx])
>>                 return false;
>>             idx++;
>>         }
>>         j = k - len;
>>         return true;
>>     }
>>
>>     /* setto(s) overwrites b starting at index j+1 with the characters of
>>        s and moves k to the last character written. */
>>     private void setto(string s) {
>>         int len = s.length;
>>         char[] src = (char[]) s.data;
>>         int idx = 0;
>>         while (idx < len) {
>>             b[j+1+idx] = src[idx];
>>             idx++;
>>         }
>>         k = j + len;
>>     }
>>
>>     /* r(s) applies setto(s) only when m() is greater than 0; it is used
>>        by the later steps. */
>>     private void r(string s) {
>>         if (m() <= 0)
>>             return;
>>         setto(s);
>>     }
>>
>>     /* step1() gets rid of plurals and -ed or -ing. e.g.
>>            caresses  ->  caress
>>            ponies    ->  poni
>>            ties      ->  ti
>>            caress    ->  caress
>>            cats      ->  cat
>>
>>            feed      ->  feed
>>            agreed    ->  agree
>>            disabled  ->  disable
>>
>>            matting   ->  mat
>>            mating    ->  mate
>>            meeting   ->  meet
>>            milling   ->  mill
>>            messing   ->  mess
>>
>>            meetings  ->  meet
>>
>>     */
>>
>>     private void step1() {
>>         if (b[k] == 's') {
>>             if (ends("sses"))
>>                 k -= 2;
>>             else if (ends("ies"))
>>                 /* -ies becomes -i (ponies -> poni), not the empty string */
>>                 setto("i");
>>             else if (b[k-1] != 's')
>>                 k--;
>>         }
>>         if (ends("eed")) {
>>             if (m() > 0)
>>                 k--;
>>         } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
>>             /* Drop the -ed / -ing suffix that ends() just located. */
>>             k = j;
>>             if (ends("at"))
>>                 setto("ate");
>>             else if (ends("bl"))
>>                 setto("ble");
>>             else if (ends("iz"))
>>                 setto("ize");
>>             else if (doublec(k)) {
>>                 /* Reduce a double consonant to one, except for l, s, z. */
>>                 k--;
>>                 int ch = b[k];
>>                 if (ch == 'l' || ch == 's' || ch == 'z')
>>                     k++;
>>             }
>>             else if (m() == 1 && cvc(k)) setto("e");
>>         }
>>     }
>>
>>     /* step2() replaces a terminal y with i when the part of the word
>>        before it contains a vowel. */
>>     private void step2() {
>>         if (!ends("y"))
>>             return;
>>         if (vowelinstem())
>>             b[k] = 'i';
>>     }
>>
>>     /* step3() maps double suffixes to single ones, so -ization ( = -ize
>>        plus -ation) maps to -ize etc.  The replacement goes through r(),
>>        so the string before the suffix must give m() > 0.  The switch on
>>        the penultimate character only narrows down which suffixes need
>>        to be tried. */
>>     private void step3() {
>>         /* A one-character word has no penultimate character to inspect. */
>>         if (k == 0)
>>             return;
>>
>>         switch (b[k-1]) {
>>         case 'a':
>>             if (ends("ational"))
>>                 r("ate");
>>             else if (ends("tional"))
>>                 r("tion");
>>             break;
>>         case 'c':
>>             if (ends("enci"))
>>                 r("ence");
>>             else if (ends("anci"))
>>                 r("ance");
>>             break;
>>         case 'e':
>>             if (ends("izer"))
>>                 r("ize");
>>             break;
>>         case 'l':
>>             if (ends("bli"))
>>                 r("ble");
>>             else if (ends("alli"))
>>                 r("al");
>>             else if (ends("entli"))
>>                 r("ent");
>>             else if (ends("eli"))
>>                 r("e");
>>             else if (ends("ousli"))
>>                 r("ous");
>>             break;
>>         case 'o':
>>             if (ends("ization"))
>>                 r("ize");
>>             else if (ends("ation"))
>>                 r("ate");
>>             else if (ends("ator"))
>>                 r("ate");
>>             break;
>>         case 's':
>>             if (ends("alism"))
>>                 r("al");
>>             else if (ends("iveness"))
>>                 r("ive");
>>             else if (ends("fulness"))
>>                 r("ful");
>>             else if (ends("ousness"))
>>                 r("ous");
>>             break;
>>         case 't':
>>             if (ends("aliti"))
>>                 r("al");
>>             else if (ends("iviti"))
>>                 r("ive");
>>             else if (ends("biliti"))
>>                 r("ble");
>>             break;
>>         case 'g':
>>             if (ends("logi"))
>>                 r("log");
>>             break;
>>         default :
>>             break;
>>         }
>>     }
>>
>>     /* step4() deals with -ic-, -full, -ness etc., using the same
>>        strategy as step3 but switching on the last character. */
>>     private void step4() {
>>         switch (b[k]) {
>>         case 'e':
>>             if (ends("icate"))
>>                 r("ic");
>>             else if (ends("ative"))
>>                 r("");
>>             else if (ends("alize"))
>>                 r("al");
>>             break;
>>         case 'i':
>>             if (ends("iciti"))
>>                 r("ic");
>>             break;
>>         case 'l':
>>             if (ends("ical"))
>>                 r("ic");
>>             else if (ends("ful"))
>>                 r("");
>>             break;
>>         case 's':
>>             if (ends("ness"))
>>                 r("");
>>             break;
>>         }
>>     }
>>
>>     /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>, i.e.
>>        only when the remaining stem gives m() > 1. */
>>     private void step5() {
>>         /* A one-character word has no penultimate character to inspect. */
>>         if (k == 0)
>>             return;
>>
>>         bool matched = false;
>>         switch (b[k-1]) {
>>         case 'a':
>>             matched = ends("al");
>>             break;
>>         case 'c':
>>             matched = ends("ance") || ends("ence");
>>             break;
>>         case 'e':
>>             matched = ends("er");
>>             break;
>>         case 'i':
>>             matched = ends("ic");
>>             break;
>>         case 'l':
>>             matched = ends("able") || ends("ible");
>>             break;
>>         case 'n':
>>             /* -ement is tried before -ment and -ent, so element etc. are
>>                not stripped before the m */
>>             matched = ends("ant") || ends("ement") || ends("ment") || ends("ent");
>>             break;
>>         case 'o':
>>             /* -ion only counts after s or t; the j >= 0 test keeps b[j]
>>                inside the buffer.  -ou takes care of -ous. */
>>             matched = (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
>>                       || ends("ou");
>>             break;
>>         case 's':
>>             matched = ends("ism");
>>             break;
>>         case 't':
>>             matched = ends("ate") || ends("iti");
>>             break;
>>         case 'u':
>>             matched = ends("ous");
>>             break;
>>         case 'v':
>>             matched = ends("ive");
>>             break;
>>         case 'z':
>>             matched = ends("ize");
>>             break;
>>         default:
>>             break;
>>         }
>>         if (matched && m() > 1)
>>             k = j;
>>     }
>>
>>     /* step6() removes a final -e when m() > 1, or when m() == 1 and the
>>        characters before the e are not of the cvc form.  It then turns a
>>        final -ll into -l when m() > 1. */
>>     private void step6() {
>>         j = k;
>>
>>         if (b[k] == 'e') {
>>             int measure = m();
>>             bool drop_e = measure > 1 || (measure == 1 && !cvc(k-1));
>>             if (drop_e)
>>                 k--;
>>         }
>>         bool double_l = b[k] == 'l' && doublec(k);
>>         if (double_l && m() > 1)
>>             k--;
>>     }
>>
>>     /**
>>      * Stems the word placed into the Stemmer buffer through calls to
>>      * add() or add2().  Words of fewer than three characters are left
>>      * unchanged.  Afterwards the result is available through
>>      * getResultLength()/getResultBuffer() or ToString(), and the write
>>      * offset is reset so that the next word can be added.
>>      */
>>     public void stem() {
>>         k = i - 1;
>>         bool long_enough = k > 1;
>>         if (long_enough) {
>>             step1();
>>             step2();
>>             step3();
>>             step4();
>>             step5();
>>             step6();
>>         }
>>         i = 0;
>>         i_end = k + 1;
>>     }
>> }
>>
>>
>> /** Test program for demonstrating the Stemmer.  It reads text from
>>  * a list of files, stems each word, and writes the result to standard
>>  * output. Note that the word stemmed is expected to be in lower case:
>>  * forcing lower case must be done outside the Stemmer class.
>>  * Usage: Stemmer file-name file-name ...
>>  */
>>
>>
>> /*
>> int main(string[] args) {
>>     //print("Hello\n");
>>     var s = new Stemmer();
>>
>>
>>
>>     if (args.length>1){
>>         var w = args[1];
>>         //print("w = %s\n", w);
>>         s.add2((char[])w.data, w.length);
>>         s.stem();
>>         //print("%s\n", s.ToString());
>>     }
>>
>>     return 0;
>> }
>> */
>> ------------------------
>> _______________________________________________
>> vala-list mailing list
>> [email protected]
>> https://mail.gnome.org/mailman/listinfo/vala-list
>
> _______________________________________________
> vala-list mailing list
> [email protected]
> https://mail.gnome.org/mailman/listinfo/vala-list
_______________________________________________
vala-list mailing list
[email protected]
https://mail.gnome.org/mailman/listinfo/vala-list

Re: [Vala] Port of the Porter stemmer to Vala (more detailed description)

Reply via email to