Hi Serge,
thank you for sharing your code. May I ask why you implemented it in
pure Vala?
Just for the sake of completeness, if anybody is interested in this kind
of tools for Vala, at
https://github.com/nemequ/vala-extra-vapis/blob/master/libstemmer.vapi
there are also the Vala bindings for the snowball stemmer library
(http://snowball.tartarus.org/).
Bests,
Giulio.
On 17/09/2013 11:33, Serge Hulne wrote:
> A more detailed description:
>
> http://tartarus.org/martin/PorterStemmer/
>
> http://tartarus.org/martin/PorterStemmer/def.txt
>
>
> Serge Hulne.
>
>
> PS: The code as plain text:
>
> ------------------------
> using GLib;
> using Posix;
>
>
> class Stemmer {
> private char[] b;
> private int i; /* offset into b */
> private int i_end; /* offset to end of stemmed word */
> private int j;
> private int k;
> private static int INC = 50;
> /* unit of size whereby b is increased */
>
>
> public Stemmer() {
> b = new char[INC];
> i = 0;
> i_end = 0;
> }
>
> /**
> * Add a character to the word being stemmed. When you are finished
> * adding characters, you can call stem(void) to stem the word.
> */
>
> public void add(char ch) {
> if (i == b.length) {
> char[] new_b = new char[i+INC];
> for (int c = 0; c < i; c++)
> new_b[c] = b[c];
> b = new_b;
> }
> b[i++] = ch;
> }
>
>
> /** Adds wLen characters to the word being stemmed contained in a
> portion
> * of a char[] array. This is like repeated calls of add(char ch), but
> * faster.
> */
>
> public void add2(char[] w, int wLen) {
> if (i+wLen >= b.length) {
> char[] new_b = new char[i+wLen+INC];
> for (int c = 0; c < i; c++)
> new_b[c] = b[c];
> b = new_b;
> }
> for (int c = 0; c < wLen; c++)
> b[i++] = w[c];
> }
>
> /**
> * After a word has been stemmed, it can be retrieved by toString(),
> * or a reference to the internal buffer can be retrieved by
> getResultBuffer
> * and getResultLength (which is generally more efficient.)
> */
> public string ToString() {
> var s = (string) b;
> //return new string(b,0,i_end);
> return s.slice(0,i_end);
> }
>
> /**
> * Returns the length of the word resulting from the stemming process.
> */
> public int getResultLength() {
> return i_end;
> }
>
> /**
> * Returns a reference to a character buffer containing the results of
> * the stemming process. You also need to consult getResultLength()
> * to determine the length of the result.
> */
> public char[] getResultBuffer() {
> return b;
> }
>
> /* cons(i) is true <=> b[i] is a consonant. */
> private bool cons(int i) {
> switch (b[i]) {
> case 'a':
> case 'e':
> case 'i':
> case 'o':
> case 'u':
> return false;
> case 'y':
> return (i==0) ? true : !cons(i-1);
> default:
> return true;
> }
> }
>
> /* m() measures the number of consonant sequences between 0 and j. if c
> is
> a consonant sequence and v a vowel sequence, and <..> indicates
> arbitrary
> presence,
>
> <c><v> gives 0
> <c>vc<v> gives 1
> <c>vcvc<v> gives 2
> <c>vcvcvc<v> gives 3
> ....
> */
> private int m() {
> int n = 0;
> int i = 0;
> while(true) {
> if (i > j) return n;
> if (! cons(i)) break;
> i++;
> }
> i++;
> while(true) {
> while(true) {
> if (i > j) return n;
> if (cons(i)) break;
> i++;
> }
> i++;
> n++;
> while(true) {
> if (i > j) return n;
> if (! cons(i)) break;
> i++;
> }
> i++;
> }
> }
>
> /* vowelinstem() is true <=> 0,...j contains a vowel */
> private bool vowelinstem() {
> int i;
> for (i = 0; i <= j; i++)
> if (! cons(i))
> return true;
> return false;
> }
>
> /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
> private bool doublec(int j) {
> if (j < 1)
> return false;
> if (b[j] != b[j-1])
> return false;
> return cons(j);
> }
>
> /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel -
> consonant
> and also if the second c is not w,x or y. this is used when trying to
> restore an e at the end of a short word. e.g.
>
> cav(e), lov(e), hop(e), crim(e), but
> snow, box, tray.
>
> */
> private bool cvc(int i) {
> if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2))
> return false;
> int ch = b[i];
> if (ch == 'w' || ch == 'x' || ch == 'y')
> return false;
> return true;
> }
>
> private bool ends(string s) {
> int l = s.length;
> int o = k-l+1;
> if (o < 0)
> return false;
> char[] sc = (char[]) s.data;
> for (int i = 0; i < l; i++)
> if (b[o+i] != sc[i])
> return false;
> j = k-l;
> return true;
> }
>
> /* setto(s) sets (j+1),...k to the characters in the string s,
> readjusting
> k. */
> private void setto(string s) {
> int l = s.length;
> int o = j+1;
> char[] sc = (char[]) s.data;
> for (int i = 0; i < l; i++)
> b[o+i] = sc[i];
> k = j+l;
> }
>
> /* r(s) is used further down. */
> private void r(string s) {
> if (m() > 0)
> setto(s);
> }
>
> /* step1() gets rid of plurals and -ed or -ing. e.g.
> caresses -> caress
> ponies -> poni
> ties -> ti
> caress -> caress
> cats -> cat
>
> feed -> feed
> agreed -> agree
> disabled -> disable
>
> matting -> mat
> mating -> mate
> meeting -> meet
> milling -> mill
> messing -> mess
>
> meetings -> meet
>
> */
>
> private void step1() {
> //print("->step1()\n");
> if (b[k] == 's') {
> if (ends("sses"))
> k -= 2;
> else if (ends("ies"))
> //setto("");
> setto("");
> else if (b[k-1] != 's')
> k--;
> }
> if (ends("eed")) {
> if (m() > 0)
> k--;
> } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
> k = j;
> if (ends("at"))
> setto("ate");
> else if (ends("bl"))
> setto("ble");
> else if (ends("iz"))
> setto("ize");
> else if (doublec(k)) {
> k--;
> int ch = b[k];
> if (ch == 'l' || ch == 's' || ch == 'z')
> k++;
> }
> else if (m() == 1 && cvc(k)) setto("e");
> }
> }
>
> /* step2() turns terminal y to i when there is another vowel in the
> stem. */
> private void step2() {
> //print("->step2()\n");
> if (ends("y") && vowelinstem())
> b[k] = 'i';
> }
>
> /* step3() maps double suffices to single ones. so -ization ( = -ize
> plus
> -ation) maps to -ize etc. note that the string before the suffix
> must give
> m() > 0. */
> private void step3() {
> //print("->step3()\n");
> if (k == 0)
> return;
>
> /* For Bug 1 */
> switch (b[k-1]) {
> case 'a':
> if (ends("ational")) {
> r("ate");
> break;
> }
> if (ends("tional")) {
> r("tion");
> break;
> }
> break;
> case 'c':
> if (ends("enci")) {
> r("ence");
> break;
> }
> if (ends("anci")) {
> r("ance");
> break;
> }
> break;
> case 'e':
> if (ends("izer")) {
> r("ize");
> break;
> }
> break;
> case 'l':
> if (ends("bli")) {
> r("ble");
> break;
> }
> if (ends("alli")) {
> r("al");
> break;
> }
> if (ends("entli")) {
> r("ent");
> break;
> }
> if (ends("eli")) {
> r("e");
> break;
> }
> if (ends("ousli")) {
> r("ous");
> break;
> }
> break;
> case 'o':
> if (ends("ization")) {
> r("ize");
> break;
> }
> if (ends("ation")) {
> r("ate");
> break;
> }
> if (ends("ator")) {
> r("ate");
> break;
> }
> break;
> case 's':
> if (ends("alism")) {
> r("al");
> break;
> }
> if (ends("iveness")) {
> r("ive");
> break;
> }
> if (ends("fulness")) {
> r("ful");
> break;
> }
> if (ends("ousness")) {
> r("ous");
> break;
> }
> break;
> case 't':
> if (ends("aliti")) {
> r("al");
> break;
> }
> if (ends("iviti")) {
> r("ive");
> break;
> }
> if (ends("biliti")) {
> r("ble");
> break;
> }
> break;
> case 'g':
> if (ends("logi")) {
> r("log");
> break;
> }
> break;
> default :
> break;
> }
> }
>
> /* step4() deals with -ic-, -full, -ness etc. similar strategy to
> step3. */
> private void step4() {
> //print("->step4()\n");
> switch (b[k]) {
> case 'e':
> if (ends("icate")) {
> r("ic");
> break;
> }
> if (ends("ative")) {
> r("");
> break;
> }
> if (ends("alize")) {
> r("al");
> break;
> }
> break;
> case 'i':
> if (ends("iciti")) {
> r("ic");
> break;
> }
> break;
> case 'l':
> if (ends("ical")) {
> r("ic");
> break;
> }
> if (ends("ful")) {
> r("");
> break;
> }
> break;
> case 's':
> if (ends("ness")) {
> r("");
> break;
> }
> break;
> }
> }
>
> /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
> private void step5() {
> //print("->step5()\n");
> if (k == 0)
> return;
>
> /* for Bug 1 */
> switch ( b[k-1] ) {
> case 'a':
> if (ends("al")) break;
> return;
> case 'c':
> if (ends("ance")) break;
> if (ends("ence")) break;
> return;
> case 'e':
> if (ends("er")) break;
> return;
> case 'i':
> if (ends("ic")) break;
> return;
> case 'l':
> if (ends("able")) break;
> if (ends("ible")) break;
> return;
> case 'n':
> if (ends("ant")) break;
> if (ends("ement")) break;
> if (ends("ment")) break;
> /* element etc. not stripped before the m */
> if (ends("ent")) break;
> return;
> case 'o':
> if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
> break;
> /* j >= 0 fixes Bug 2 */
> if (ends("ou")) break;
> return;
> /* takes care of -ous */
> case 's':
> if (ends("ism")) break;
> return;
> case 't':
> if (ends("ate")) break;
> if (ends("iti")) break;
> return;
> case 'u':
> if (ends("ous")) break;
> return;
> case 'v':
> if (ends("ive")) break;
> return;
> case 'z':
> if (ends("ize")) break;
> return;
> default:
> return;
> }
> if (m() > 1)
> k = j;
> }
>
> /* step6() removes a final -e if m() > 1. */
> private void step6() {
> //print("->step6()\n");
> j = k;
>
> if (b[k] == 'e') {
> int a = m();
> if (a > 1 || a == 1 && !cvc(k-1))
> k--;
> }
> if (b[k] == 'l' && doublec(k) && m() > 1)
> k--;
> }
>
> /** Stem the word placed into the Stemmer buffer through calls to add().
> * Returns true if the stemming process resulted in a word different
> * from the input. You can retrieve the result with
> * getResultLength()/getResultBuffer() or toString().
> */
> public void stem() {
> k = i - 1;
> if (k > 1) {
> step1();
> step2();
> step3();
> step4();
> step5();
> step6();
> }
> i_end = k+1;
> i = 0;
> }
> }
>
>
> /** Test program for demonstrating the Stemmer. It reads text from a
> * a list of files, stems each word, and writes the result to standard
> * output. Note that the word stemmed is expected to be in lower case:
> * forcing lower case must be done outside the Stemmer class.
> * Usage: Stemmer file-name file-name ...
> */
>
>
> /*
> int main(string[] args) {
> //print("Hello\n");
> var s = new Stemmer();
>
>
>
> if (args.length>1){
> var w = args[1];
> //print("w = %s\n", w);
> s.add2((char[])w.data, w.length);
> s.stem();
> //print("%s\n", s.ToString());
> }
>
> return 0;
> }
> */
> ------------------------
> _______________________________________________
> vala-list mailing list
> [email protected]
> https://mail.gnome.org/mailman/listinfo/vala-list
_______________________________________________
vala-list mailing list
[email protected]
https://mail.gnome.org/mailman/listinfo/vala-list