Title: [500] trunk/stemmer4jr: First implementation of stemmer4jr
Revision
500
Author
olabini
Date
2007-04-25 06:28:09 -0400 (Wed, 25 Apr 2007)

Log Message

First implementation of stemmer4jr

Modified Paths


Added Paths

Diff

Added: trunk/stemmer4jr/lib/stemmer.rb (0 => 500)


--- trunk/stemmer4jr/lib/stemmer.rb	                        (rev 0)
+++ trunk/stemmer4jr/lib/stemmer.rb	2007-04-25 10:28:09 UTC (rev 500)
@@ -0,0 +1,70 @@
+#
+# == Stemmable module
+#
+# This module is automatically added to the String and Array classes when you:
+#
+#  require 'stemmer'
+#
+# It adds a +stem+ method to String and Array.
+#
+#  str = 'this is a string'
+#
+#  stemmed_str = str.stem
+#
+#  array = %w{this is an array}
+#
+#  stemmed_array = array.stem
+#
+# By default, stemming occurs in English. If you want to stem in another language, just give it as a parameter:
+#
+#  str = 'Chaîne de caractères française'
+#
+#  stemmed_str = str.stem('fr')
+#
+# Or you can change the default configuration:
+#
+#  Stemmable::stemmer_default_language = 'fr'
+#
+#  stemmed_str = str.stem
+#
+module Stemmable
+  # Gem activation is optional; without RubyGems or the gem, use the plain load path.
+  begin
+    require 'rubygems'
+    gem 'stemmer4jr'
+  rescue LoadError
+  end
+  require 'stemmer4r'
+
+  @@stemmer_default_language = 'en'
+  @@stemmer = Stemmer.new('en')
+  # Language => encoding name, looked up by stem_utf8 (stemmer_utf8.rb).
+  @@UTF8_MAP = { 'fr' => 'iso-8859-1' }
+
+  # Sets the default language. The stemmer is built first so a language that
+  # Stemmer.new rejects leaves the previous default language and stemmer intact.
+  def Stemmable.stemmer_default_language=(language)
+    replacement = Stemmer.new(language)
+    @@stemmer_default_language = language
+    @@stemmer = replacement
+    language
+  end
+
+  # Returns the language used when +stem+ is called without an argument.
+  def Stemmable.stemmer_default_language
+    @@stemmer_default_language
+  end
+
+  # Stems the receiver with the default stemmer, or with a new one for +language+.
+  def stem(language = nil)
+    (language.nil? ? @@stemmer : Stemmer.new(language)).stem(self)
+  end
+end
+
+# Mix +stem+ into the built-in String and Array classes.
+# Module#include is called through +send+ because it is a
+# private method on the Ruby versions this 2007 code targets,
+# and +send+ works whether or not it is private.
+[String, Array].each do |receiver|
+  receiver.send(:include, Stemmable)
+end

Added: trunk/stemmer4jr/lib/stemmer_utf8.rb (0 => 500)


--- trunk/stemmer4jr/lib/stemmer_utf8.rb	                        (rev 0)
+++ trunk/stemmer4jr/lib/stemmer_utf8.rb	2007-04-25 10:28:09 UTC (rev 500)
@@ -0,0 +1,66 @@
+#
+# == Stemmable_utf8 module
+#
+# This module is automatically added to the String and Array classes when you:
+#
+#  require 'stemmer_utf8'
+#
+# It adds a +stem_utf8+ method to String and Array.
+#
+#  str_utf8 = 'this is a UTF-8 encoded string'
+#
+#  stemmed_str_utf8 = str_utf8.stem_utf8
+#
+#  array_utf8 = %w{this is an array with utf8 characters}
+#
+#  stemmed_array_utf8 = array_utf8.stem_utf8
+#
+# By default, stemming occurs in English. If you want to stem in another language, just give it as a parameter:
+#
+#  str_utf8 = 'Chaîne de caractères française en UTF-8'
+#
+#  stemmed_str_utf8 = str_utf8.stem_utf8('fr')
+#
+# Or you can change the default configuration:
+#
+#  Stemmable::stemmer_default_language = 'fr'
+#
+#  stemmed_str_utf8 = str_utf8.stem_utf8
+#
+module Stemmable_utf8
+  include Stemmable
+
+  # Stems a UTF-8 String, or an Array of UTF-8 Strings, and returns UTF-8 text.
+  # The text is converted to the @@UTF8_MAP encoding for the language (iso-8859-1
+  # when the language is not listed), stemmed, and converted back. Without +language+
+  # the default language and stemmer are used. Any other receiver raises 'no valid type'.
+  def stem_utf8(language = nil)
+    require 'iconv'
+    if language.nil?
+      language = @@stemmer_default_language
+      stemmer = @@stemmer
+    else
+      stemmer = Stemmer.new(language)
+    end
+    encoding = @@UTF8_MAP[language] || 'iso-8859-1'
+    case self
+    when String
+      # Block form closes both converters; the bare Iconv.new pair used here before was never closed.
+      legacy = Iconv.open(encoding, 'utf-8') { |cd| cd.iconv(self) + cd.iconv(nil) }
+      Iconv.open('utf-8', encoding) { |cd| cd.iconv(stemmer.stem(legacy)) + cd.iconv(nil) }
+    when Array
+      legacy = Iconv.open(encoding, 'utf-8') { |cd| self.map { |s| cd.iconv(s) + cd.iconv(nil) } }
+      Iconv.open('utf-8', encoding) { |cd| stemmer.stem(legacy).map { |s| cd.iconv(s) + cd.iconv(nil) } }
+    else
+      raise 'no valid type'
+    end
+  end
+end
+
+# Mix +stem_utf8+ (and, through Stemmable_utf8, +stem+) into the
+# built-in String and Array classes. +send+ is used because
+# Module#include is a private method on the Ruby versions this
+# 2007 code targets.
+[String, Array].each do |receiver|
+  receiver.send(:include, Stemmable_utf8)
+end

Modified: trunk/stemmer4jr/src/java/Stemmer4rService.java (499 => 500)


--- trunk/stemmer4jr/src/java/Stemmer4rService.java	2007-04-25 08:18:06 UTC (rev 499)
+++ trunk/stemmer4jr/src/java/Stemmer4rService.java	2007-04-25 10:28:09 UTC (rev 500)
@@ -27,13 +27,170 @@
  ***** END LICENSE BLOCK *****/
 import java.io.IOException;
 
+import java.lang.reflect.Method;
+
+import java.util.Map;
+import java.util.HashMap;
+
 import org.jruby.Ruby;
+import org.jruby.RubyArray;
 import org.jruby.RubyClass;
+import org.jruby.RubyString;
+import org.jruby.runtime.builtin.IRubyObject;
 import org.jruby.runtime.CallbackFactory;
 import org.jruby.runtime.load.BasicLibraryService;
 
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.danishStemmer;
+import org.tartarus.snowball.ext.dutchStemmer;
+import org.tartarus.snowball.ext.englishStemmer;
+import org.tartarus.snowball.ext.finnishStemmer;
+import org.tartarus.snowball.ext.frenchStemmer;
+import org.tartarus.snowball.ext.germanStemmer;
+import org.tartarus.snowball.ext.hungarianStemmer;
+import org.tartarus.snowball.ext.italianStemmer;
+import org.tartarus.snowball.ext.norwegianStemmer;
+import org.tartarus.snowball.ext.porterStemmer;
+import org.tartarus.snowball.ext.portugueseStemmer;
+import org.tartarus.snowball.ext.romanianStemmer;
+import org.tartarus.snowball.ext.russianStemmer;
+import org.tartarus.snowball.ext.spanishStemmer;
+import org.tartarus.snowball.ext.swedishStemmer;
+import org.tartarus.snowball.ext.turkishStemmer;
+
 public class Stemmer4rService implements BasicLibraryService {
+    /**
+     * Snowball stemmer classes keyed by lower-case algorithm name.
+     * Each language is registered under its English name and a
+     * two-letter code; "porter" has a long name only. getStemmer()
+     * adds an entry for every further name it resolves by class
+     * lookup, so the map can grow after class initialisation.
+     */
+    private final static Map STEMMERS = new HashMap();
+
+    static {
+        // Each row is a stemmer class followed by the names that select it.
+        final Object[][] registry = {
+            { danishStemmer.class, "danish", "da" },
+            { dutchStemmer.class, "dutch", "nl" },
+            { englishStemmer.class, "english", "en" },
+            { finnishStemmer.class, "finnish", "fi" },
+            { frenchStemmer.class, "french", "fr" },
+            { germanStemmer.class, "german", "de" },
+            { hungarianStemmer.class, "hungarian", "hu" },
+            { italianStemmer.class, "italian", "it" },
+            { norwegianStemmer.class, "norwegian", "no" },
+            { porterStemmer.class, "porter" },
+            { portugueseStemmer.class, "portuguese", "pt" },
+            { romanianStemmer.class, "romanian", "ro" },
+            { russianStemmer.class, "russian", "ru" },
+            { spanishStemmer.class, "spanish", "es" },
+            { swedishStemmer.class, "swedish", "sv" },
+            { turkishStemmer.class, "turkish", "tr" },
+        };
+        for(int row = 0; row < registry.length; row++) {
+            for(int col = 1; col < registry[row].length; col++) {
+                STEMMERS.put(registry[row][col], registry[row][0]);
+            }
+        }
+    }
+
+    /** Maps an algorithm name (any letter case) to its stemmer class, or null when no such class can be loaded. */
+    private static Class getStemmer(String name) {
+        name = name.toLowerCase(java.util.Locale.ENGLISH); // fixed locale: a Turkish default would turn "I" into a dotless i
+        Class known = (Class)STEMMERS.get(name);
+        if(known != null) { return known; }
+        try {
+            known = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
+            STEMMERS.put(name, known);
+            return known;
+        } catch(Exception e) {
+            return null;
+        }
+    }
+
+    /** Holds one stemmer plus its stem() Method; stem(word) returns the stem, or word itself when the call fails. */
+    private static class SnowballContainer {
+        SnowballProgram snow = null;
+        Method stem = null;
+        private final static Object[] emptyArgs = new Object[0];
+        public String stem(String word) {
+            snow.setCurrent(word);
+            try {
+                stem.invoke(snow, emptyArgs);
+            } catch(Exception e) { return word; } // best effort: the buffer may hold a half-rewritten word
+            return snow.getCurrent();
+        }
+    }
+
+    /**
+     * Backs Stemmer#initialize: wraps a stemmer for the named algorithm in recv and returns recv.
+     * Raises an argument error if the name has no class, or the class cannot be set up.
+     */
+    public static IRubyObject initialize(IRubyObject recv, IRubyObject algorithm) {
+        final String requested = algorithm.convertToString().toString();
+        final String notFound = "Algorithm '" + requested + "' doesn't exist";
+        final Class stemmerClass = getStemmer(requested);
+        if(stemmerClass == null) {
+            throw recv.getRuntime().newArgumentError(notFound);
+        }
+        final SnowballContainer wrapper = new SnowballContainer();
+        try {
+            wrapper.snow = (SnowballProgram)stemmerClass.newInstance();
+            wrapper.stem = stemmerClass.getMethod("stem", new Class[0]);
+        } catch(Exception e) {
+            throw recv.getRuntime().newArgumentError(notFound);
+        }
+        recv.dataWrapStruct(wrapper);
+        return recv;
+    }
+
+    /**
+     * Backs Stemmer#stem. A String is lower-cased and, if it contains a space,
+     * stemmed piece by piece with the stems joined by single spaces (empty
+     * pieces are skipped); otherwise it is stemmed whole. An Array gives a new
+     * Array holding the stem of each lower-cased element. Anything else raises
+     * a type error ("not valid value").
+     */
+    public static IRubyObject stem(IRubyObject recv, IRubyObject obj) {
+        final SnowballContainer engine = (SnowballContainer)recv.dataGetStruct();
+        if(obj instanceof RubyArray) {
+            final RubyArray words = (RubyArray)obj;
+            final int count = words.getLength();
+            final RubyArray stems = recv.getRuntime().newArray(count);
+            for(int at = 0; at < count; at++) {
+                String lowered = words.eltInternal(at).convertToString().toString().toLowerCase();
+                stems.append(recv.getRuntime().newString(engine.stem(lowered)));
+            }
+            return stems;
+        }
+        if(!(obj instanceof RubyString)) {
+            throw recv.getRuntime().newTypeError("not valid value");
+        }
+        final String text = obj.toString().toLowerCase();
+        if(text.indexOf(' ') == -1) {
+            return recv.getRuntime().newString(engine.stem(text));
+        }
+        // Several words: the glue stays empty until the first stem has been written.
+        final StringBuffer joined = new StringBuffer();
+        String glue = "";
+        final String[] pieces = text.split(" ");
+        for(int at = 0; at < pieces.length; at++) {
+            if(pieces[at].length() > 0) {
+                joined.append(glue).append(engine.stem(pieces[at]));
+                glue = " ";
+            }
+        }
+        return recv.getRuntime().newString(joined.toString());
+    }
+    
     public boolean basicLoad(final Ruby runtime) throws IOException {
+        // Define the Ruby class Stemmer (superclass Object, Object's allocator) and bind its
+        // "initialize" and "stem" methods to the static Java methods of the same name here.
+        final RubyClass stemmerClass = runtime.defineClass("Stemmer", runtime.getObject(), runtime.getObject().getAllocator());
+        final CallbackFactory callbacks = runtime.callbackFactory(Stemmer4rService.class);
+        stemmerClass.defineFastMethod("initialize", callbacks.getFastSingletonMethod("initialize", IRubyObject.class));
+        stemmerClass.defineFastMethod("stem", callbacks.getFastSingletonMethod("stem", IRubyObject.class));
         return true;
     }
 }
_______________________________________________
Jruby-extras-devel mailing list
[email protected]
http://rubyforge.org/mailman/listinfo/jruby-extras-devel

Reply via email to