Added: trunk/stemmer4jr/lib/stemmer.rb (0 => 500)
--- trunk/stemmer4jr/lib/stemmer.rb (rev 0)
+++ trunk/stemmer4jr/lib/stemmer.rb 2007-04-25 10:28:09 UTC (rev 500)
@@ -0,0 +1,70 @@
+#
+# == Stemmable module
+#
+# This module is automatically added to the String and Array classes when you:
+#
+# require 'stemmer'
+#
+# It adds a +stem+ method to String and Array.
+#
+# str = 'this is a string'
+#
+# stemmed_str = str.stem
+#
+# array = %w{this is an array}
+#
+# stemmed_array = array.stem
+#
+# By default, stemming is done in English. To stem in another language, pass the language as a parameter:
+#
+# str = 'Chaîne de caractères française'
+#
+# stemmed_str = str.stem('fr')
+#
+# Or you can change the default configuration:
+#
+# Stemmable::stemmer_default_language = 'fr'
+#
+# stemmed_str = str.stem
+#
+module Stemmable
+ begin
+ require 'rubygems'
+ gem 'stemmer4jr'
+ ensure
+ require 'stemmer4r'
+ end
+
+ @@stemmer_default_language = 'en'
+ @@stemmer = Stemmer.new('en')
+ @@UTF8_MAP = {
+ 'fr' => 'iso-8859-1'
+ }
+
+ def Stemmable.stemmer_default_language=(language)
+ @@stemmer_default_language = language
+ @@stemmer = Stemmer.new(language)
+ language
+ end
+
+ def Stemmable.stemmer_default_language
+ return @@stemmer_default_language
+ end
+
+ def stem(language = nil)
+ if (language.nil?)
+ @@stemmer.stem(self)
+ else
+ stemmer = Stemmer.new(language)
+ stemmer.stem(self)
+ end
+ end
+end
+
+class String
+ include Stemmable
+end
+
+class Array
+ include Stemmable
+end
Added: trunk/stemmer4jr/lib/stemmer_utf8.rb (0 => 500)
--- trunk/stemmer4jr/lib/stemmer_utf8.rb (rev 0)
+++ trunk/stemmer4jr/lib/stemmer_utf8.rb 2007-04-25 10:28:09 UTC (rev 500)
@@ -0,0 +1,66 @@
+#
+# == Stemmable_utf8 module
+#
+# This module is automatically added to the String and Array classes when you:
+#
+# require 'stemmer_utf8'
+#
+# It adds a +stem_utf8+ method to String and Array.
+#
+# str_utf8 = 'this is a UTF-8 encoded string'
+#
+# stemmed_str_utf8 = str_utf8.stem_utf8
+#
+# array_utf8 = %w{this is an array with utf8 characters}
+#
+# stemmed_array_utf8 = array_utf8.stem_utf8
+#
+# By default, stemming is done in English. To stem in another language, pass the language as a parameter:
+#
+# str_utf8 = 'Chaîne de caractères française en UTF-8'
+#
+# stemmed_str_utf8 = str_utf8.stem_utf8('fr')
+#
+# Or you can change the default configuration:
+#
+# Stemmable::stemmer_default_language = 'fr'
+#
+# stemmed_str_utf8 = str_utf8.stem_utf8
+#
+module Stemmable_utf8
+ include Stemmable
+
+ def stem_utf8(language = nil)
+ require 'iconv'
+ if (language.nil?)
+ language = @@stemmer_default_language
+ stemmer = @@stemmer
+ else
+ stemmer = Stemmer.new(language)
+ end
+ language_encoding = @@UTF8_MAP[language] || 'iso-8859-1'
+ if self.is_a?(String)
+ Iconv.new('utf-8', language_encoding).iconv(stemmer.stem(Iconv.new(language_encoding, 'utf-8').iconv(self)))
+ elsif self.is_a?(Array)
+ temp = []
+ output = []
+ Iconv.open(language_encoding, 'utf-8') do |cd|
+ self.each { |s| temp << cd.iconv(s) + cd.iconv(nil) }
+ end
+ Iconv.open('utf-8', language_encoding) do |cd|
+ stemmer.stem(temp).each { |s| output << cd.iconv(s) + cd.iconv(nil) }
+ end
+ output
+ else
+ raise 'no valid type'
+ end
+ end
+end
+
+class String
+ include Stemmable_utf8
+end
+
+class Array
+ include Stemmable_utf8
+end
Modified: trunk/stemmer4jr/src/java/Stemmer4rService.java (499 => 500)
--- trunk/stemmer4jr/src/java/Stemmer4rService.java 2007-04-25 08:18:06 UTC (rev 499)
+++ trunk/stemmer4jr/src/java/Stemmer4rService.java 2007-04-25 10:28:09 UTC (rev 500)
@@ -27,13 +27,170 @@
***** END LICENSE BLOCK *****/
import java.io.IOException;
+import java.lang.reflect.Method;
+
+import java.util.Map;
+import java.util.HashMap;
+
import org.jruby.Ruby;
+import org.jruby.RubyArray;
import org.jruby.RubyClass;
+import org.jruby.RubyString;
+import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.runtime.CallbackFactory;
import org.jruby.runtime.load.BasicLibraryService;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.danishStemmer;
+import org.tartarus.snowball.ext.dutchStemmer;
+import org.tartarus.snowball.ext.englishStemmer;
+import org.tartarus.snowball.ext.finnishStemmer;
+import org.tartarus.snowball.ext.frenchStemmer;
+import org.tartarus.snowball.ext.germanStemmer;
+import org.tartarus.snowball.ext.hungarianStemmer;
+import org.tartarus.snowball.ext.italianStemmer;
+import org.tartarus.snowball.ext.norwegianStemmer;
+import org.tartarus.snowball.ext.porterStemmer;
+import org.tartarus.snowball.ext.portugueseStemmer;
+import org.tartarus.snowball.ext.romanianStemmer;
+import org.tartarus.snowball.ext.russianStemmer;
+import org.tartarus.snowball.ext.spanishStemmer;
+import org.tartarus.snowball.ext.swedishStemmer;
+import org.tartarus.snowball.ext.turkishStemmer;
+
public class Stemmer4rService implements BasicLibraryService {
+ private final static Map STEMMERS = new HashMap();
+
+ static {
+ STEMMERS.put("danish",danishStemmer.class);
+ STEMMERS.put("da",danishStemmer.class);
+ STEMMERS.put("dutch",dutchStemmer.class);
+ STEMMERS.put("nl",dutchStemmer.class);
+ STEMMERS.put("english",englishStemmer.class);
+ STEMMERS.put("en",englishStemmer.class);
+ STEMMERS.put("finnish",finnishStemmer.class);
+ STEMMERS.put("fi",finnishStemmer.class);
+ STEMMERS.put("french",frenchStemmer.class);
+ STEMMERS.put("fr",frenchStemmer.class);
+ STEMMERS.put("german",germanStemmer.class);
+ STEMMERS.put("de",germanStemmer.class);
+ STEMMERS.put("hungarian",hungarianStemmer.class);
+ STEMMERS.put("hu",hungarianStemmer.class);
+ STEMMERS.put("italian",italianStemmer.class);
+ STEMMERS.put("it",italianStemmer.class);
+ STEMMERS.put("norwegian",norwegianStemmer.class);
+ STEMMERS.put("no",norwegianStemmer.class);
+ STEMMERS.put("porter",porterStemmer.class);
+ STEMMERS.put("portuguese",portugueseStemmer.class);
+ STEMMERS.put("pt",portugueseStemmer.class);
+ STEMMERS.put("romanian",romanianStemmer.class);
+ STEMMERS.put("ro",romanianStemmer.class);
+ STEMMERS.put("russian",russianStemmer.class);
+ STEMMERS.put("ru",russianStemmer.class);
+ STEMMERS.put("spanish",spanishStemmer.class);
+ STEMMERS.put("es",spanishStemmer.class);
+ STEMMERS.put("swedish",swedishStemmer.class);
+ STEMMERS.put("sv",swedishStemmer.class);
+ STEMMERS.put("turkish",turkishStemmer.class);
+ STEMMERS.put("tr",turkishStemmer.class);
+ }
+
+ private static Class getStemmer(String name) {
+ name = name.toLowerCase();
+ if(!STEMMERS.containsKey(name)) {
+ try {
+ Class c = Class.forName("org.tartarus.snowball.ext." + name + "Stemmer");
+ STEMMERS.put(name, c);
+ return c;
+ } catch(Exception e) {
+ return null;
+ }
+ }
+ return (Class)STEMMERS.get(name);
+ }
+
+ private static class SnowballContainer {
+ SnowballProgram snow = null;
+ Method stem = null;
+ private final static Object[] emptyArgs = new Object[0];
+
+ public String stem(String word) {
+ snow.setCurrent(word);
+ try {
+ stem.invoke(snow, emptyArgs);
+ } catch(Exception e) {}
+ return snow.getCurrent();
+ }
+ }
+
+ public static IRubyObject initialize(IRubyObject recv, IRubyObject algorithm) {
+ String algo = algorithm.convertToString().toString();
+ Class cc = getStemmer(algo);
+
+ if(cc == null) {
+ throw recv.getRuntime().newArgumentError("Algorithm '" + algo + "' doesn't exist");
+ }
+
+ SnowballContainer container = new SnowballContainer();
+
+ try {
+ container.snow = (SnowballProgram)cc.newInstance();
+ container.stem = cc.getMethod("stem", new Class[0]);
+ } catch(Exception e) {
+ throw recv.getRuntime().newArgumentError("Algorithm '" + algo + "' doesn't exist");
+ }
+
+ recv.dataWrapStruct(container);
+
+ return recv;
+ }
+
+ public static IRubyObject stem(IRubyObject recv, IRubyObject obj) {
+ IRubyObject ret = null;
+ SnowballContainer stemmer = (SnowballContainer)recv.dataGetStruct();
+ String cword;
+
+ if(obj instanceof RubyString) {
+ cword = obj.toString().toLowerCase();
+ if(cword.indexOf(' ') != -1) {
+ // String of words
+ StringBuffer rets = new StringBuffer();
+ String SEP = "";
+ String[] ss = cword.split(" ");
+ for(int i=0,j=ss.length;i<j;i++) {
+ if(ss[i].length() > 0) {
+ rets.append(SEP);
+ rets.append(stemmer.stem(ss[i]));
+ SEP = " ";
+ }
+ }
+ ret = recv.getRuntime().newString(rets.toString());
+ } else {
+ // One word
+ ret = recv.getRuntime().newString(stemmer.stem(cword));
+ }
+ } else if(obj instanceof RubyArray) {
+ int len = ((RubyArray)obj).getLength();
+ RubyArray ret1 = recv.getRuntime().newArray(len);
+ for(int i=0;i<len;i++) {
+ cword = ((RubyArray)obj).eltInternal(i).convertToString().toString().toLowerCase();
+ ret1.append(recv.getRuntime().newString(stemmer.stem(cword)));
+ }
+ ret = ret1;
+ } else {
+ throw recv.getRuntime().newTypeError("not valid value");
+ }
+
+ return ret;
+ }
+
public boolean basicLoad(final Ruby runtime) throws IOException {
+ RubyClass cStemmer = runtime.defineClass("Stemmer", runtime.getObject(), runtime.getObject().getAllocator());
+ CallbackFactory cf = runtime.callbackFactory(Stemmer4rService.class);
+
+ cStemmer.defineFastMethod("initialize", cf.getFastSingletonMethod("initialize",IRubyObject.class));
+ cStemmer.defineFastMethod("stem", cf.getFastSingletonMethod("stem",IRubyObject.class));
+
return true;
}
}