otis 02/02/13 17:19:07 Modified: src/java/org/apache/lucene/analysis/de GermanStemmer.java Log: - Fixed a bug with substCount variable which was not being reset. - Re-formatted the whole class to match the rest of the code more closely (still differs, but... :)). Revision Changes Path 1.3 +261 -261 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java Index: GermanStemmer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- GermanStemmer.java 10 Dec 2001 21:18:24 -0000 1.2 +++ GermanStemmer.java 14 Feb 2002 01:19:07 -0000 1.3 @@ -1,287 +1,287 @@ package org.apache.lucene.analysis.de; /** - * A stemmer for german words. The algorithm is based on the report + * A stemmer for German words. The algorithm is based on the report * "A Fast and Simple Stemming Algorithm for German Words" by Jörg * Caumanns ([EMAIL PROTECTED]). * * @author Gerhard Schwarz - * @version $Id: GermanStemmer.java,v 1.2 2001/12/10 21:18:24 gschwarz Exp $ + * @version $Id: GermanStemmer.java,v 1.3 2002/02/14 01:19:07 otis Exp $ */ public class GermanStemmer { - - /** - * Buffer for the terms while stemming them. - */ - private StringBuffer sb = new StringBuffer(); - /** - * Indicates if a term is handled as a noun. - */ - private boolean uppercase = false; - /** - * Amount of characters that are removed with <tt>substitute()</tt> while stemming. - */ - private int substCount = 0; - public GermanStemmer() { + /** + * Buffer for the terms while stemming them. + */ + private StringBuffer sb = new StringBuffer(); + /** + * Indicates if a term is handled as a noun. + */ + private boolean uppercase = false; + /** + * Amount of characters that are removed with <tt>substitute()</tt> while stemming. + */ + private int substCount = 0; + + public GermanStemmer() { + } + + /** + * Stemms the given term to an unique <tt>discriminator</tt>. + * + * @param term The term that should be stemmed. + * @return Discriminator for <tt>term</tt> + */ + protected String stem( String term ) { + if ( !isStemmable( term ) ) { + return term; + } + // Mark a possible noun. + if ( Character.isUpperCase( term.charAt( 0 ) ) ) { + uppercase = true; + } + else { + uppercase = false; } - - /** - * Stemms the given term to an unique <tt>discriminator</tt>. - * - * @param term The term that should be stemmed. - * @return Discriminator for <tt>term</tt> - */ - protected String stem( String term ) { - if ( !isStemmable( term ) ) { - return term; - } - // Mark a possible noun. - if ( Character.isUpperCase( term.charAt( 0 ) ) ) { - uppercase = true; + // Use lowercase for medium stemming. + term = term.toLowerCase(); + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + sb = substitute( sb ); + // Nouns have only seven possible suffixes. + if ( uppercase && sb.length() > 3 ) { + if ( sb.substring( sb.length() - 3, sb.length() ).equals( "ern" ) ) { + sb.delete( sb.length() - 3, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "en" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "es" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 's' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + // Additional step for female plurals of professions and inhabitants. + if ( sb.length() > 5 && sb.substring( sb.length() - 3, sb.length() ).equals( "erin*" ) ) { + sb.deleteCharAt( sb.length() -1 ); + } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( sb.charAt( sb.length() - 1 ) == ( 'z' ) ) { + sb.setCharAt( sb.length() - 1, 'x' ); + } + } + // Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all + // other terms. Adjectives, Verbs and Adverbs have a total of 52 different + // possible suffixes, stripping only the characters from they are build + // does mostly the same + else { + // Strip base suffixes as long as enough characters remain. + boolean doMore = true; + while ( sb.length() > 3 && doMore ) { + if ( ( sb.length() + substCount > 5 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "nd" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "em" ) ) { + sb.delete( sb.length() - 2, sb.length() ); + } + else if ( sb.charAt( sb.length() - 1 ) == 't' ) { + sb.deleteCharAt( sb.length() - 1 ); } - else { - uppercase = false; + else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 's' ) { + sb.deleteCharAt( sb.length() - 1 ); + } + else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { + sb.deleteCharAt( sb.length() - 1 ); } - // Use lowercase for medium stemming. - term = term.toLowerCase(); - // Reset the StringBuffer. - sb.delete( 0, sb.length() ); - sb.insert( 0, term ); - sb = substitute( sb ); - // Nouns have only seven possible suffixes. - if ( uppercase && sb.length() > 3 ) { - if ( sb.substring( sb.length() - 3, sb.length() ).equals( "ern" ) ) { - sb.delete( sb.length() - 3, sb.length() ); - } - else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "en" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "es" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else if ( sb.charAt( sb.length() - 1 ) == 's' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - // Additional step for female plurals of professions and inhabitants. - if ( sb.length() > 5 && sb.substring( sb.length() - 3, sb.length() ).equals( "erin*" ) ) { - sb.deleteCharAt( sb.length() -1 ); - } - // Additional step for irregular plural nouns like "Matrizen -> Matrix". - if ( sb.charAt( sb.length() - 1 ) == ( 'z' ) ) { - sb.setCharAt( sb.length() - 1, 'x' ); - } - } - // Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all - // other terms. Adjectives, Verbs and Adverbs have a total of 52 different - // possible suffixes, stripping only the characters from they are build - // does mostly the same else { - // Strip base suffixes as long as enough characters remain. - boolean doMore = true; - while ( sb.length() > 3 && doMore ) { - if ( ( sb.length() + substCount > 5 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "nd" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "em" ) ) { - sb.delete( sb.length() - 2, sb.length() ); - } - else if ( sb.charAt( sb.length() - 1 ) == 't' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else if ( sb.charAt( sb.length() - 1 ) == 'n' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else if ( sb.charAt( sb.length() - 1 ) == 's' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else if ( sb.charAt( sb.length() - 1 ) == 'e' ) { - sb.deleteCharAt( sb.length() - 1 ); - } - else { - doMore = false; - } - } - } - sb = resubstitute( sb ); - if ( !uppercase ) { - sb = removeParticleDenotion( sb ); + doMore = false; } - return sb.toString(); + } } + sb = resubstitute( sb ); + if ( !uppercase ) { + sb = removeParticleDenotion( sb ); + } + return sb.toString(); + } - /** - * Removes a particle denotion ("ge") from a term, but only if at least 3 - * characters will remain. - * - * @return The term without particle denotion, if there was one. - */ - private StringBuffer removeParticleDenotion( StringBuffer buffer ) { - for ( int c = 0; c < buffer.length(); c++ ) { - // Strip from the beginning of the string to the "ge" inclusive - if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) { - buffer.delete( 0, c + 2 ); - } - } - return sb; + /** + * Removes a particle denotion ("ge") from a term, but only if at least 3 + * characters will remain. + * + * @return The term without particle denotion, if there was one. + */ + private StringBuffer removeParticleDenotion( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + // Strip from the beginning of the string to the "ge" inclusive + if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) { + buffer.delete( 0, c + 2 ); + } } + return sb; + } - /** - * Do some substitutions for the term to reduce overstemming: - * - * - Substitute Umlauts with their corresponding vowel: äöü -> aou, - * "ß" is substituted by "ss" - * - Substitute a second char of an pair of equal characters with - * an asterisk: ?? -> ?* - * - Substitute some common character combinations with a token: - * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! - * - * @return The term with all needed substitutions. - */ - private StringBuffer substitute( StringBuffer buffer ) { - for ( int c = 0; c < buffer.length(); c++ ) { - // Replace the second char of a pair of the equal characters with an asterisk - if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { - buffer.setCharAt( c, '*' ); - } - // Substitute Umlauts. - else if ( buffer.charAt( c ) == 'ä' ) { - buffer.setCharAt( c, 'a' ); - } - else if ( buffer.charAt( c ) == 'ö' ) { - buffer.setCharAt( c, 'o' ); - } - else if ( buffer.charAt( c ) == 'ü' ) { - buffer.setCharAt( c, 'u' ); - } - // Take care that at least one character is left left side from the current one - if ( c < buffer.length() - 1 ) { - if ( buffer.charAt( c ) == 'ß' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 's' ); - substCount++; - } - // Masking several common character combinations with an token - else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) { - buffer.setCharAt( c, '$' ); - buffer.delete( c + 1, c + 3 ); - substCount =+ 2; - } - else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { - buffer.setCharAt( c, '§' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { - buffer.setCharAt( c, '%' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { - buffer.setCharAt( c, '&' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { - buffer.setCharAt( c, '#' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { - buffer.setCharAt( c, '!' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - } + /** + * Do some substitutions for the term to reduce overstemming: + * + * - Substitute Umlauts with their corresponding vowel: äöü -> aou, + * "ß" is substituted by "ss" + * - Substitute a second char of an pair of equal characters with + * an asterisk: ?? -> ?* + * - Substitute some common character combinations with a token: + * sch/ch/ei/ie/ig/st -> $/§/%/&/#/! + * + * @return The term with all needed substitutions. + */ + private StringBuffer substitute( StringBuffer buffer ) { + substCount = 0; + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. + else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Take care that at least one character is left left side from the current one + if ( c < buffer.length() - 1 ) { + if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; + } + // Masking several common character combinations with an token + else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' ); + buffer.deleteCharAt( c + 1 ); + substCount++; } - return buffer; + } } + return buffer; + } - /** - * Checks a term if it can be processed correctly. - * - * @return true if, and only if, the given term consists in letters. - */ - private boolean isStemmable( String term ) { - boolean upper = false; - int first = -1; - for ( int c = 0; c < term.length(); c++ ) { - // Discard terms that contain non-letter characters. - if ( !Character.isLetter( term.charAt( c ) ) ) { - return false; - } - // Discard terms that contain multiple uppercase letters. - if ( Character.isUpperCase( term.charAt( c ) ) ) { - if ( upper ) { - return false; - } - // First encountered uppercase letter, set flag and save - // position. - else { - first = c; - upper = true; - } - } - } - // Discard the term if it contains a single uppercase letter that - // is not starting the term. - if ( first > 0 ) { - return false; - } - return true; - } - /** - * Undoes the changes made by substitute(). That are character pairs and - * character combinations. Umlauts will remain as their corresponding vowel, - * as "ß" remains as "ss". - * - * @return The term without the not human readable substitutions. - */ - private StringBuffer resubstitute( StringBuffer buffer ) { - for ( int c = 0; c < buffer.length(); c++ ) { - if ( buffer.charAt( c ) == '*' ) { - char x = buffer.charAt( c - 1 ); - buffer.setCharAt( c, x ); - } - else if ( buffer.charAt( c ) == '$' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); - } - else if ( buffer.charAt( c ) == '§' ) { - buffer.setCharAt( c, 'c' ); - buffer.insert( c + 1, 'h' ); - } - else if ( buffer.charAt( c ) == '%' ) { - buffer.setCharAt( c, 'e' ); - buffer.insert( c + 1, 'i' ); - } - else if ( buffer.charAt( c ) == '&' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'e' ); - } - else if ( buffer.charAt( c ) == '#' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'g' ); - } - else if ( buffer.charAt( c ) == '!' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 't' ); - } + /** + * Checks a term if it can be processed correctly. + * + * @return true if, and only if, the given term consists in letters. + */ + private boolean isStemmable( String term ) { + boolean upper = false; + int first = -1; + for ( int c = 0; c < term.length(); c++ ) { + // Discard terms that contain non-letter characters. + if ( !Character.isLetter( term.charAt( c ) ) ) { + return false; + } + // Discard terms that contain multiple uppercase letters. + if ( Character.isUpperCase( term.charAt( c ) ) ) { + if ( upper ) { + return false; + } + // First encountered uppercase letter, set flag and save + // position. + else { + first = c; + upper = true; } - return buffer; + } } + // Discard the term if it contains a single uppercase letter that + // is not starting the term. + if ( first > 0 ) { + return false; + } + return true; + } + /** + * Undoes the changes made by substitute(). That are character pairs and + * character combinations. Umlauts will remain as their corresponding vowel, + * as "ß" remains as "ss". + * + * @return The term without the not human readable substitutions. + */ + private StringBuffer resubstitute( StringBuffer buffer ) { + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } + return buffer; + } } -
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>