otis 2004/03/30 07:54:49 Modified: src/java/org/apache/lucene/analysis/de GermanStemFilter.java GermanStemmer.java WordlistLoader.java Log: - Fixed mixed-up indentation Revision Changes Path 1.8 +31 -32 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java Index: GermanStemFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- GermanStemFilter.java 29 Mar 2004 22:48:01 -0000 1.7 +++ GermanStemFilter.java 30 Mar 2004 15:54:48 -0000 1.8 @@ -40,22 +40,21 @@ private Token token = null; private GermanStemmer stemmer = null; private Set exclusionSet = null; - + public GermanStemFilter( TokenStream in ) { - super(in); - stemmer = new GermanStemmer(); + super(in); + stemmer = new GermanStemmer(); } - + /** * Builds a GermanStemFilter that uses an exclusiontable. * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead. 
*/ public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { - this( in ); - exclusionSet = new HashSet(exclusiontable.keySet()); - + this( in ); + exclusionSet = new HashSet(exclusiontable.keySet()); } /** @@ -63,32 +62,32 @@ */ public GermanStemFilter( TokenStream in, Set exclusionSet ) { - this( in ); - this.exclusionSet = exclusionSet; + this( in ); + this.exclusionSet = exclusionSet; } /** * @return Returns the next token in the stream, or null at EOS */ public final Token next() - throws IOException + throws IOException { - if ( ( token = input.next() ) == null ) { - return null; - } - // Check the exclusiontable - else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { - return token; - } - else { - String s = stemmer.stem( token.termText() ); - // If not stemmed, dont waste the time creating a new token - if ( !s.equals( token.termText() ) ) { - return new Token( s, token.startOffset(), - token.endOffset(), token.type() ); - } - return token; - } + if ( ( token = input.next() ) == null ) { + return null; + } + // Check the exclusiontable + else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { + return token; + } + else { + String s = stemmer.stem( token.termText() ); + // If not stemmed, dont waste the time creating a new token + if ( !s.equals( token.termText() ) ) { + return new Token( s, token.startOffset(), + token.endOffset(), token.type() ); + } + return token; + } } /** @@ -96,9 +95,9 @@ */ public void setStemmer( GermanStemmer stemmer ) { - if ( stemmer != null ) { - this.stemmer = stemmer; - } + if ( stemmer != null ) { + this.stemmer = stemmer; + } } /** @@ -107,7 +106,7 @@ */ public void setExclusionTable( Hashtable exclusiontable ) { - exclusionSet = new HashSet(exclusiontable.keySet()); + exclusionSet = new HashSet(exclusiontable.keySet()); } /** @@ -115,6 +114,6 @@ */ public void setExclusionSet( Set exclusionSet ) { - this.exclusionSet = exclusionSet; + this.exclusionSet = 
exclusionSet; } } 1.10 +157 -156 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java Index: GermanStemmer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- GermanStemmer.java 30 Mar 2004 15:44:58 -0000 1.9 +++ GermanStemmer.java 30 Mar 2004 15:54:48 -0000 1.10 @@ -44,20 +44,20 @@ */ protected String stem( String term ) { - // Use lowercase for medium stemming. - term = term.toLowerCase(); - if ( !isStemmable( term ) ) - return term; - // Reset the StringBuffer. - sb.delete( 0, sb.length() ); - sb.insert( 0, term ); - // Stemming starts here... - substitute( sb ); - strip( sb ); - optimize( sb ); - resubstitute( sb ); - removeParticleDenotion( sb ); - return sb.toString(); + // Use lowercase for medium stemming. + term = term.toLowerCase(); + if ( !isStemmable( term ) ) + return term; + // Reset the StringBuffer. + sb.delete( 0, sb.length() ); + sb.insert( 0, term ); + // Stemming starts here... 
+ substitute( sb ); + strip( sb ); + optimize( sb ); + resubstitute( sb ); + removeParticleDenotion( sb ); + return sb.toString(); } /** @@ -67,10 +67,11 @@ */ private boolean isStemmable( String term ) { - for ( int c = 0; c < term.length(); c++ ) { - if ( !Character.isLetter( term.charAt( c ) ) ) return false; - } - return true; + for ( int c = 0; c < term.length(); c++ ) { + if ( !Character.isLetter( term.charAt( c ) ) ) + return false; + } + return true; } /** @@ -83,38 +84,38 @@ */ private void strip( StringBuffer buffer ) { - boolean doMore = true; - while ( doMore && buffer.length() > 3 ) { - if ( ( buffer.length() + substCount > 5 ) && - buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) - { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( ( buffer.length() + substCount > 4 ) && - buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( ( buffer.length() + substCount > 4 ) && - buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { - buffer.delete( buffer.length() - 2, buffer.length() ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - // "t" occurs only as suffix of verbs. 
- else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) { - buffer.deleteCharAt( buffer.length() - 1 ); - } - else { - doMore = false; - } - } + boolean doMore = true; + while ( doMore && buffer.length() > 3 ) { + if ( ( buffer.length() + substCount > 5 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) ) + { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( ( buffer.length() + substCount > 4 ) && + buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) { + buffer.delete( buffer.length() - 2, buffer.length() ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + // "t" occurs only as suffix of verbs. + else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) { + buffer.deleteCharAt( buffer.length() - 1 ); + } + else { + doMore = false; + } + } } /** @@ -123,15 +124,15 @@ */ private void optimize( StringBuffer buffer ) { - // Additional step for female plurals of professions and inhabitants. - if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { - buffer.deleteCharAt( buffer.length() -1 ); - strip( buffer ); - } - // Additional step for irregular plural nouns like "Matrizen -> Matrix". - if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { - buffer.setCharAt( buffer.length() - 1, 'x' ); - } + // Additional step for female plurals of professions and inhabitants. 
+ if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) { + buffer.deleteCharAt( buffer.length() -1 ); + strip( buffer ); + } + // Additional step for irregular plural nouns like "Matrizen -> Matrix". + if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) { + buffer.setCharAt( buffer.length() - 1, 'x' ); + } } /** @@ -139,14 +140,14 @@ */ private void removeParticleDenotion( StringBuffer buffer ) { - if ( buffer.length() > 4 ) { - for ( int c = 0; c < buffer.length() - 3; c++ ) { - if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { - buffer.delete( c, c + 2 ); - return; - } - } - } + if ( buffer.length() > 4 ) { + for ( int c = 0; c < buffer.length() - 3; c++ ) { + if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) { + buffer.delete( c, c + 2 ); + return; + } + } + } } /** @@ -161,64 +162,64 @@ */ private void substitute( StringBuffer buffer ) { - substCount = 0; - for ( int c = 0; c < buffer.length(); c++ ) { - // Replace the second char of a pair of the equal characters with an asterisk - if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { - buffer.setCharAt( c, '*' ); - } - // Substitute Umlauts. 
- else if ( buffer.charAt( c ) == 'ä' ) { - buffer.setCharAt( c, 'a' ); - } - else if ( buffer.charAt( c ) == 'ö' ) { - buffer.setCharAt( c, 'o' ); - } - else if ( buffer.charAt( c ) == 'ü' ) { - buffer.setCharAt( c, 'u' ); - } - // Take care that at least one character is left left side from the current one - if ( c < buffer.length() - 1 ) { - if ( buffer.charAt( c ) == 'ß' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 's' ); - substCount++; - } - // Masking several common character combinations with an token - else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && - buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) - { - buffer.setCharAt( c, '$' ); - buffer.delete( c + 1, c + 3 ); - substCount =+ 2; - } - else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { - buffer.setCharAt( c, '§' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { - buffer.setCharAt( c, '%' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { - buffer.setCharAt( c, '&' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { - buffer.setCharAt( c, '#' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { - buffer.setCharAt( c, '!' ); - buffer.deleteCharAt( c + 1 ); - substCount++; - } - } - } + substCount = 0; + for ( int c = 0; c < buffer.length(); c++ ) { + // Replace the second char of a pair of the equal characters with an asterisk + if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) { + buffer.setCharAt( c, '*' ); + } + // Substitute Umlauts. 
+ else if ( buffer.charAt( c ) == 'ä' ) { + buffer.setCharAt( c, 'a' ); + } + else if ( buffer.charAt( c ) == 'ö' ) { + buffer.setCharAt( c, 'o' ); + } + else if ( buffer.charAt( c ) == 'ü' ) { + buffer.setCharAt( c, 'u' ); + } + // Take care that at least one character is left left side from the current one + if ( c < buffer.length() - 1 ) { + if ( buffer.charAt( c ) == 'ß' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 's' ); + substCount++; + } + // Masking several common character combinations with an token + else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' && + buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' ) + { + buffer.setCharAt( c, '$' ); + buffer.delete( c + 1, c + 3 ); + substCount =+ 2; + } + else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) { + buffer.setCharAt( c, '§' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) { + buffer.setCharAt( c, '%' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) { + buffer.setCharAt( c, '&' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) { + buffer.setCharAt( c, '#' ); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) { + buffer.setCharAt( c, '!' 
); + buffer.deleteCharAt( c + 1 ); + substCount++; + } + } + } } /** @@ -228,35 +229,35 @@ */ private void resubstitute( StringBuffer buffer ) { - for ( int c = 0; c < buffer.length(); c++ ) { - if ( buffer.charAt( c ) == '*' ) { - char x = buffer.charAt( c - 1 ); - buffer.setCharAt( c, x ); - } - else if ( buffer.charAt( c ) == '$' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); - } - else if ( buffer.charAt( c ) == '§' ) { - buffer.setCharAt( c, 'c' ); - buffer.insert( c + 1, 'h' ); - } - else if ( buffer.charAt( c ) == '%' ) { - buffer.setCharAt( c, 'e' ); - buffer.insert( c + 1, 'i' ); - } - else if ( buffer.charAt( c ) == '&' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'e' ); - } - else if ( buffer.charAt( c ) == '#' ) { - buffer.setCharAt( c, 'i' ); - buffer.insert( c + 1, 'g' ); - } - else if ( buffer.charAt( c ) == '!' ) { - buffer.setCharAt( c, 's' ); - buffer.insert( c + 1, 't' ); - } - } + for ( int c = 0; c < buffer.length(); c++ ) { + if ( buffer.charAt( c ) == '*' ) { + char x = buffer.charAt( c - 1 ); + buffer.setCharAt( c, x ); + } + else if ( buffer.charAt( c ) == '$' ) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 ); + } + else if ( buffer.charAt( c ) == '§' ) { + buffer.setCharAt( c, 'c' ); + buffer.insert( c + 1, 'h' ); + } + else if ( buffer.charAt( c ) == '%' ) { + buffer.setCharAt( c, 'e' ); + buffer.insert( c + 1, 'i' ); + } + else if ( buffer.charAt( c ) == '&' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'e' ); + } + else if ( buffer.charAt( c ) == '#' ) { + buffer.setCharAt( c, 'i' ); + buffer.insert( c + 1, 'g' ); + } + else if ( buffer.charAt( c ) == '!' 
) { + buffer.setCharAt( c, 's' ); + buffer.insert( c + 1, 't' ); + } + } } } 1.10 +2 -2 jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java Index: WordlistLoader.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- WordlistLoader.java 30 Mar 2004 15:44:58 -0000 1.9 +++ WordlistLoader.java 30 Mar 2004 15:54:48 -0000 1.10 @@ -53,8 +53,8 @@ String word = null; while ((word = lnr.readLine()) != null) { result.add(word.trim()); - } } + } finally { if (lnr != null) lnr.close();
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]