de GermanStemFilter.java GermanStemmer.java WordlistLoader.java

otis Tue, 30 Mar 2004 07:55:14 -0800

otis        2004/03/30 07:54:49

  Modified:    src/java/org/apache/lucene/analysis/de GermanStemFilter.java
                        GermanStemmer.java WordlistLoader.java
  Log:
  - Fixed mixed-up indentation
  
  Revision  Changes    Path
  1.8       +31 -32    jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- GermanStemFilter.java     29 Mar 2004 22:48:01 -0000      1.7
  +++ GermanStemFilter.java     30 Mar 2004 15:54:48 -0000      1.8
  @@ -40,22 +40,21 @@
       private Token token = null;
       private GermanStemmer stemmer = null;
       private Set exclusionSet = null;
  -    
  +
       public GermanStemFilter( TokenStream in )
       {
  -        super(in);
  -     stemmer = new GermanStemmer();
  +      super(in);
  +      stemmer = new GermanStemmer();
       }
  -    
  +
       /**
        * Builds a GermanStemFilter that uses an exclusiontable.
        * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
        */
       public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
       {
  -     this( in );
  -     exclusionSet = new HashSet(exclusiontable.keySet());
  -
  +      this( in );
  +      exclusionSet = new HashSet(exclusiontable.keySet());
       }
   
       /**
  @@ -63,32 +62,32 @@
        */
       public GermanStemFilter( TokenStream in, Set exclusionSet )
       {
  -     this( in );
  -     this.exclusionSet = exclusionSet;
  +      this( in );
  +      this.exclusionSet = exclusionSet;
       }
   
       /**
        * @return  Returns the next token in the stream, or null at EOS
        */
       public final Token next()
  -     throws IOException
  +      throws IOException
       {
  -     if ( ( token = input.next() ) == null ) {
  -         return null;
  -     }
  -     // Check the exclusiontable
  -     else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
  -         return token;
  -     }
  -     else {
  -         String s = stemmer.stem( token.termText() );
  -         // If not stemmed, dont waste the time creating a new token
  -         if ( !s.equals( token.termText() ) ) {
  -             return new Token( s, token.startOffset(),
  -                 token.endOffset(), token.type() );
  -         }
  -         return token;
  -     }
  +      if ( ( token = input.next() ) == null ) {
  +        return null;
  +      }
  +      // Check the exclusiontable
  +      else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
  +        return token;
  +      }
  +      else {
  +        String s = stemmer.stem( token.termText() );
  +        // If not stemmed, dont waste the time creating a new token
  +        if ( !s.equals( token.termText() ) ) {
  +          return new Token( s, token.startOffset(),
  +            token.endOffset(), token.type() );
  +        }
  +        return token;
  +      }
       }
   
       /**
  @@ -96,9 +95,9 @@
        */
       public void setStemmer( GermanStemmer stemmer )
       {
  -     if ( stemmer != null ) {
  -         this.stemmer = stemmer;
  -     }
  +      if ( stemmer != null ) {
  +        this.stemmer = stemmer;
  +      }
       }
   
       /**
  @@ -107,7 +106,7 @@
        */
       public void setExclusionTable( Hashtable exclusiontable )
       {
  -     exclusionSet = new HashSet(exclusiontable.keySet());
  +      exclusionSet = new HashSet(exclusiontable.keySet());
       }
   
       /**
  @@ -115,6 +114,6 @@
        */
       public void setExclusionSet( Set exclusionSet )
       {
  -     this.exclusionSet = exclusionSet;
  +      this.exclusionSet = exclusionSet;
       }
   }
  
  
  
  1.10      +157 -156  jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java
  
  Index: GermanStemmer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemmer.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- GermanStemmer.java        30 Mar 2004 15:44:58 -0000      1.9
  +++ GermanStemmer.java        30 Mar 2004 15:54:48 -0000      1.10
  @@ -44,20 +44,20 @@
        */
       protected String stem( String term )
       {
  -     // Use lowercase for medium stemming.
  -     term = term.toLowerCase();
  -     if ( !isStemmable( term ) )
  -         return term;
  -     // Reset the StringBuffer.
  -     sb.delete( 0, sb.length() );
  -     sb.insert( 0, term );
  -     // Stemming starts here...
  -     substitute( sb );
  -     strip( sb );
  -     optimize( sb );
  -     resubstitute( sb );
  -     removeParticleDenotion( sb );
  -     return sb.toString();
  +      // Use lowercase for medium stemming.
  +      term = term.toLowerCase();
  +      if ( !isStemmable( term ) )
  +        return term;
  +      // Reset the StringBuffer.
  +      sb.delete( 0, sb.length() );
  +      sb.insert( 0, term );
  +      // Stemming starts here...
  +      substitute( sb );
  +      strip( sb );
  +      optimize( sb );
  +      resubstitute( sb );
  +      removeParticleDenotion( sb );
  +      return sb.toString();
       }
   
       /**
  @@ -67,10 +67,11 @@
        */
       private boolean isStemmable( String term )
       {
  -     for ( int c = 0; c < term.length(); c++ ) {
  -         if ( !Character.isLetter( term.charAt( c ) ) ) return false;
  -     }
  -     return true;
  +      for ( int c = 0; c < term.length(); c++ ) {
  +        if ( !Character.isLetter( term.charAt( c ) ) )
  +          return false;
  +      }
  +      return true;
       }
   
       /**
  @@ -83,38 +84,38 @@
        */
       private void strip( StringBuffer buffer )
       {
  -     boolean doMore = true;
  -     while ( doMore && buffer.length() > 3 ) {
  -         if ( ( buffer.length() + substCount > 5 ) &&
  -             buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
  -         {
  -             buffer.delete( buffer.length() - 2, buffer.length() );
  -         }
  -         else if ( ( buffer.length() + substCount > 4 ) &&
  -             buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  -             buffer.delete( buffer.length() - 2, buffer.length() );
  -         }
  -         else if ( ( buffer.length() + substCount > 4 ) &&
  -             buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  -             buffer.delete( buffer.length() - 2, buffer.length() );
  -         }
  -         else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  -             buffer.deleteCharAt( buffer.length() - 1 );
  -         }
  -         else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  -             buffer.deleteCharAt( buffer.length() - 1 );
  -         }
  -         else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  -             buffer.deleteCharAt( buffer.length() - 1 );
  -         }
  -         // "t" occurs only as suffix of verbs.
  -         else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
  -             buffer.deleteCharAt( buffer.length() - 1 );
  -         }
  -         else {
  -             doMore = false;
  -         }
  -     }
  +      boolean doMore = true;
  +      while ( doMore && buffer.length() > 3 ) {
  +        if ( ( buffer.length() + substCount > 5 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
  +        {
  +          buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( ( buffer.length() + substCount > 4 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
  +            buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( ( buffer.length() + substCount > 4 ) &&
  +          buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
  +            buffer.delete( buffer.length() - 2, buffer.length() );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        // "t" occurs only as suffix of verbs.
  +        else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
  +          buffer.deleteCharAt( buffer.length() - 1 );
  +        }
  +        else {
  +          doMore = false;
  +        }
  +      }
       }
   
       /**
  @@ -123,15 +124,15 @@
        */
       private void optimize( StringBuffer buffer )
       {
  -     // Additional step for female plurals of professions and inhabitants.
  -     if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  -         buffer.deleteCharAt( buffer.length() -1 );
  -         strip( buffer );
  -     }
  -     // Additional step for irregular plural nouns like "Matrizen -> Matrix".
  -     if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  -         buffer.setCharAt( buffer.length() - 1, 'x' );
  -     }
  +      // Additional step for female plurals of professions and inhabitants.
  +      if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
  +        buffer.deleteCharAt( buffer.length() -1 );
  +        strip( buffer );
  +      }
  +      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
  +      if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
  +        buffer.setCharAt( buffer.length() - 1, 'x' );
  +      }
       }
   
       /**
  @@ -139,14 +140,14 @@
        */
       private void removeParticleDenotion( StringBuffer buffer )
       {
  -     if ( buffer.length() > 4 ) {
  -         for ( int c = 0; c < buffer.length() - 3; c++ ) {
  -             if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  -                 buffer.delete( c, c + 2 );
  -                 return;
  -             }
  -         }
  -     }
  +      if ( buffer.length() > 4 ) {
  +        for ( int c = 0; c < buffer.length() - 3; c++ ) {
  +          if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
  +            buffer.delete( c, c + 2 );
  +            return;
  +          }
  +        }
  +      }
       }
   
       /**
  @@ -161,64 +162,64 @@
        */
       private void substitute( StringBuffer buffer )
       {
  -     substCount = 0;
  -     for ( int c = 0; c < buffer.length(); c++ ) {
  -         // Replace the second char of a pair of the equal characters with an asterisk
  -         if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  -             buffer.setCharAt( c, '*' );
  -         }
  -         // Substitute Umlauts.
  -         else if ( buffer.charAt( c ) == 'ä' ) {
  -             buffer.setCharAt( c, 'a' );
  -         }
  -         else if ( buffer.charAt( c ) == 'ö' ) {
  -             buffer.setCharAt( c, 'o' );
  -         }
  -         else if ( buffer.charAt( c ) == 'ü' ) {
  -             buffer.setCharAt( c, 'u' );
  -         }
  -         // Take care that at least one character is left left side from the current one
  -         if ( c < buffer.length() - 1 ) {
  -             if ( buffer.charAt( c ) == 'ß' ) {
  -                 buffer.setCharAt( c, 's' );
  -                 buffer.insert( c + 1, 's' );
  -                 substCount++;
  -             }
  -             // Masking several common character combinations with an token
  -             else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
  -                 buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
  -             {
  -                 buffer.setCharAt( c, '$' );
  -                 buffer.delete( c + 1, c + 3 );
  -                 substCount =+ 2;
  -             }
  -             else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  -                 buffer.setCharAt( c, '§' );
  -                 buffer.deleteCharAt( c + 1 );
  -                 substCount++;
  -             }
  -             else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  -                 buffer.setCharAt( c, '%' );
  -                 buffer.deleteCharAt( c + 1 );
  -                 substCount++;
  -             }
  -             else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  -                 buffer.setCharAt( c, '&' );
  -                 buffer.deleteCharAt( c + 1 );
  -                 substCount++;
  -             }
  -             else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  -                 buffer.setCharAt( c, '#' );
  -                 buffer.deleteCharAt( c + 1 );
  -                 substCount++;
  -             }
  -             else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  -                 buffer.setCharAt( c, '!' );
  -                 buffer.deleteCharAt( c + 1 );
  -                 substCount++;
  -             }
  -         }
  -     }
  +      substCount = 0;
  +      for ( int c = 0; c < buffer.length(); c++ ) {
  +        // Replace the second char of a pair of the equal characters with an asterisk
  +        if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
  +          buffer.setCharAt( c, '*' );
  +        }
  +        // Substitute Umlauts.
  +        else if ( buffer.charAt( c ) == 'ä' ) {
  +          buffer.setCharAt( c, 'a' );
  +        }
  +        else if ( buffer.charAt( c ) == 'ö' ) {
  +          buffer.setCharAt( c, 'o' );
  +        }
  +        else if ( buffer.charAt( c ) == 'ü' ) {
  +          buffer.setCharAt( c, 'u' );
  +        }
  +        // Take care that at least one character is left left side from the current one
  +        if ( c < buffer.length() - 1 ) {
  +          if ( buffer.charAt( c ) == 'ß' ) {
  +            buffer.setCharAt( c, 's' );
  +            buffer.insert( c + 1, 's' );
  +            substCount++;
  +          }
  +          // Masking several common character combinations with an token
  +          else if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
  +            buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
  +          {
  +            buffer.setCharAt( c, '$' );
  +            buffer.delete( c + 1, c + 3 );
  +            substCount =+ 2;
  +          }
  +          else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
  +            buffer.setCharAt( c, '§' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
  +            buffer.setCharAt( c, '%' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
  +            buffer.setCharAt( c, '&' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
  +            buffer.setCharAt( c, '#' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +          else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
  +            buffer.setCharAt( c, '!' );
  +            buffer.deleteCharAt( c + 1 );
  +            substCount++;
  +          }
  +        }
  +      }
       }
   
       /**
  @@ -228,35 +229,35 @@
        */
       private void resubstitute( StringBuffer buffer )
       {
  -     for ( int c = 0; c < buffer.length(); c++ ) {
  -         if ( buffer.charAt( c ) == '*' ) {
  -             char x = buffer.charAt( c - 1 );
  -             buffer.setCharAt( c, x );
  -         }
  -         else if ( buffer.charAt( c ) == '$' ) {
  -             buffer.setCharAt( c, 's' );
  -             buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  -         }
  -         else if ( buffer.charAt( c ) == '§' ) {
  -             buffer.setCharAt( c, 'c' );
  -             buffer.insert( c + 1, 'h' );
  -         }
  -         else if ( buffer.charAt( c ) == '%' ) {
  -             buffer.setCharAt( c, 'e' );
  -             buffer.insert( c + 1, 'i' );
  -         }
  -         else if ( buffer.charAt( c ) == '&' ) {
  -             buffer.setCharAt( c, 'i' );
  -             buffer.insert( c + 1, 'e' );
  -         }
  -         else if ( buffer.charAt( c ) == '#' ) {
  -             buffer.setCharAt( c, 'i' );
  -             buffer.insert( c + 1, 'g' );
  -         }
  -         else if ( buffer.charAt( c ) == '!' ) {
  -             buffer.setCharAt( c, 's' );
  -             buffer.insert( c + 1, 't' );
  -         }
  -     }
  +      for ( int c = 0; c < buffer.length(); c++ ) {
  +        if ( buffer.charAt( c ) == '*' ) {
  +          char x = buffer.charAt( c - 1 );
  +          buffer.setCharAt( c, x );
  +        }
  +        else if ( buffer.charAt( c ) == '$' ) {
  +          buffer.setCharAt( c, 's' );
  +          buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
  +        }
  +        else if ( buffer.charAt( c ) == '§' ) {
  +          buffer.setCharAt( c, 'c' );
  +          buffer.insert( c + 1, 'h' );
  +        }
  +        else if ( buffer.charAt( c ) == '%' ) {
  +          buffer.setCharAt( c, 'e' );
  +          buffer.insert( c + 1, 'i' );
  +        }
  +        else if ( buffer.charAt( c ) == '&' ) {
  +          buffer.setCharAt( c, 'i' );
  +          buffer.insert( c + 1, 'e' );
  +        }
  +        else if ( buffer.charAt( c ) == '#' ) {
  +          buffer.setCharAt( c, 'i' );
  +          buffer.insert( c + 1, 'g' );
  +        }
  +        else if ( buffer.charAt( c ) == '!' ) {
  +          buffer.setCharAt( c, 's' );
  +          buffer.insert( c + 1, 't' );
  +        }
  +      }
       }
   }
  
  
  
  1.10      +2 -2      jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- WordlistLoader.java       30 Mar 2004 15:44:58 -0000      1.9
  +++ WordlistLoader.java       30 Mar 2004 15:54:48 -0000      1.10
  @@ -53,8 +53,8 @@
         String word = null;
         while ((word = lnr.readLine()) != null) {
           result.add(word.trim());
  -        }
         }
  +    }
       finally {
         if (lnr != null)
           lnr.close();


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/de GermanStemFilter.java GermanStemmer.java WordlistLoader.java

Reply via email to