hi Denis,
thanks for your reply. OffsetAttribute gives the character position, whereas
I was looking for the token position. I ended up adding the attached
PositionAttribute / PositionAttributeImpl / PositionFilter.
as it turned out, though, I didn't need that attribute, since there was an
easier way to "fix" my shingle filter so that it does not create shingles
across separate calls that add data: all I had to do was reset the buffer
when input.incrementToken() returns false.
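in case a sketch helps clarify what I mean by "reset the buffer", here is
the idea in plain Java with the Lucene plumbing stripped out (the class and
method names below are just illustrative, not from my attached code):

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;

/** Illustrative sketch: a sliding window of recent terms that is cleared
 *  when the underlying input is exhausted, so that no shingle is ever
 *  built across two separate inputs. */
class TermWindow {
    private final Deque<String> window = new ArrayDeque<String>();
    private final int maxSize;
    private Iterator<String> input;

    TermWindow(int maxSize) { this.maxSize = maxSize; }

    void setInput(Iterator<String> input) { this.input = input; }

    /** Returns the next term, or null at end of stream (clearing the window). */
    String next() {
        if (!input.hasNext()) {
            window.clear();            // the "reset the buffer" fix
            return null;
        }
        String term = input.next();
        window.addLast(term);
        if (window.size() > maxSize)
            window.removeFirst();      // keep at most maxSize recent terms
        return term;
    }

    int windowSize() { return window.size(); }
}
```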
also attached is my implementation of the shingle filter (I named it
ShingaFilter to avoid confusion with the original ShingleFilter).
if anyone is "bored" and can look at my code, I'd appreciate any critique
-- especially if I'm doing something horribly wrong.
thanks,
Igal
On 1/10/2013 4:24 PM, Denis Bazhenov wrote:
What you are looking for is OffsetAttribute. Also consider the possibility of using
ShingleFilter with a position increment > 1 and then filtering out tokens containing
"_" (underscore). That would be easier, I guess.
On Jan 11, 2013, at 7:14 AM, Igal @ getRailo.org <[email protected]> wrote:
hi all,
how can I get a token's position from the TokenStream / Tokenizer / Analyzer?
I know that there are a PositionIncrementAttribute and a
PositionLengthAttribute, but is there an easy way to get the token
position, or do I need to implement my own attribute on top of one of the
attributes mentioned above?
the reason I need it is that I wrote an implementation of a ShingleFilter
which breaks shingles at punctuation, so the tokens of [token number one,
word two] will create the shingles [ "token number", "number one",
"word two" ] -- but not [ "one word" ] because of the comma. I want it to
break shingles at position-increment gaps as well.
thanks,
Igal
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
---
Denis Bazhenov <[email protected]>
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package s21waf.text.lucene4;
import org.apache.lucene.util.Attribute;
/**
*
* @author Igal
*/
public interface PositionAttribute extends Attribute {
public void setPosition( int value );
public int getPosition();
}
package s21waf.text.lucene4;
import org.apache.lucene.util.AttributeImpl;
/**
*
* @author Igal
*/
public class PositionAttributeImpl extends AttributeImpl implements PositionAttribute {
int position = 0;
@Override
public void clear() {
this.position = 0;
}
@Override
public void copyTo(AttributeImpl ai) {
((PositionAttribute)ai).setPosition( this.position );
}
public void setPosition(int value) {
this.position = value;
}
public int getPosition() {
return this.position;
}
}
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package s21waf.text.lucene4;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
*
* @author Igal
*/
public class PositionFilter extends TokenFilter {
int pos = 0;
private final PositionIncrementAttribute posIncAtt = addAttribute(
PositionIncrementAttribute.class );
private final PositionAttribute posAtt = addAttribute(
PositionAttribute.class );
public PositionFilter( TokenStream in ) {
super( in );
}
@Override
public void reset() throws IOException {
super.reset();
this.pos = 0; // start over when the stream is reused
}
@Override
public boolean incrementToken() throws IOException {
if ( !input.incrementToken() )
return false;
this.pos += posIncAtt.getPositionIncrement();
posAtt.setPosition( this.pos );
return true;
}
}
package s21waf.text.lucene4;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import s21waf.text.lucene4.LuceneUtils.TokenAttributes;
/**
*
* @author Igal
*/
public class ShingaFilter extends TokenFilter {
public static final String TOKEN_TYPE_SHINGA = LuceneUtils.TYPE_SHINGLE;
public static final String PHRASE_BREAKERS = ",;.:!()[]{}"; // if a term ends with one of these, the phrase shingle breaks
private final LinkedList<String> terms;
private final LinkedList<String> queue;
private final int maxShingleSize;
private int minShingleSize = 2;
private AttributeSource.State capturedState;
private final CharTermAttribute termAtt = addAttribute(
CharTermAttribute.class );
private final PositionIncrementAttribute posIncAtt = addAttribute(
PositionIncrementAttribute.class );
private final TypeAttribute typeAtt = addAttribute( TypeAttribute.class );
private final PunctuationAttribute puncAtt = addAttribute(
PunctuationAttribute.class );
private final PositionAttribute posAtt = addAttribute(
PositionAttribute.class );
private String[] addedTerms;
private int pos = 0;
public ShingaFilter( TokenStream input, int maxShingleSize ) {
super( input );
this.maxShingleSize = maxShingleSize;
terms = new LinkedList<String>();
queue = new LinkedList<String>();
addedTerms = new String[ maxShingleSize * ( maxShingleSize - 1 ) ];
}
public ShingaFilter( TokenStream input ) {
this( input, 3 );
}
@Override
public boolean incrementToken() throws IOException {
if ( !queue.isEmpty() ) {
// if we have values from the previous iteration, return them one at a time with a restored state and no position increment
String phrase = queue.poll();
restoreState( capturedState );
typeAtt.setType( TOKEN_TYPE_SHINGA );
termAtt.setEmpty();
termAtt.append( phrase );
posIncAtt.setPositionIncrement( 0 );
return true;
}
if ( !input.incrementToken() ) {
// end of stream
// System.out.println( this.getClass().getName() + ": EOS. Terms: " + terms.toString() );
terms.clear(); // reset terms in case the tokenizer is called again, so that we don't create shingles between separate entries
return false;
}
/*/ probably don't need this
int tokenPos = posAtt.getPosition();
if ( tokenPos > 0 ) { // if PositionFilter is not in the filter chain then tokenPos will always be 0
System.out.println( this.getClass().getName() + ": " + tokenPos );
} //*/
if ( addPhrasesToStack() ) // add phrases to the queue
capturedState = captureState(); // capture state if any phrases were added
return true; // return the current token
}
boolean addPhrasesToStack() {
boolean doBreakPhrase = false;
String term = termAtt.toString();
int first = term.codePointAt( 0 );
int punc = puncAtt.getPunctuation();
// if ( punc != PunctuationAttribute.NONE ) System.out.println( "+" + punc ); // debug
if ( PHRASE_BREAKERS.indexOf( first ) > -1 ) {
terms.clear();
term = removePunc( term );
termAtt.setEmpty();
termAtt.append( term );
}
if ( !term.isEmpty() ) { // in case removePunc cleared the term completely
int last = term.codePointAt( term.length() - 1 );
if ( PHRASE_BREAKERS.indexOf( last ) > -1 ) {
doBreakPhrase = true; // set flag to clear the terms queue after adding this term
termAtt.setLength( term.length() - 1 );
term = termAtt.toString();
}
if ( useTerm() ) {
terms.add( term );
if ( terms.size() > maxShingleSize )
terms.removeFirst();
queue.addAll( getPhrases() );
}
}
if ( doBreakPhrase )
terms.clear();
return !queue.isEmpty();
}
boolean useTerm() {
String type = typeAtt.type();
return ( !type.equals( LuceneUtils.TYPE_SYNONYM ) );
}
List<String> getPhrases() {
if ( terms.size() < minShingleSize )
return Collections.<String>emptyList();
List<String> results = new ArrayList<String>();
for ( int j = minShingleSize; j <= terms.size(); j++ ) { // phrase sizes from minShingleSize up to the window size
StringBuilder sb = new StringBuilder( maxShingleSize * 10 );
for ( int i=0; i<j; i++ ) {
sb.append( terms.get( i ) );
if ( i < j - 1 )
sb.append( " " );
}
String phrase = sb.toString();
if ( !isAlreadyAdded( phrase ) ) {
results.add( phrase );
addedTerms[ pos++ % addedTerms.length ] = phrase;
}
}
return results;
}
boolean isAlreadyAdded( String term ) {
for ( String t : addedTerms ) {
if ( term.equalsIgnoreCase( t ) )
return true;
}
return false;
}
/** test methods below */
public static String printTokens( String phrase, List<Map> list, boolean doPrint ) {
StringBuilder sb = new StringBuilder();
for (Map m : list) {
int pos = Integer.parseInt( m.get( "position").toString() );
String term = m.get( "term" ).toString();
sb.append( term ).append( " (" ).append( pos ).append( ") " );
}
if ( doPrint )
System.out.println( sb.toString() );
return sb.toString();
}
public static void main( String[] args ) throws Exception {
String test = "the quick, somewhat ugly, brown fox jumps over the lazy dog";
// test = "the quick";
TokenStream ts = new WhitespaceTokenizer( LuceneUtils.VERSION, new StringReader( test ) );
ts = new ShingaFilter( ts );
List<TokenAttributes> terms = LuceneUtils.getTokens( ts );
for ( TokenAttributes ta : terms )
System.out.println("\t" + ta.term + "\t\t" + ta.attr);
}
private String removePunc(String term) {
StringBuilder sb = new StringBuilder();
char[] buffer = term.toCharArray();
for ( char c : buffer ) {
/*/
if ( Character.isWhitespace( c ) && sb.length() == 0 )
continue; //*/
if ( PHRASE_BREAKERS.indexOf( c ) == -1 )
sb.append( c );
}
return sb.toString();
}
}
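for anyone skimming the attachment, the core of the shingle logic (sliding
window of terms, break the phrase at trailing punctuation) boils down to the
plain-Java sketch below. note it builds shingles ending at the current term,
which is slightly different from the prefix enumeration in getPhrases(), but
the punctuation handling is the same; the class name is just illustrative:

```java
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

/** Illustrative sketch of ShingaFilter's core: shingles over a sliding
 *  window of terms, with the window cleared after any term that ends in
 *  phrase-breaking punctuation. */
class ShingleSketch {
    static final String PHRASE_BREAKERS = ",;.:!()[]{}";

    static List<String> shingles(String[] tokens, int minSize, int maxSize) {
        List<String> results = new ArrayList<String>();
        LinkedList<String> window = new LinkedList<String>();
        for (String token : tokens) {
            boolean breakAfter = false;
            char last = token.charAt(token.length() - 1);
            if (PHRASE_BREAKERS.indexOf(last) > -1) {
                breakAfter = true;                              // break the phrase after this term,
                token = token.substring(0, token.length() - 1); // minus the punctuation mark
            }
            if (!token.isEmpty()) {                             // skip punctuation-only tokens
                window.add(token);
                if (window.size() > maxSize)
                    window.removeFirst();                       // keep at most maxSize recent terms
                // emit every shingle of size [minSize, window size] ending at this term
                for (int size = minSize; size <= window.size(); size++)
                    results.add(String.join(" ",
                            window.subList(window.size() - size, window.size())));
            }
            if (breakAfter)
                window.clear();                                 // no shingles across punctuation
        }
        return results;
    }
}
```

with the example from my earlier mail, shingles(new String[]{ "token",
"number", "one,", "word", "two" }, 2, 2) yields [token number, number one,
word two] -- and nothing across the comma.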