[EMAIL PROTECTED] wrote:

Yes, this issue has come up before with other choices of analyzers.
I think it should be fixable without changing any of the highlighter APIs - can you email me or post here the source to your analyzer?



Code attached - please don't make fun of it :) - it's very preliminary. I think it only uses one other file (TRQueue), also attached (but note: it's in a different package). Also, any comments in the code may be inaccurate. The general goal is as stated in my earlier mail; examples are:


AlphaBeta ->
Alpha (incr 0)
Beta (incr 0)
AlphaBeta (incr 1)

MAX_INT ->
MAX (incr 0)
INT (incr 0)
MAX_INT (incr 1)

thx,
Dave

Cheers
Mark

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]




package com.tropo.lucene;

import org.apache.lucene.analysis.*;
import java.io.*;
import java.util.*;
import com.tropo.util.*;
import java.util.regex.*;
/**
 * Try to parse javadoc better than other analyzers.
 */
public final class JavadocAnalyzer
        extends Analyzer
{

        // [A-Za-z0-9._]+
        // 
        /**
         * Build the token stream for one field: a JStream over the reader,
         * wrapped in a LowerCaseFilter.
         *
         * @param fieldName name of the field being analyzed; handed to JStream
         * @param reader    source of the field's text
         * @return the lower-casing stream over the JStream tokens
         */
        public final TokenStream tokenStream( String fieldName, Reader reader)
        {
                TokenStream rawTokens = new JStream( fieldName, reader);
                return new LowerCaseFilter( rawTokens);
        }

        /**
         * Break a token into sub-tokens that might be said to occur in the same
         * place as the token itself.
         *
         * <p>Two passes are made over <code>s</code>:
         * <ol>
         * <li><code>breakupPattern</code> collects simple runs (capitalised word,
         *     digits, all-caps run, all-lower-case run), e.g.
         *     "alphaBeta" -&gt; "alpha", "Beta" and "BIG_NUM" -&gt; "BIG", "NUM".</li>
         * <li><code>breakupPattern2</code> collects the harder shapes. For a run of
         *     capitals followed by a capitalised word only the capitalised word
         *     is kept ("XXAlpha" contributes "Alpha"); for letters followed by
         *     digits the whole match is kept.</li>
         * </ol>
         * A piece that is equal to <code>s</code> itself is never added, so a
         * token that cannot be split (e.g. "a") yields an empty list. The list
         * may contain the same piece more than once; callers that care must
         * de-duplicate.
         *
         * @param s the token to split
         * @return the pieces in the order they were found; never null
         */
        public static List breakup( String s)
        {
                List lis = new LinkedList();

                // simple runs
                Matcher m = breakupPattern.matcher( s);
                while ( m.find())
                {
                        String g = m.group();
                        if ( ! g.equals( s))
                                lis.add( g);
                }

                // hard ones
                m = breakupPattern2.matcher( s);
                while ( m.find())
                {
                        // Matcher.groupCount() reports how many groups the PATTERN
                        // declares, not how many took part in this match, so it
                        // cannot identify the XXFoo alternative. Group 2 is only
                        // non-null when that alternative matched.
                        String g = m.group( 2);
                        if ( g == null)
                                g = m.group();
                        if ( ! g.equals( s))
                                lis.add( g);
                }
                return lis;
        }


        /**
         * Tokenizer that reads identifier-like words (which may contain dots,
         * e.g. <code>java.util.HashMap</code>) and runs of digits from a reader.
         *
         * <p>For every word read, the full text is returned first with the
         * default position increment. The pieces produced by splitting on dots
         * and by {@link JavadocAnalyzer#breakup(String)} are queued and returned
         * by the following calls with position increment 0, so they are stacked
         * on the full token. All tokens derived from one word carry the start
         * and end offsets of the whole word.
         */
        private static class JStream
                extends TokenStream
        {
                /** Tokens already computed, handed out in FIFO order by next(). */
                private TRQueue q = new TRQueue();
                /** Sub-tokens already queued for the word currently being expanded. */
                private Set already = new HashSet();
                private String fieldName;
                private PushbackReader pb;

                private StringBuffer sb = new StringBuffer( 32);
                /** Number of characters consumed from pb, i.e. index of the next char. */
                private int offset;


                /**
                 * @param fieldName name of the field being tokenized (stored only)
                 * @param reader    text source; wrapped so one char can be pushed back
                 */
                private JStream( String fieldName, Reader reader)
                {
                        this.fieldName = fieldName;
                        pb = new PushbackReader( reader);
                }


                /**
                 * Return the next token, or null when the input is exhausted.
                 *
                 * @return the next token, or null at end of input
                 * @throws IOException if reading from the underlying reader fails
                 */
                public Token next()
                        throws IOException
                {
                        if ( q.size() > 0) // pre-calculated
                                return (Token) q.dequeue();

                        sb.setLength( 0);
                        int start = offset;
                        String type = "mystery";
                        // 0 = skipping until a token starts, 1 = inside an
                        // identifier, 2 = inside a number
                        int state = 0;
                        boolean done = false;
                        int c;

                        while ( ! done &&
                                        ( c = pb.read()) != -1)
                        {
                                char ch = (char) c;
                                int pos = offset++; // index of ch in the input
                                switch( state)
                                {
                                case 0:
                                        if ( Character.isJavaIdentifierStart( ch))
                                        {
                                                start = pos;
                                                sb.append( ch);
                                                state = 1;
                                                type = "id";
                                        }
                                        else if ( Character.isDigit( ch))
                                        {
                                                type = "number";
                                                start = pos;
                                                sb.append( ch);
                                                state = 2;
                                        }
                                        break;

                                case 1:
                                        // NOTE(review): a '.' is accepted anywhere after
                                        // the first char, so a sentence-ending period
                                        // stays attached to the word - confirm intended.
                                        if ( Character.isJavaIdentifierPart( ch) || ch == '.')
                                        {
                                                sb.append( ch);
                                        }
                                        else
                                        {
                                                pb.unread( c);
                                                offset--; // ch was not consumed after all
                                                done = true;
                                        }
                                        break;

                                case 2:
                                        if ( Character.isDigit( ch))
                                        {
                                                sb.append( ch);
                                        }
                                        else
                                        {
                                                pb.unread( c);
                                                offset--; // ch was not consumed after all
                                                done = true;
                                        }
                                        break;
                                }
                        }

                        if ( sb.length() == 0)
                                return null;

                        String s = sb.toString();
                        // offset counts consumed chars, so it is the exclusive end
                        // of the token whether the token was ended by a terminator
                        // (pushed back above) or by end of input.
                        int end = offset;
                        already.clear();

                        if ( s.indexOf( '.') > 0)
                        {
                                String[] result = s.split( "[\\.]+");
                                for ( int x = 0; x < result.length; x++)
                                {
                                        queueSubTokens( result[ x], start, end, type);
                                        Token part = new Token( result[ x], start, end, type);
                                        part.setPositionIncrement( 0);
                                        q.queue( part);
                                }
                        }
                        else
                        {
                                queueSubTokens( s, start, end, type);
                        }
                        // The full token goes out first and is the only one that
                        // advances the position; everything queued above has
                        // increment 0.
                        return new Token( s, start, end, type);
                }


                /**
                 * Queue, with position increment 0, each piece of text returned by
                 * breakup() that has not been queued since `already` was last cleared.
                 */
                private void queueSubTokens( String text, int start, int end, String type)
                {
                        Iterator it = breakup( text).iterator();
                        while ( it.hasNext())
                        {
                                String nt = (String) it.next();
                                if ( already.add( nt))
                                {
                                        Token t = new Token( nt, start, end, type);
                                        t.setPositionIncrement( 0);
                                        q.queue( t);
                                }
                        }
                }


                /**
                 * Close the underlying reader and drop the reference to it.
                 *
                 * @throws IOException if closing the reader fails
                 */
                public void close()
                        throws IOException
                {
                        pb.close();
                        pb = null;
                }

        }

        /**
         * Ad-hoc demo: prints a sample string, then prints the list that
         * breakup() returns for that whole string.
         *
         * @param a command-line arguments (unused)
         */
        public static void main( String[] a)
        {
                final String sample = "XXFoo JavaDoc Analyzis BIG_NUM 123 java.util.HashMap x2";
                o.println( "starting: " + sample);

                List pieces = breakup( sample);
                o.println( pieces);
        }

        static PrintStream o = System.out;
                
        static String p1 = "[A-Z][a-z]+";
        static String p2 = "[0-9]+";
        static String p3 = "[A-Z]+";
        static String p4 = "[a-z]+";

        // next set
        static String q1 = "([A-Z]+)([A-Z][a-z]+)";
        static String q2 = "([a-z]+[0-9]+)";
        static String q3 = "([A-Z]+[0-9]+)";    
        static Pattern breakupPattern;
        static Pattern breakupPattern2; 

        static
        {
                breakupPattern = Pattern.compile( "(" + p1 + ")" + "|" +
                                                                                  "(" 
+ p2 + ")" + "|" +
                                                                                  "(" 
+ p3 + ")" + "|" +
                                                                                  "(" 
+ p4 + ")");
                
                breakupPattern2 = Pattern.compile( q1  + "|" + q2 + "|" + q3);

        }
}
                

package com.tropo.util;

import java.util.*;

/**
 * A first-in, first-out queue of objects.
 * The collections package gives us LinkedList, which
 * can be a queue, but it doesn't have correct blocking
 * semantics on dequeue. That's what we offer.
 *
 * <p>Every method synchronizes on the queue instance. Only
 * {@link #safeQueue} wakes a thread blocked in {@link #safeDequeue};
 * {@link #queue} adds an element without notifying.
 * @see com.tropo.persist.PersistentQueue
 */
public class TRQueue
{
        /** Backing list: elements enter at the head and leave from the tail. */
        private final LinkedList ll = new LinkedList();

        /**
         * Create an empty queue.
         */
        public TRQueue()
        {
        }

        /**
         * Thread-safe queueing, wakes up safeDequeue too.
         * @param x the element to append
         * @see #safeDequeue
         */
        public synchronized void safeQueue( Object x)
        {
                ll.addFirst( x);
                notify();
        }

        /**
         * Append an element without waking any thread blocked in safeDequeue.
         * @param x the element to append
         */
        public synchronized void queue( Object x)
        {
                ll.addFirst( x);
        }

        /**
         * Blocking dequeue: waits until an element is available.
         * @return the oldest element, or null if the calling thread was
         *         interrupted while waiting; in that case the thread's
         *         interrupt status is left set so the caller can see it
         */
        public synchronized Object safeDequeue()
        {
                while ( ll.isEmpty())
                {
                        try
                        {
                                wait();
                        }
                        catch( InterruptedException ie)
                        {
                                // 3/2003 - let threads die. Restore the flag that
                                // wait() cleared so the caller can tell why it got
                                // null and shut down.
                                Thread.currentThread().interrupt();
                                return null;
                        }
                }
                return ll.removeLast();
        }

        /**
         * Non-blocking dequeue.
         * @return the oldest element, or null if the queue is empty
         */
        public synchronized Object dequeue()
        {
                if ( ll.isEmpty())
                        return null;
                return ll.removeLast();
        }


        /**
         * @return the number of elements currently queued
         */
        public synchronized int size()
        {
                return ll.size();
        }

}

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to