[EMAIL PROTECTED] wrote:
Yes, this issue has come up before with other choices of analyzers.
I think it should be fixable without changing any of the highlighter APIs - can you email me or post here the source to your analyzer?
Code attached - don't make fun of it please :) - very prelim. I think it only uses one other file, (TRQueue) also attached (but: note, it's in a different package). Also any comments in the code may be inaccurate. The general goal is as stated in my earlier mail, examples are:
AlphaBeta -> Alpha (incr 0) Beta (incr 0) AlphaBeta (incr 1)
MAX_INT -> MAX (incr 0) INT (incr 0) MAX_INT (incr 1)
thx, Dave
Cheers Mark
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
package com.tropo.lucene;
import org.apache.lucene.analysis.*;
import java.io.*;
import java.util.*;
import com.tropo.util.*;
import java.util.regex.*;
/**
 * Analyzer that tries to tokenize Javadoc/source text better than other
 * analyzers: identifiers (optionally dotted, e.g. <code>java.util.HashMap</code>)
 * and digit runs are emitted as tokens, and each identifier is additionally
 * broken into case/digit based sub-tokens (see {@link #breakup}) that share
 * the offsets of the token they came from. All tokens are lower-cased by a
 * {@link LowerCaseFilter} wrapped around the stream.
 */
public final class JavadocAnalyzer
    extends Analyzer
{
    /** Shorthand for standard output, used by {@link #main}. */
    static PrintStream o = System.out;

    // Simple fragments: each alternative is tried in this order at every position.
    static final String p1 = "[A-Z][a-z]+";   // capitalized word: "Beta"
    static final String p2 = "[0-9]+";        // digit run: "123"
    static final String p3 = "[A-Z]+";        // upper-case run: "BIG"
    static final String p4 = "[a-z]+";        // lower-case run: "alpha"

    // Harder fragments.
    static final String q1 = "([A-Z]+)([A-Z][a-z]+)"; // "XXFoo": group 2 is "Foo"
    static final String q2 = "([a-z]+[0-9]+)";        // "x2"
    static final String q3 = "([A-Z]+[0-9]+)";        // "X2"

    /** Matches any one of the simple fragments p1..p4. */
    static final Pattern breakupPattern = Pattern.compile(
        "(" + p1 + ")" + "|" + "(" + p2 + ")" + "|"
        + "(" + p3 + ")" + "|" + "(" + p4 + ")");

    /**
     * Matches any one of q1..q3. The pattern has four capturing groups in
     * total; groups 1 and 2 participate only when q1 is the alternative that
     * matched.
     */
    static final Pattern breakupPattern2 = Pattern.compile( q1 + "|" + q2 + "|" + q3);

    /**
     * Creates the token stream for a field: a {@link JStream} over the reader
     * wrapped in a {@link LowerCaseFilter}.
     *
     * @param fieldName name of the field being analyzed
     * @param reader    source of the text to tokenize
     * @return the lower-casing token stream
     */
    public final TokenStream tokenStream( String fieldName, Reader reader)
    {
        return new LowerCaseFilter( new JStream( fieldName, reader));
    }

    /**
     * Try to break up a token into subset/subtokens that might be said to
     * occur in the same place. Fragments equal to the whole input are never
     * reported. Examples:
     * <pre>
     *   "a"         -> []
     *   "alphaBeta" -> ["alpha", "Beta"]
     *   "BIG_NUM"   -> ["BIG", "NUM"]
     *   "x2"        -> ["x", "2"]
     *   "XXFoo"     -> ["XXF", "oo", "Foo"]
     * </pre>
     *
     * @param s the token text to split; must not be null
     * @return a list of String fragments in match order, simple fragments
     *         first; empty (never null) when nothing but the input itself
     *         matched. May contain duplicates.
     */
    public static List breakup( String s)
    {
        List lis = new LinkedList();

        // easy ones
        Matcher m = breakupPattern.matcher( s);
        while (m.find())
        {
            String g = m.group();
            if ( ! g.equals( s))
                lis.add( g);
        }

        // hard ones
        m = breakupPattern2.matcher( s);
        while (m.find())
        {
            // Matcher.groupCount() reports the number of groups in the
            // pattern, not in this match, so it cannot tell which alternative
            // matched. Group 2 is non-null exactly when q1 (the weird XXFoo
            // case) matched; then only the trailing capitalized word is kept.
            String tail = m.group( 2);
            String g = ( tail != null) ? tail : m.group();
            if ( ! g.equals( s))
                lis.add( g);
        }
        return lis;
    }

    /**
     * Token stream that scans identifiers and numbers out of a reader and
     * emits, for each one, the token itself plus the sub-tokens computed by
     * {@link JavadocAnalyzer#breakup}. Extra tokens are buffered in a queue
     * and handed out by subsequent calls to {@link #next}.
     */
    private static class JStream
        extends TokenStream
    {
        // Scanner states used by next().
        private static final int SCANNING = 0;   // skipping until a token starts
        private static final int IN_ID = 1;      // inside an identifier (dots allowed)
        private static final int IN_NUMBER = 2;  // inside a run of digits

        /** Tokens already computed but not yet returned. */
        private TRQueue q = new TRQueue();
        /** Sub-token texts already queued for the token currently being expanded. */
        private Set already = new HashSet();
        /** Stored but not read anywhere in this class. */
        private String fieldName;
        private PushbackReader pb;
        private StringBuffer sb = new StringBuffer( 32);
        /** Position bookkeeping for the reader; see next(). */
        private int offset;
        private int state = SCANNING;

        /**
         * @param fieldName name of the field being analyzed
         * @param reader    text source; wrapped in a one-char PushbackReader
         */
        private JStream( String fieldName, Reader reader)
        {
            this.fieldName = fieldName;
            pb = new PushbackReader( reader);
        }

        /**
         * Returns the next token, or null at end of input.
         *
         * Queued tokens from a previous call are drained first. Otherwise the
         * next identifier or number is scanned. For a plain token the token
         * itself is returned and its sub-tokens are queued with position
         * increment 0. For a dotted token the whole text is returned with
         * position increment 0, and for every dot-separated part its
         * sub-tokens (increment 0) and the part itself are queued; only the
         * last part keeps the Token default increment.
         * NOTE(review): returning the whole dotted token with increment 0
         * stacks it on the previously emitted token - confirm this is intended.
         *
         * @throws IOException if reading from the underlying reader fails
         */
        public Token next()
            throws IOException
        {
            if ( q.size() > 0) // pre-calculated
                return (Token) q.dequeue();

            int c;
            int start = offset;
            sb.setLength( 0);
            // Compensates for the increment done before each char is examined;
            // a char pushed back by the previous call is counted again here.
            offset--;
            boolean done = false;
            String type = "mystery";
            state = SCANNING;
            while ( ! done && ( c = pb.read()) != -1)
            {
                char ch = (char) c;
                offset++;
                switch( state)
                {
                    case SCANNING:
                        if ( Character.isJavaIdentifierStart( ch))
                        {
                            start = offset;
                            sb.append( ch);
                            state = IN_ID;
                            type = "id";
                        }
                        else if ( Character.isDigit( ch))
                        {
                            type = "number";
                            start = offset;
                            sb.append( ch);
                            state = IN_NUMBER;
                        }
                        break;
                    case IN_ID:
                        if ( Character.isJavaIdentifierPart( ch) || ch == '.')
                        {
                            sb.append( ch);
                        }
                        else
                        {
                            pb.unread( ch);
                            done = true;
                        }
                        break;
                    case IN_NUMBER:
                        if ( Character.isDigit( ch))
                        {
                            sb.append( ch);
                        }
                        else
                        {
                            pb.unread( c);
                            done = true;
                        }
                        break;
                }
            }

            String s = sb.toString();
            if ( s.length() == 0)
                return null;

            // Exclusive end offset. When a delimiter ended the token this
            // equals 'offset'; when end of input ended it, 'offset' is still
            // the index of the last char and would be one short.
            int end = start + s.length();

            already.clear();
            if ( s.indexOf( '.') > 0)
            {
                String[] parts = s.split( "[\\.]+");
                for (int x = 0; x < parts.length; x++)
                {
                    queueSubtokens( parts[ x], start, end, type);
                    Token part = new Token( parts[ x], start, end, type);
                    if ( x < parts.length - 1)
                        part.setPositionIncrement( 0);
                    q.queue( part);
                }
                Token whole = new Token( s, start, end, type);
                whole.setPositionIncrement( 0);
                return whole;
            }

            queueSubtokens( s, start, end, type);
            return new Token( s, start, end, type);
        }

        /**
         * Queues every not-yet-seen fragment of text (per breakup) as a token
         * with position increment 0 and the given offsets and type.
         */
        private void queueSubtokens( String text, int start, int end, String type)
        {
            Iterator it = breakup( text).iterator();
            while ( it.hasNext())
            {
                String nt = (String) it.next();
                if ( already.add( nt))
                {
                    Token t = new Token( nt, start, end, type);
                    t.setPositionIncrement( 0);
                    q.queue( t);
                }
            }
        }

        /**
         * Closes the underlying reader and drops the reference to it.
         *
         * @throws IOException if closing the reader fails
         */
        public void close()
            throws IOException
        {
            pb.close();
            pb = null;
        }
    }

    /**
     * Small manual check: prints a sample string and the result of running
     * {@link #breakup} over the whole string.
     */
    public static void main( String[] a)
    {
        String s = "XXFoo JavaDoc Analyzis BIG_NUM 123 java.util.HashMap x2";
        o.println( "starting: " + s);
        o.println( breakup( s));
    }
}
package com.tropo.util;
import java.util.*;
/**
 * The collections package gives us LinkedList, which
 * can be a queue, but it doesn't have correct blocking
 * semantics on dequeue. That's what we offer.
 * Elements come out in the order they were queued (FIFO). Every method
 * synchronizes on this queue object.
 * @see com.tropo.persist.PersistentQueue
 */
public class TRQueue
{
    /** Backing list: new elements go in at the head and are taken from the tail. */
    private LinkedList ll = new LinkedList();

    /**
     * Create an empty queue.
     */
    public TRQueue()
    {
    }

    /**
     * Thread-safe queueing, wakes up safeDequeue too.
     * @param x the element to append
     * @see #safeDequeue
     */
    public synchronized void safeQueue( Object x)
    {
        ll.addFirst( x);
        notify();
    }

    /**
     * Appends an element without notifying; a thread blocked in
     * {@link #safeDequeue} is not woken by this method.
     * @param x the element to append
     */
    public synchronized void queue( Object x)
    {
        ll.addFirst( x);
    }

    /**
     * Blocking dequeue. Waits until an element is available.
     * @return the oldest element, or null if the calling thread was
     *         interrupted while the queue was empty; in that case the
     *         thread's interrupt status is set again so callers can see it
     */
    public synchronized Object safeDequeue()
    {
        while ( ll.isEmpty())
        {
            try
            {
                wait();
            }
            catch( InterruptedException ie)
            {
                // 3/2003 - let threads die; keep the interrupt visible to the caller
                Thread.currentThread().interrupt();
                return null;
            }
        }
        return ll.removeLast();
    }

    /**
     * Non-blocking dequeue. Synchronized like every other accessor of the
     * backing list.
     * @return the oldest element, or null if the queue is empty
     */
    public synchronized Object dequeue()
    {
        return ll.isEmpty() ? null : ll.removeLast();
    }

    /**
     * @return the number of elements currently queued
     */
    public synchronized int size()
    {
        return ll.size();
    }
}
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
