Kashif Khadim wrote:

Hi ,

I have multiple Lucene indexes and want to know how I can
delete duplicates from them. I am using MultiSearcher to
search across these indexes. They contain duplicate
"urls"; any sample code or tool would be
a big help.

Here's some ancient code that I've used. Consider it a "fragment", i.e. it won't compile as-is because it needs a couple of other classes, but it should be obvious what's missing (e.g. DFields.URL is something like "url").



package com.tropo.lucene;

import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.standard.*;

import java.io.*;
import java.util.*;
import com.tropo.lang.*;

/**
 *
 */
public class DupUrls
        extends com.tropo.TRBase
{
        public static void main(String[] args)
                throws Throwable
        {
                boolean show = false;
                if ( args.length > 0)
                {
                        for ( int i = 0; i< args.length; i++)
                        {
                                if ( args[ i].equals( "-s"))
                                {
                                        show = true;
                                        continue;
                                }
                                if (show)
                                        showdups( args[ i]);
                                else
                                        dedup( args[ i]);                       
        
                        }
                        System.exit( 0);
                }
                String[] ar = new File( ".").list();
                for ( int ii = 0; ii < ar.length; ii++)
                {
                        if ( ar[ ii].endsWith( "_index") &&
                                 new File( ar[ ii]).isDirectory())
                        {
                                dedup( ar[ ii]);
                        }
                }
                System.exit( 0);                
        }

        /**
         *
         */
        private static void showdups( String foo)
                throws Throwable
        {
                IndexReader r = IndexReader.open( foo);
                Set already = new HashSet();
                int num = r.numDocs();
                for ( int i = 0; i < num; i++)
                {
                        if ( r.isDeleted( i)) continue;
                        Document d = r.document( i);
                        String u = d.get( DFields.URL);
                        if ( already.contains( u))
                                o.println( u);
                        else
                                already.add( u);
                }
                r.close();
        }

        /**
         *
         */
        private static void dedup( String foo)
                throws Throwable
        {
                IndexReader r = IndexReader.open( foo);
                int num = r.numDocs();
                o.println( foo + ": " + num);
                Set already = new HashSet();
                int nd = 0;
                int stale = 0;
                PrintStream ps = new PrintStream( new FileOutputStream( foo + 
".txt"));
                ps.println( "File: " + foo);
                ps.println( "Docs: " + nf.format( num));
                ps.println();
                for ( int i = 0; i < num; i++)
                {

                        if ( r.isDeleted( i))
                        {
                                stale++;
                                continue;
                        }
                        Document d = r.document( i);                    
                        String u = d.get( DFields.URL);
                        if ( already.contains( u))
                        {
                                r.delete( i);
                                nd++;
                                ps.println( u);
                                if ( false)
                                {
                                        Enumeration e = d.fields();
                                        while ( e.hasMoreElements())
                                        {
                                                Field z = (Field) 
e.nextElement();
                                                ps.println( "\t\t\tname=" + z.name() + 
" sv=" +
                                                                        "\"" + z.toString() + 
"\"" +
                                                                        "\"" + z.stringValue() 
+ "\"" +
                                                                        " 
stored=" +
                                                                        z.isStored() + 
"/ indexed=" +
                                                                        z.isIndexed() + 
"/ tokenized=" +
                                                                        
z.isTokenized());
                                        }
                                        ps.println();
                                }
                        }
                        else
                                already.add( u);
                }
                if ( nd > 0)
                        o.println( "\t\tdup=" + nf.format( nd));
                if ( stale > 0)
                        o.println( "\t\tstale=" + nf.format( stale));
                ps.close();
                r.close();
                o.println( "before opt");
IndexWriter writer = new IndexWriter( foo, IndexBase.getAnalyzer(), false);
                writer.optimize();
                o.println( "after opt");
                writer.close();
                o.println( "after close");            
        }


        static PrintStream o = System.out;


}



Thanks,
Kashif.



__________________________________________________
Do You Yahoo!?
Tired of spam? Yahoo! Mail has the best spam protection around http://mail.yahoo.com
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]



---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to