Update of /cvsroot/nutch/nutch/src/java/net/nutch/tools In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29999
Modified Files: SegmentMergeTool.java Log Message: Bugfixes and enhancements related to dealing with corrupted segments. This version should correctly process segment directories with the following errors: * partial output due to a crashed or interrupted fetcher. If this condition occurs, the tool tries to use as many valid entries as possible, skipping all corrupted entries. * invalid segment data - possibly due to a crash, or a directory not containing any segment data. In this case, such a directory is skipped, and all data in it is ignored. Thanks to Jason Boss for his perseverance in testing various patches on his collection of real-life segment data. Index: SegmentMergeTool.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/tools/SegmentMergeTool.java,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** SegmentMergeTool.java 4 Oct 2004 15:42:58 -0000 1.5 --- SegmentMergeTool.java 6 Oct 2004 22:27:25 -0000 1.6 *************** *** 4,13 **** --- 4,19 ---- package net.nutch.tools; + import java.io.EOFException; import java.io.File; import java.io.FileFilter; import java.text.SimpleDateFormat; + import java.util.ArrayList; + import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.Iterator; + import java.util.List; + import java.util.ListIterator; + import java.util.logging.Level; import java.util.logging.Logger; *************** *** 60,64 **** private String segments = null; private String output = null; ! private File[] segdirs = null; private boolean runIndexer = false; private boolean createMaster = false; --- 66,71 ---- private String segments = null; private String output = null; ! private List segdirs = null; ! private List allsegdirs = null; private boolean runIndexer = false; private boolean createMaster = false; *************** *** 81,89 **** // count the number of valid entries. 
// XXX We assume that all other data files contain the ! // XXX same number of valid entries - which may not be ! // XXX true if Fetcher crashed in the middle of update... Writable w = new FetcherOutput(); ! while (fetcherReader.next(w) != null) ! size++; // reposition to the start fetcherReader.reset(); --- 88,104 ---- // count the number of valid entries. // XXX We assume that all other data files contain the ! // XXX same number of valid entries - which is not always ! // XXX true if Fetcher crashed in the middle of update. ! // XXX We compensate for this later, when actually ! // XXX reading the entries. Writable w = new FetcherOutput(); ! try { ! while (fetcherReader.next(w) != null) ! size++; ! } catch (EOFException eof) { ! // the file is truncated - probably due to a crashed fetcher. ! // Use just the part that we can... ! LOG.warning(" - segment " + dir + " is corrupt, using only " + size + " entries."); ! } // reposition to the start fetcherReader.reset(); *************** *** 126,130 **** File segs = new File(segments); if (!segs.exists() || !segs.isDirectory()) throw new Exception("Not a segments dir: " + segs); ! segdirs = segs.listFiles(new FileFilter() { public boolean accept(File file) { if (file.isDirectory()) return true; --- 141,145 ---- File segs = new File(segments); if (!segs.exists() || !segs.isDirectory()) throw new Exception("Not a segments dir: " + segs); ! File[] dirs = segs.listFiles(new FileFilter() { public boolean accept(File file) { if (file.isDirectory()) return true; *************** *** 132,135 **** --- 147,151 ---- } }); + allsegdirs = Arrays.asList(dirs); this.output = output; } *************** *** 143,156 **** public void run() { try { // open all segments long total = 0L; ! for (int i = 0; i < segdirs.length; i++) { ! SegmentReader sr = new SegmentReader(segdirs[i]); total += sr.size; ! LOG.info("Segment " + segdirs[i].getName() + ": " + sr.size + " entries."); ! readers.put(segdirs[i].getName(), sr); } ! 
LOG.info("TOTAL " + total + " input entries in " + segdirs.length + " segments."); File masterDir = null; if (master != null) masterDir = new File(master); --- 159,182 ---- public void run() { try { + segdirs = new ArrayList(); // open all segments long total = 0L; ! for (int i = 0; i < allsegdirs.size(); i++) { ! File dir = (File)allsegdirs.get(i); ! SegmentReader sr = null; ! try { ! sr = new SegmentReader(dir); ! } catch (Exception e) { ! // this segment is hosed, don't use it ! LOG.warning(" - segment " + dir + " is corrupt, skipping all entries."); ! continue; ! } ! segdirs.add(dir); total += sr.size; ! LOG.info("Segment " + dir.getName() + ": " + sr.size + " entries."); ! readers.put(dir.getName(), sr); } ! LOG.info("TOTAL " + total + " input entries in " + segdirs.size() + " segments."); File masterDir = null; if (master != null) masterDir = new File(master); *************** *** 168,177 **** LOG.info("Creating master unique index..."); // check that all segment indexes exist. If not, create them. ! for (int i = 0; i < segdirs.length; i++) { ! File indexerDone = new File(segdirs[i], IndexSegment.DONE_NAME); if (!indexerDone.exists()) { // Index this segment ! LOG.info(" - creating missing index for " + segdirs[i].getName()); ! IndexSegment.main(new String[]{segdirs[i].toString()}); } } --- 194,204 ---- LOG.info("Creating master unique index..."); // check that all segment indexes exist. If not, create them. ! for (int i = 0; i < segdirs.size(); i++) { ! File dir = (File)segdirs.get(i); ! File indexerDone = new File(dir, IndexSegment.DONE_NAME); if (!indexerDone.exists()) { // Index this segment ! LOG.info(" - creating missing index for " + dir.getName()); ! IndexSegment.main(new String[]{dir.toString()}); } } *************** *** 179,186 **** DeleteDuplicates.main(new String[]{segments, new File(segments).getParentFile().toString()}); LOG.info(" - creating merged index in " + masterDir); ! 
String[] args = new String[segdirs.length + 1]; args[0] = masterDir.toString(); ! for (int i = 0; i < segdirs.length; i++) { ! args[i + 1] = segdirs[i].toString(); } IndexMerger.main(args); --- 206,213 ---- DeleteDuplicates.main(new String[]{segments, new File(segments).getParentFile().toString()}); LOG.info(" - creating merged index in " + masterDir); ! String[] args = new String[segdirs.size() + 1]; args[0] = masterDir.toString(); ! for (int i = 0; i < segdirs.size(); i++) { ! args[i + 1] = ((File)segdirs.get(i)).toString(); } IndexMerger.main(args); *************** *** 232,240 **** continue; } ! // get data from the reader ! sr.fetcherReader.get(docid, fo); ! sr.contentReader.get(docid, co); ! sr.parseTextReader.get(docid, pt); ! sr.parseDataReader.get(docid, pd); // write it back fetcherWriter.append(fo); --- 259,274 ---- continue; } ! try { ! // get data from the reader ! sr.fetcherReader.get(docid, fo); ! sr.contentReader.get(docid, co); ! sr.parseTextReader.get(docid, pt); ! sr.parseDataReader.get(docid, pd); ! } catch (Throwable t) { ! // don't break the loop, because only one of the segments ! // may be corrupted... ! LOG.fine(" - corrupt entry no. " + docid + " in segment " + segName + " - skipping."); ! continue; ! } // write it back fetcherWriter.append(fo); *************** *** 258,264 **** } if (delSegs) { LOG.info("Deleting old segments..."); ! for (int i = 0; i < segdirs.length; i++) { ! FileUtil.fullyDelete(segdirs[i]); } } --- 292,300 ---- } if (delSegs) { + // This deletes also all corrupt segments, which are + // unusable anyway LOG.info("Deleting old segments..."); ! for (int i = 0; i < allsegdirs.size(); i++) { ! FileUtil.fullyDelete((File)allsegdirs.get(i)); } } ------------------------------------------------------- This SF.net email is sponsored by: IT Product Guide on ITManagersJournal Use IT products in your business? Tell us what you think of them. Give us Your Opinions, Get Free ThinkGeek Gift Certificates! 
Click to find out more http://productguide.itmanagersjournal.com/guidepromo.tmpl _______________________________________________ Nutch-cvs mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-cvs