Update of /cvsroot/nutch/nutch/src/java/net/nutch/io
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9402
Modified Files: BufferedRandomAccessFile.java MapFile.java SequenceFile.java Log Message: Add configurable buffer size to RandomAccessFile. Use this in SequenceFile (and automatically in all subclasses). Add a static method to MapFile for fixing broken MapFiles. Index: MapFile.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/MapFile.java,v retrieving revision 1.23 retrieving revision 1.24 diff -C2 -d -r1.23 -r1.24 *** MapFile.java 20 Aug 2004 20:36:11 -0000 1.23 --- MapFile.java 5 Nov 2004 15:02:13 -0000 1.24 *************** *** 341,344 **** --- 341,402 ---- } + /** + * This method attempts to fix a corrupt MapFile by re-creating its index. + * @param nfs filesystem + * @param dir directory containing the MapFile data and index + * @param keyClass key class (has to be a subclass of Writable) + * @param valueClass value class (has to be a subclass of Writable) + * @param dryrun do not perform any changes, just report what needs to be done + * @return number of valid entries in this MapFile, or -1 if no fixing was needed + * @throws Exception + */ + public static long fix(NutchFileSystem nfs, File dir, + Class keyClass, Class valueClass, boolean dryrun) throws Exception { + String dr = (dryrun ? "[DRY RUN ] " : ""); + File data = new File(dir, DATA_FILE_NAME); + File index = new File(dir, INDEX_FILE_NAME); + int indexInterval = 128; + if (!nfs.exists(data)) { + // there's nothing we can do to fix this! 
+ throw new Exception(dr + "Missing data file in " + dir + ", impossible to fix this."); + } + if (nfs.exists(index)) { + // no fixing needed + return -1; + } + SequenceFile.Reader dataReader = new SequenceFile.Reader(nfs, data.toString()); + if (!dataReader.getKeyClass().equals(keyClass)) { + throw new Exception(dr + "Wrong key class in " + dir + ", expected" + keyClass.getName() + + ", got " + dataReader.getKeyClass().getName()); + } + if (!dataReader.getValueClass().equals(valueClass)) { + throw new Exception(dr + "Wrong value class in " + dir + ", expected" + valueClass.getName() + + ", got " + dataReader.getValueClass().getName()); + } + long cnt = 0L; + Writable key = (Writable)keyClass.getConstructor(new Class[0]).newInstance(new Object[0]); + Writable value = (Writable)valueClass.getConstructor(new Class[0]).newInstance(new Object[0]); + SequenceFile.Writer indexWriter = null; + if (!dryrun) indexWriter = new SequenceFile.Writer(nfs, index.toString(), keyClass, LongWritable.class); + try { + long pos = 0L; + LongWritable position = new LongWritable(); + while(dataReader.next(key, value)) { + cnt++; + if (cnt % indexInterval == 0) { + position.set(pos); + if (!dryrun) indexWriter.append(key, position); + } + pos = dataReader.getPosition(); + } + } catch(Throwable t) { + // truncated data file. swallow it. 
+ } + dataReader.close(); + if (!dryrun) indexWriter.close(); + return cnt; + } + + public static void main(String[] args) throws Exception { String usage = "Usage: MapFile inFile outFile"; Index: BufferedRandomAccessFile.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/BufferedRandomAccessFile.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** BufferedRandomAccessFile.java 20 Aug 2004 20:36:11 -0000 1.1 --- BufferedRandomAccessFile.java 5 Nov 2004 15:02:13 -0000 1.2 *************** *** 359,363 **** public BufferedRandomAccessFile(NutchFileSystem nfs, String file, boolean isReadOnly) throws IOException { ! this(nfs, file, 4096, isReadOnly); } --- 359,363 ---- public BufferedRandomAccessFile(NutchFileSystem nfs, String file, boolean isReadOnly) throws IOException { ! this(nfs, file, -1, isReadOnly); } *************** *** 365,368 **** --- 365,369 ---- boolean isReadOnly) throws IOException { super(nfs, file, isReadOnly ? "r" : "rw"); + if (bufLen == -1) bufLen = NutchConf.getInt("io.raf.buffer.size", 4096); this.buf = new byte[bufLen]; this.isReadOnly = isReadOnly; Index: SequenceFile.java =================================================================== RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/SequenceFile.java,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** SequenceFile.java 20 Aug 2004 20:36:11 -0000 1.26 --- SequenceFile.java 5 Nov 2004 15:02:13 -0000 1.27 *************** *** 131,135 **** /** Open the named file. */ public Reader(NutchFileSystem nfs, String file) throws IOException { ! this(nfs, file, 4096); } --- 131,135 ---- /** Open the named file. */ public Reader(NutchFileSystem nfs, String file) throws IOException { ! 
this(nfs, file, -1); }

-------------------------------------------------------
This SF.Net email is sponsored by: Sybase ASE Linux Express Edition - download now for FREE
LinuxWorld Reader's Choice Award Winner for best database on Linux.
http://ads.osdn.com/?ad_id=5588&alloc_id=12065&op=click
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs