Update of /cvsroot/nutch/nutch/src/java/net/nutch/io
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9402

Modified Files:
        BufferedRandomAccessFile.java MapFile.java SequenceFile.java 
Log Message:
Add a configurable buffer size to BufferedRandomAccessFile (property
"io.raf.buffer.size", default 4096). Use this in SequenceFile
(and automatically in all subclasses).

Add a static method to MapFile for fixing broken MapFiles.



Index: MapFile.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/MapFile.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** MapFile.java        20 Aug 2004 20:36:11 -0000      1.23
--- MapFile.java        5 Nov 2004 15:02:13 -0000       1.24
***************
*** 341,344 ****
--- 341,402 ----
    }
  
+   /**
+    * This method attempts to fix a corrupt MapFile by re-creating its index.
+    * @param nfs filesystem
+    * @param dir directory containing the MapFile data and index
+    * @param keyClass key class (has to be a subclass of Writable)
+    * @param valueClass value class (has to be a subclass of Writable)
+    * @param dryrun do not perform any changes, just report what needs to be done
+    * @return number of valid entries in this MapFile, or -1 if no fixing was needed
+    * @throws Exception
+    */
+   public static long fix(NutchFileSystem nfs, File dir,
+           Class keyClass, Class valueClass, boolean dryrun) throws Exception {
+     String dr = (dryrun ? "[DRY RUN ] " : "");
+     File data = new File(dir, DATA_FILE_NAME);
+     File index = new File(dir, INDEX_FILE_NAME);
+     int indexInterval = 128;
+     if (!nfs.exists(data)) {
+       // there's nothing we can do to fix this!
+       throw new Exception(dr + "Missing data file in " + dir + ", impossible to fix this.");
+     }
+     if (nfs.exists(index)) {
+       // no fixing needed
+       return -1;
+     }
+     SequenceFile.Reader dataReader = new SequenceFile.Reader(nfs, data.toString());
+     if (!dataReader.getKeyClass().equals(keyClass)) {
+       throw new Exception(dr + "Wrong key class in " + dir + ", expected" + keyClass.getName() +
+               ", got " + dataReader.getKeyClass().getName());
+     }
+     if (!dataReader.getValueClass().equals(valueClass)) {
+       throw new Exception(dr + "Wrong value class in " + dir + ", expected" + valueClass.getName() +
+               ", got " + dataReader.getValueClass().getName());
+     }
+     long cnt = 0L;
+     Writable key = (Writable)keyClass.getConstructor(new Class[0]).newInstance(new Object[0]);
+     Writable value = (Writable)valueClass.getConstructor(new Class[0]).newInstance(new Object[0]);
+     SequenceFile.Writer indexWriter = null;
+     if (!dryrun) indexWriter = new SequenceFile.Writer(nfs, index.toString(), keyClass, LongWritable.class);
+     try {
+       long pos = 0L;
+       LongWritable position = new LongWritable();
+       while(dataReader.next(key, value)) {
+         cnt++;
+         if (cnt % indexInterval == 0) {
+           position.set(pos);
+           if (!dryrun) indexWriter.append(key, position);
+         }
+         pos = dataReader.getPosition();
+       }
+     } catch(Throwable t) {
+       // truncated data file. swallow it.
+     }
+     dataReader.close();
+     if (!dryrun) indexWriter.close();
+     return cnt;
+   }
+ 
+ 
    public static void main(String[] args) throws Exception {
      String usage = "Usage: MapFile inFile outFile";

Index: BufferedRandomAccessFile.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/BufferedRandomAccessFile.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** BufferedRandomAccessFile.java       20 Aug 2004 20:36:11 -0000      1.1
--- BufferedRandomAccessFile.java       5 Nov 2004 15:02:13 -0000       1.2
***************
*** 359,363 ****
      
      public BufferedRandomAccessFile(NutchFileSystem nfs, String file, boolean isReadOnly) throws IOException {
!         this(nfs, file, 4096, isReadOnly);
      }
  
--- 359,363 ----
      
      public BufferedRandomAccessFile(NutchFileSystem nfs, String file, boolean isReadOnly) throws IOException {
!         this(nfs, file, -1, isReadOnly);
      }
  
***************
*** 365,368 ****
--- 365,369 ----
                                      boolean isReadOnly) throws IOException {
          super(nfs, file, isReadOnly ? "r" : "rw");
+         if (bufLen == -1) bufLen = NutchConf.getInt("io.raf.buffer.size", 4096);
          this.buf = new byte[bufLen];
          this.isReadOnly = isReadOnly;

Index: SequenceFile.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/io/SequenceFile.java,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** SequenceFile.java   20 Aug 2004 20:36:11 -0000      1.26
--- SequenceFile.java   5 Nov 2004 15:02:13 -0000       1.27
***************
*** 131,135 ****
      /** Open the named file. */
      public Reader(NutchFileSystem nfs, String file) throws IOException {
!       this(nfs, file, 4096);
      }
  
--- 131,135 ----
      /** Open the named file. */
      public Reader(NutchFileSystem nfs, String file) throws IOException {
!       this(nfs, file, -1);
      }
  



-------------------------------------------------------
This SF.Net email is sponsored by:
Sybase ASE Linux Express Edition - download now for FREE
LinuxWorld Reader's Choice Award Winner for best database on Linux.
http://ads.osdn.com/?ad_id=5588&alloc_id=12065&op=click
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to