Update of /cvsroot/nutch/nutch/src/test/net/nutch/tools
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3470
Modified Files:
	TestSegmentMergeTool.java
Log Message:
Slightly corrected version of JUnit test for SegmentMergeTool.

Index: TestSegmentMergeTool.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/test/net/nutch/tools/TestSegmentMergeTool.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** TestSegmentMergeTool.java	6 Oct 2004 23:05:56 -0000	1.3
--- TestSegmentMergeTool.java	14 Nov 2004 21:32:41 -0000	1.4
***************
*** 6,9 ****
--- 6,10 ----
  import java.io.*;
  import java.util.Properties;
+ import java.util.Random;
  
  import net.nutch.db.Page;
***************
*** 11,14 ****
--- 12,17 ----
  import net.nutch.io.ArrayFile;
  import net.nutch.io.MD5Hash;
+ import net.nutch.segment.SegmentReader;
+ import net.nutch.segment.SegmentWriter;
  import net.nutch.util.*;
  import net.nutch.pagedb.FetchListEntry;
***************
*** 23,31 ****
  public class TestSegmentMergeTool extends TestCase {
  
!   private static final int SEGMENT_CNT = 5;
  
!   private static final int PAGE_CNT = 200;
  
!   private File testDir = null;
  
    public TestSegmentMergeTool(String name) {
--- 26,34 ----
  public class TestSegmentMergeTool extends TestCase {
  
!   protected static final int SEGMENT_CNT = 10;
  
!   protected static final int PAGE_CNT = 500;
  
!   protected File testDir = null;
  
    public TestSegmentMergeTool(String name) {
***************
*** 46,51 ****
  
    /**
!    * Create test segment data. NOTE: if segment format changes you need to
!    * modify the way segment data is created here (e.g. add more writers).
     * 
     * @param dir segment directory
--- 49,53 ----
  
    /**
!    * Create test segment data.
     * 
     * @param dir segment directory
***************
*** 55,72 ****
     */
    protected void createSegmentData(NutchFileSystem nfs, File dir, boolean unique) throws Exception {
!     // Each segment consists of:
!     //  - FetcherOutput
!     //  - Content
!     //  - ParseData
!     //  - ParseText
!     // Create writers for these data files
!     ArrayFile.Writer fetcherWriter = new ArrayFile.Writer(nfs, new File(dir, FetcherOutput.DIR_NAME).toString(),
!         FetcherOutput.class);
!     ArrayFile.Writer contentWriter = new ArrayFile.Writer(nfs, new File(dir, Content.DIR_NAME).toString(),
!         Content.class);
!     ArrayFile.Writer parseDataWriter = new ArrayFile.Writer(nfs, new File(dir, ParseData.DIR_NAME).toString(),
!         ParseData.class);
!     ArrayFile.Writer parseTextWriter = new ArrayFile.Writer(nfs, new File(dir, ParseText.DIR_NAME).toString(),
!         ParseText.class);
      for (int i = 0; i < PAGE_CNT; i++) {
        String url = "http://www.example.com/page-" + i;
--- 57,62 ----
     */
    protected void createSegmentData(NutchFileSystem nfs, File dir, boolean unique) throws Exception {
!     SegmentWriter sw = new SegmentWriter(nfs, dir, true);
!     Random r = new Random(System.currentTimeMillis());
      for (int i = 0; i < PAGE_CNT; i++) {
        String url = "http://www.example.com/page-" + i;
***************
*** 79,107 ****
        FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f, 1.0f), new String[] { "test" + rnd });
        FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
!       fetcherWriter.append(fo);
!       String content = "<html><body><h1>Hello from Page " + i + "</h1>";
        if (unique) {
!         content += "<p>Created at epoch time: " + System.currentTimeMillis() + "</p>";
        }
!       content += "</body></html>";
        Properties meta = new Properties();
        meta.setProperty("Content-Type", "text/html");
        meta.setProperty("Host", "http://localhost");
        meta.setProperty("Connection", "Keep-alive, close");
!       Content c = new Content(url, "http://www.example.com", content.getBytes("UTF-8"), "text/html", meta);
!       contentWriter.append(c);
        ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta);
!       parseDataWriter.append(pd);
!       String text = "Hello from Page" + i;
        if (unique) {
!         text += "\nCreated at epoch time: " + System.currentTimeMillis();
        }
!       ParseText pt = new ParseText(text);
!       parseTextWriter.append(pt);
      }
!     fetcherWriter.close();
!     contentWriter.close();
!     parseDataWriter.close();
!     parseTextWriter.close();
    }
--- 69,97 ----
        FetchListEntry fle = new FetchListEntry(true, new Page(url, 1.0f, 1.0f), new String[] { "test" + rnd });
        FetcherOutput fo = new FetcherOutput(fle, MD5Hash.digest(url), FetcherOutput.SUCCESS);
!       StringBuffer content = new StringBuffer("<html><body><h1>Hello from Page " + i + "</h1>");
        if (unique) {
!         content.append("<p>Created at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong() + "</p>");
        }
!       for (int k = 0; k < 10; k++) {
!         content.append("<p>" + k + " lines of text in the queue, " + k + " lines of text...</p>\n");
!       }
!       content.append("</body></html>");
        Properties meta = new Properties();
        meta.setProperty("Content-Type", "text/html");
        meta.setProperty("Host", "http://localhost");
        meta.setProperty("Connection", "Keep-alive, close");
!       Content co = new Content(url, "http://www.example.com", content.toString().getBytes("UTF-8"), "text/html", meta);
        ParseData pd = new ParseData("Hello from Page " + i, new Outlink[0], meta);
!       StringBuffer text = new StringBuffer("Hello from Page" + i);
        if (unique) {
!         text.append("\nCreated at epoch time: " + System.currentTimeMillis() + ", " + r.nextLong());
        }
!       for (int k = 0; k < 10; k++) {
!         text.append(k + " lines of text in the queue, " + k + " lines of text...\n");
!       }
!       ParseText pt = new ParseText(text.toString());
!       sw.append(fo, co, pt, pd);
      }
!     sw.close();
    }
***************
*** 114,118 ****
      NutchFileSystem nfs = new LocalFileSystem();
      try {
-       //System.out.println(".tearDown() entered.");
        super.tearDown();
        try {
--- 104,107 ----
      NutchFileSystem nfs = new LocalFileSystem();
      try {
        super.tearDown();
        try {
***************
*** 121,125 ****
          System.out.println("NON-FATAL: " + e.getMessage());
        }
-       //System.out.println(".tearDown() finished.");
      } finally {
        nfs.close();
--- 110,113 ----
          System.out.println("NON-FATAL: " + e.getMessage());
        }
      } finally {
        nfs.close();
***************
*** 144,155 ****
          createSegmentData(nfs, f, true);
        }
!       SegmentMergeTool
!           .main(new String[] { dataDir.toString(), "-o", outSegment.toString(), "-cm", "-i", "-ds", "-dm" });
!       SegmentMergeTool.SegmentReader sr = new SegmentMergeTool.SegmentReader(outSegment.listFiles()[0]);
!       assertEquals(sr.size, SEGMENT_CNT * PAGE_CNT);
        sr.close();
!     } catch (Exception e) {
        e.printStackTrace();
!       fail(e.getMessage());
      }
    } finally {
--- 132,142 ----
          createSegmentData(nfs, f, true);
        }
!       runTool(dataDir, outSegment);
!       SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
!       assertEquals(SEGMENT_CNT * PAGE_CNT, sr.size);
        sr.close();
!     } catch (Throwable e) {
        e.printStackTrace();
!       fail(e.getMessage() + ", " + e.getStackTrace());
      }
    } finally {
***************
*** 157,160 ****
--- 144,153 ----
      }
    }
+ 
+   protected void runTool(File dataDir, File outSegment) throws Exception {
+     SegmentMergeTool.main(
+         new String[] {"-dir", dataDir.toString(), "-o", outSegment.toString(),
+          "-ds"});
+   }
  
    /**
***************
*** 175,182 ****
          createSegmentData(nfs, f, false);
        }
!       SegmentMergeTool
!           .main(new String[] { dataDir.toString(), "-o", outSegment.toString(), "-cm", "-i", "-ds", "-dm" });
!       SegmentMergeTool.SegmentReader sr = new SegmentMergeTool.SegmentReader(outSegment.listFiles()[0]);
!       assertEquals(sr.size, PAGE_CNT);
        sr.close();
      } catch (Exception e) {
--- 168,174 ----
          createSegmentData(nfs, f, false);
        }
!       runTool(dataDir, outSegment);
!       SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
!       assertEquals(PAGE_CNT, sr.size);
        sr.close();
      } catch (Exception e) {
***************
*** 202,210 ****
          nfs.mkdirs(f);
          createSegmentData(nfs, f, true);
          switch (i) {
            case 0:
!             // truncate the
!             // fetcherOutput data
!             // file
              File data = new File(f, FetcherOutput.DIR_NAME);
              data = new File(data, "data");
--- 194,201 ----
          nfs.mkdirs(f);
          createSegmentData(nfs, f, true);
+         // corrupt some segments in various ways... be creative :-)
          switch (i) {
            case 0:
!             // truncate the fetcherOutput data file
              File data = new File(f, FetcherOutput.DIR_NAME);
              data = new File(data, "data");
***************
*** 214,219 ****
              break;
            case 1:
!             // truncate the Content
!             // data file
              data = new File(f, Content.DIR_NAME);
              data = new File(data, "data");
--- 205,209 ----
              break;
            case 1:
!             // truncate the Content data file
              data = new File(f, Content.DIR_NAME);
              data = new File(data, "data");
***************
*** 229,232 ****
--- 219,235 ----
              new File(data, "index").delete();
              break;
+           case 3:
+             // remove the "index" files - this is a very typical symptom for
+             // segments created by a crashed fetcher process. Such segments should
+             // be automatically fixed and recovered.
+             data = new File(f, FetcherOutput.DIR_NAME);
+             new File(data, "index").delete();
+             data = new File(f, Content.DIR_NAME);
+             new File(data, "index").delete();
+             data = new File(f, ParseData.DIR_NAME);
+             new File(data, "index").delete();
+             data = new File(f, ParseText.DIR_NAME);
+             new File(data, "index").delete();
+             break;
            default:
              // do nothing
***************
*** 234,240 ****
          }
        }
!       SegmentMergeTool
!           .main(new String[] { dataDir.toString(), "-o", outSegment.toString(), "-cm", "-i", "-ds", "-dm" });
!       SegmentMergeTool.SegmentReader sr = new SegmentMergeTool.SegmentReader(outSegment.listFiles()[0]);
        // we arrive at this expression as follows:
        // 1. SEGMENT_CNT - 1 : because we trash one whole segment
--- 237,242 ----
          }
        }
!       runTool(dataDir, outSegment);
!       SegmentReader sr = new SegmentReader(outSegment.listFiles()[0]);
        // we arrive at this expression as follows:
        // 1. SEGMENT_CNT - 1 : because we trash one whole segment
***************
*** 243,247 ****
        // 3. + 2: because sometimes truncation falls on
        //    the boundary of the last entry
!       int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2;
        assertTrue(sr.size < maxCnt);
        sr.close();
--- 245,250 ----
        // 3. + 2: because sometimes truncation falls on
        //    the boundary of the last entry
!       int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2 * (SEGMENT_CNT -1);
!       //System.out.println("maxCnt=" + maxCnt + ", sr.size=" + sr.size);
        assertTrue(sr.size < maxCnt);
        sr.close();
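For readers following the maxCnt comment in the last hunk, here is a small standalone sketch (not part of the commit) of how the new upper bound works out with the constants introduced in revision 1.4; the class name is made up for illustration, and the rationale comments are an interpretation of the test's own notes rather than anything stated in the diff.

public class MaxCntCheck {
  public static void main(String[] args) {
    int SEGMENT_CNT = 10;  // value set in revision 1.4
    int PAGE_CNT = 500;    // value set in revision 1.4
    // 1. PAGE_CNT * (SEGMENT_CNT - 1): one whole segment is trashed
    // 2. - 2 * PAGE_CNT / 4: presumably the two segments whose data files are truncated
    // 3. + 2 * (SEGMENT_CNT - 1): slack for truncation landing on an entry boundary
    int maxCnt = PAGE_CNT * (SEGMENT_CNT - 1) - 2 * PAGE_CNT / 4 + 2 * (SEGMENT_CNT - 1);
    System.out.println(maxCnt);  // 4500 - 250 + 18 = 4268
  }
}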