Author: lewismc Date: Sat Sep 15 16:16:48 2012 New Revision: 1385103 URL: http://svn.apache.org/viewvc?rev=1385103&view=rev Log: NUTCH-1162 Write JUnit tests for parse-js
Added: nutch/branches/2.x/src/plugin/parse-js/sample/ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html nutch/branches/2.x/src/plugin/parse-js/src/test/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Removed: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/package.html Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/conf/nutch-default.xml nutch/branches/2.x/src/plugin/build.xml nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java nutch/branches/2.x/src/plugin/parse-js/build.xml nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Sat Sep 15 16:16:48 2012 @@ -2,6 +2,12 @@ Nutch Change Log Release 2.1 - Current Development +* NUTCH-1162 Write JUnit tests for parse-js (lewismc) + +* NUTCH-1161 Write JUnit tests for microformats-reltag plugin (lewismc) + +* NUTCH-1160 Write JUnit tests for index-basic (lewismc) + * NUTCH-1456 Updater not setting batchId in markers correctly. (Alexander Kingson via ferdy) * NUTCH-1459 Remove dead code (phase2) from InjectorJob (ferdy) Modified: nutch/branches/2.x/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/conf/nutch-default.xml (original) +++ nutch/branches/2.x/conf/nutch-default.xml Sat Sep 15 16:16:48 2012 @@ -749,6 +749,8 @@ effect.</description> </property> +<!-- BasicIndexingfilter plugin properties --> + <property> <name>indexer.max.title.length</name> <value>100</value> Modified: nutch/branches/2.x/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/build.xml (original) +++ nutch/branches/2.x/src/plugin/build.xml Sat Sep 15 16:16:48 2012 @@ -74,6 +74,7 @@ <ant dir="parse-tika" target="test"/> <ant dir="protocol-file" target="test"/> <ant dir="parse-html" target="test"/> + <ant dir="parse-js" target="test"/> <ant dir="index-anchor" target="test"/> <ant dir="index-basic" target="test"/> <ant dir="index-more" target="test"/> Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Sat Sep 15 16:16:48 2012 @@ -135,7 +135,13 @@ public class RelTagParser implements Par FIELDS.add(WebPage.Field.BASE_URL); FIELDS.add(WebPage.Field.METADATA); } - + + /** + * Gets all the fields for a given {@link WebPage} + * Many datastores need to setup the mapreduce job by specifying the fields + * needed. All extensions that work on WebPage are able to specify what fields + * they need. + */ @Override public Collection<Field> getFields() { return FIELDS; Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Sat Sep 15 16:16:48 2012 @@ -27,17 +27,20 @@ import org.apache.hadoop.conf.Configurat import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseException; import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.Parser; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; import org.junit.Test; - import junit.framework.TestCase; /** - * Junit test for {@link RelTagParser} based on John Xing's parser tests. + * Junit test for {@link RelTagParser} based mainly John Xing's parser tests. + * We are not concerned with actual parse text within the sample file, instead + * we assert that the rel-tags we expect are found in the WebPage metadata. + * To check the parser is working as expected we unwrap the ByteBuffer obtained + * from metadata, the same type as * we use in expected (String). So just the + * other way around as we wrapped the metadata value. * * @author lewismc * @@ -51,10 +54,10 @@ public class TestRelTagParser extends Te // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/microformats-reltag/build.xml during plugin compilation. - - private String[] sampleFile = { "microformats_reltag_test.html" }; + private String sampleFile = "microformats_reltag_test.html"; - private String expectedText = "rel=\"tag\" · Microformats Wiki"; + // rel-tag's we expect to be extracted from page.getMetadata() + private String expectedRelTags = "Category:Specifications Category:rel-tag "; private Configuration conf; @@ -62,42 +65,35 @@ public class TestRelTagParser extends Te super(name); } - protected void setUp() { - conf = NutchConfiguration.create(); + @Test + public void testRelTagParser() throws ProtocolException, ParseException, IOException { + conf = NutchConfiguration.create(); conf.set("file.content.limit", "-1"); - } - - protected void tearDown() { - } - - public String getTextContent(String fileName) throws ProtocolException, ParseException, IOException { Parse parse; - String urlString = sampleDir + fileSeparator + fileName; + String urlString = "file:" + sampleDir + fileSeparator + sampleFile; - File file = new File(urlString); + File file = new File(sampleDir + fileSeparator + sampleFile); byte[] bytes = new byte[(int) file.length()]; DataInputStream in = new DataInputStream(new FileInputStream(file)); in.readFully(bytes); in.close(); WebPage page = new WebPage(); - page.setBaseUrl(new Utf8("file:"+urlString)); + page.setBaseUrl(new Utf8(urlString)); page.setContent(ByteBuffer.wrap(bytes)); MimeUtil mimeutil = new MimeUtil(conf); String mtype = mimeutil.getMimeType(file); page.setContentType(new Utf8(mtype)); - parse = new ParseUtil(conf).parse("file:"+urlString, page); - - return parse.getText(); + parse = new ParseUtil(conf).parse(urlString, page); + + //begin assertion for tests + ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag")); + byte[] byteArray = new byte[bbuf.remaining()]; + bbuf.get(byteArray); + String s = new String(byteArray); + //bbuf.flip(); + assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", + expectedRelTags, s); } - @Test - public void testRelTagParser() throws ProtocolException, ParseException, IOException { - - for (int i = 0; i < sampleFile.length; i++) { - String found = getTextContent(sampleFile[i]); - assertTrue("text found : '" + found + "'", found.startsWith(expectedText)); - } - } - } \ No newline at end of file Modified: nutch/branches/2.x/src/plugin/parse-js/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/build.xml?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/build.xml (original) +++ nutch/branches/2.x/src/plugin/parse-js/build.xml Sat Sep 15 16:16:48 2012 @@ -19,4 +19,18 @@ <import file="../build-plugin.xml"/> + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <ant target="deploy" inheritall="false" dir="../protocol-file"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.html"/> + <include name="*.js"/> + </fileset> + </copy> </project> Modified: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sat Sep 15 16:16:48 2012 @@ -69,6 +69,15 @@ public class JSParseFilter implements Pa private Configuration conf; + /** + * Scan the JavaScript looking for possible {@link Outlink}'s + * @param url URL of the {@link WebPage} to be parsed + * @param page {@link WebPage} object relative to the URL + * @param parse {@link Parse} object holding parse status + * @param metatags within the {@link NutchDocument} + * @param doc The {@link NutchDocument} object + * @return parse the actual {@link Parse} object + */ @Override public Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { @@ -104,9 +113,10 @@ public class JSParseFilter implements Pa if (i > 0) script.append('\n'); script.append(nn.item(i).getNodeValue()); } - // if (LOG.isInfoEnabled()) { - // LOG.info("script: language=" + lang + ", text: " + script.toString()); - // } + // This logging makes the output very messy. + //if (LOG.isInfoEnabled()) { + // LOG.info("script: language=" + lang + ", text: " + script.toString()); + //} Outlink[] links = getJSLinks(script.toString(), "", base); if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. @@ -141,6 +151,12 @@ public class JSParseFilter implements Pa } } + /** + * Set the {@link Configuration} object + * @param url URL of the {@link WebPage} which is parsed + * @param page {@link WebPage} object relative to the URL + * @return parse the actual {@link Parse} object + */ @Override public Parse getParse(String url, WebPage page) { String type = TableUtil.toString(page.getContentType()); @@ -182,7 +198,9 @@ public class JSParseFilter implements Pa try { baseURL = new URL(base); } catch (Exception e) { - if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", e); } + if (LOG.isErrorEnabled()) { + LOG.error("error assigning base URL", e); + } } try { @@ -207,7 +225,9 @@ public class JSParseFilter implements Pa url = result.group(2); PatternMatcherInput input1 = new PatternMatcherInput(url); if (!matcher1.matches(input1, pattern1)) { - //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); } + if (LOG.isTraceEnabled()) { + LOG.trace(" - invalid '" + url + "'"); + } continue; } if (url.startsWith("www.")) { @@ -234,7 +254,9 @@ public class JSParseFilter implements Pa } catch (Exception ex) { // if it is a malformed URL we just throw it away and continue with // extraction. - if (LOG.isErrorEnabled()) { LOG.error("getJSLinks", ex); } + if (LOG.isErrorEnabled()) { + LOG.error(" - invalid or malformed URL", ex); + } } final Outlink[] retval; @@ -249,6 +271,12 @@ public class JSParseFilter implements Pa return retval; } + /** + * Main method which can be run from command line with the plugin option. + * The method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js baseURL + * @param args + * @throws Exception + */ public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println(JSParseFilter.class.getName() + " file.js baseURL"); @@ -267,14 +295,26 @@ public class JSParseFilter implements Pa System.out.println(" - " + links[i]); } + /** + * Set the {@link Configuration} object + */ public void setConf(Configuration conf) { this.conf = conf; } + /** + * Get the {@link Configuration} object + */ public Configuration getConf() { return this.conf; } + /** + * Gets all the fields for a given {@link WebPage} + * Many datastores need to setup the mapreduce job by specifying the fields + * needed. All extensions that work on WebPage are able to specify what fields + * they need. + */ @Override public Collection<WebPage.Field> getFields() { return null; Added: nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html?rev=1385103&view=auto ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html (added) +++ nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html Sat Sep 15 16:16:48 2012 @@ -0,0 +1,6 @@ +<html> +<body> +<p>A parser plugin and content filter to extract all (possible) links +from JavaScript files and code snippets.</p> +</body> +</html> Added: nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1385103&view=auto ============================================================================== --- nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java (added) +++ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Sat Sep 15 16:16:48 2012 @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse.js; + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; + +import org.apache.avro.util.Utf8; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseException; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.storage.WebPage; +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Test; + +import junit.framework.TestCase; + +/** + * JUnit test case for {@link JSParseFilter} which tests + * 1. That 5 outlinks are extracted from JavaScript snippets embedded in HTML + * 2. That X outlinks are extracted from a pure JavaScript file (this is temporarily disabled) + * + * @author lewismc + */ + +public class TestJSParseFilter extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + + // This system property is defined in ./src/plugin/build-plugin.xml + private String sampleDir = System.getProperty("test.data", "."); + + // Make sure sample files are copied to "test.data" as specified in + // ./src/plugin/parse-js/build.xml during plugin compilation. + private String[] sampleFiles = { "parse_pure_js_test.js", "parse_embedded_js_test.html" }; + + private Configuration conf; + + public TestJSParseFilter(String name) { + super(name); + } + + protected void setUp() { + conf = NutchConfiguration.create(); + conf.set("file.content.limit", "-1"); + } + + protected void tearDown() { + } + + public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException { + String urlString; + Parse parse; + + urlString = "file:" + sampleDir + fileSeparator + sampleFiles; + File file = new File(urlString); + byte[] bytes = new byte[(int) file.length()]; + DataInputStream dip = new DataInputStream(new FileInputStream(file)); + dip.readFully(bytes); + dip.close(); + + WebPage page = new WebPage(); + page.setBaseUrl(new Utf8(urlString)); + page.setContent(ByteBuffer.wrap(bytes)); + MimeUtil mutil = new MimeUtil(conf); + String mime = mutil.getMimeType(file); + page.setContentType(new Utf8(mime)); + + parse = new ParseUtil(conf).parse(urlString, page); + return parse.getOutlinks(); + } + + @Test + public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException { + String[] filenames = new File(sampleDir).list(); + for (int i = 0; i < filenames.length; i++) { + if (filenames[i].endsWith(".js") == true) { + assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles)); + // temporarily disabled as a suitable pure JS file could not be be found. + //} else { + //assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles)); + } + } + } + +} \ No newline at end of file Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=1385103&r1=1385102&r2=1385103&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (original) +++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Sat Sep 15 16:16:48 2012 @@ -16,23 +16,6 @@ ******************************************************************************/ package org.apache.nutch.parse.tika; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - // JUnit imports import java.io.DataInputStream; import java.io.File; @@ -79,7 +62,6 @@ public class TestRTFParser extends TestC } public void testIt() throws ProtocolException, ParseException, IOException { - /* Temporarily disabled - see Tika-748 String urlString; Parse parse; @@ -97,22 +79,23 @@ public class TestRTFParser extends TestC WebPage page = new WebPage(); page.setBaseUrl(new Utf8(urlString)); page.setContent(ByteBuffer.wrap(bytes)); - MimeType mtype = mimeutil.getMimeType(file); - page.setContentType(new Utf8(mtype.getName())); + String mtype = mimeutil.getMimeType(file); + page.setContentType(new Utf8(mtype)); parse = new ParseUtil(conf).parse(urlString, page); + String title = parse.getTitle(); String text = parse.getText(); - assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); + assertEquals("test rft document", title); + //assertEquals("The quick brown fox jumps over the lazy dog", text.trim()); - String title = parse.getTitle(); + // HOW DO WE GET THE PARSE METADATA? // Metadata meta = parse(); // METADATA extraction is not yet supported in Tika - // assertEquals("test rft document", title); + // // assertEquals("tests", meta.get(DublinCore.SUBJECT)); - */ } }