Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -55,25 +55,28 @@ import org.apache.commons.lang.time.Date * Add (or reset) a few metaData properties as respective fields (if they are * available), so that they can be accurately used within the search index. * - * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP - * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt - * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative - * that the content provider wants the filename therein to be used as the title. - * + * 'lastModifed' is indexed to support query by date, 'contentLength' obtains + * content length from the HTTP header, 'type' field is indexed to support query + * by type and finally the 'title' field is an attempt to reset the title if a + * content-disposition hint exists. The logic is that such a presence is + * indicative that the content provider wants the filename therein to be used as + * the title. + * * Still need to make content-length searchable! - * + * * @author John Xing */ public class MoreIndexingFilter implements IndexingFilter { - public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class); + public static final Logger LOG = LoggerFactory + .getLogger(MoreIndexingFilter.class); /** Get the MimeTypes resolver instance. */ private MimeUtil MIME; private Tika tika = new Tika(); /** Map for mime-type substitution */ - private HashMap<String,String> mimeMap = null; + private HashMap<String, String> mimeMap = null; private boolean mapMimes = false; public NutchDocument filter(NutchDocument doc, Parse parse, Text url, @@ -89,23 +92,24 @@ public class MoreIndexingFilter implemen return doc; } - // Add time related meta info. Add last-modified if present. Index date as + // Add time related meta info. Add last-modified if present. Index date as // last-modified, or, if that's not present, use fetch time. - private NutchDocument addTime(NutchDocument doc, ParseData data, - String url, CrawlDatum datum) { + private NutchDocument addTime(NutchDocument doc, ParseData data, String url, + CrawlDatum datum) { long time = -1; String lastModified = data.getMeta(Metadata.LAST_MODIFIED); - if (lastModified != null) { // try parse last-modified - time = getTime(lastModified,url); // use as time - // store as string + if (lastModified != null) { // try parse last-modified + time = getTime(lastModified, url); // use as time + // store as string doc.add("lastModified", new Date(time)); } - if (time == -1) { // if no last-modified specified in HTTP header - time = datum.getModifiedTime(); // use value in CrawlDatum - if (time <= 0) { // if also unset - time = datum.getFetchTime(); // use time the fetch took place (fetchTime of fetchDatum) + if (time == -1) { // if no last-modified specified in HTTP header + time = datum.getModifiedTime(); // use value in CrawlDatum + if (time <= 0) { // if also unset + time = datum.getFetchTime(); // use time the fetch took place (fetchTime + // of fetchDatum) } } @@ -119,43 +123,29 @@ public class MoreIndexingFilter implemen try { time = HttpDateFormat.toLong(date); } catch (ParseException e) { - // try to parse it as date in alternative format - try { - Date parsedDate = DateUtils.parseDate(date, - new String [] { - "EEE MMM dd HH:mm:ss yyyy", - "EEE MMM dd HH:mm:ss yyyy zzz", - "EEE MMM dd HH:mm:ss zzz yyyy", - "EEE, MMM dd HH:mm:ss yyyy zzz", - "EEE, dd MMM yyyy HH:mm:ss zzz", - "EEE,dd MMM yyyy HH:mm:ss zzz", - "EEE, dd MMM yyyy HH:mm:sszzz", - "EEE, dd MMM yyyy HH:mm:ss", - "EEE, dd-MMM-yy HH:mm:ss zzz", - "yyyy/MM/dd HH:mm:ss.SSS zzz", - "yyyy/MM/dd HH:mm:ss.SSS", - "yyyy/MM/dd HH:mm:ss zzz", - "yyyy/MM/dd", - "yyyy.MM.dd HH:mm:ss", - "yyyy-MM-dd HH:mm", - "MMM dd yyyy HH:mm:ss. zzz", - "MMM dd yyyy HH:mm:ss zzz", - "dd.MM.yyyy HH:mm:ss zzz", - "dd MM yyyy HH:mm:ss zzz", - "dd.MM.yyyy; HH:mm:ss", - "dd.MM.yyyy HH:mm:ss", - "dd.MM.yyyy zzz", - "yyyy-MM-dd'T'HH:mm:ss'Z'" - }); - time = parsedDate.getTime(); - // if (LOG.isWarnEnabled()) { - // LOG.warn(url + ": parsed date: " + date +" to:"+time); - // } - } catch (Exception e2) { - if (LOG.isWarnEnabled()) { - LOG.warn(url + ": can't parse erroneous date: " + date); - } - } + // try to parse it as date in alternative format + try { + Date parsedDate = DateUtils.parseDate(date, new String[] { + "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz", + "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz", + "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz", + "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss", + "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz", + "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd", + "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm", + "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz", + "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz", + "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz", + "yyyy-MM-dd'T'HH:mm:ss'Z'" }); + time = parsedDate.getTime(); + // if (LOG.isWarnEnabled()) { + // LOG.warn(url + ": parsed date: " + date +" to:"+time); + // } + } catch (Exception e2) { + if (LOG.isWarnEnabled()) { + LOG.warn(url + ": can't parse erroneous date: " + date); + } + } } return time; } @@ -187,7 +177,7 @@ public class MoreIndexingFilter implemen * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. * </p> - * + * * @param doc * @param data * @param url @@ -212,9 +202,9 @@ public class MoreIndexingFilter implemen // (using MimeTypes.getMimeType(byte[], String), but I don't know // which field it is? // if (MAGIC) { - // contentType = MIME.getMimeType(url, content); + // contentType = MIME.getMimeType(url, content); // } else { - // contentType = MIME.getMimeType(url); + // contentType = MIME.getMimeType(url); // } mimeType = tika.detect(url); @@ -243,20 +233,20 @@ public class MoreIndexingFilter implemen if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) { String[] parts = getParts(contentType); - for(String part: parts) { + for (String part : parts) { doc.add("type", part); } } // leave this for future improvement - //MimeTypeParameterList parameterList = mimeType.getParameters() + // MimeTypeParameterList parameterList = mimeType.getParameters() return doc; } - /** * Utility method for splitting mime type into type and subtype. + * * @param mimeType * @return */ @@ -273,15 +263,13 @@ public class MoreIndexingFilter implemen // Content-Disposition: inline; filename="foo.ppt" private Configuration conf; - static Pattern patterns[] = {null, null}; + static Pattern patterns[] = { null, null }; static { try { // order here is important - patterns[0] = - Pattern.compile("\\bfilename=['\"](.+)['\"]"); - patterns[1] = - Pattern.compile("\\bfilename=(\\S+)\\b"); + patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]"); + patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); } catch (PatternSyntaxException e) { // just ignore } @@ -292,7 +280,7 @@ public class MoreIndexingFilter implemen if (contentDisposition == null || doc.getFieldValue("title") != null) return doc; - for (int i=0; i<patterns.length; i++) { + for (int i = 0; i < patterns.length; i++) { Matcher matcher = patterns[i].matcher(contentDisposition); if (matcher.find()) { doc.add("title", matcher.group(1)); @@ -324,11 +312,12 @@ public class MoreIndexingFilter implemen } private void readConfiguration() throws IOException { - BufferedReader reader = new BufferedReader(conf.getConfResourceAsReader("contenttype-mapping.txt")); + BufferedReader reader = new BufferedReader( + conf.getConfResourceAsReader("contenttype-mapping.txt")); String line; String parts[]; - mimeMap = new HashMap<String,String>(); + mimeMap = new HashMap<String, String>(); while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
Modified: nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -40,7 +40,7 @@ public class TestMoreIndexingFilter { assertContentType(conf, "text/html", "text/html"); assertContentType(conf, "text/html; charset=UTF-8", "text/html"); } - + @Test public void testGetParts() { String[] parts = MoreIndexingFilter.getParts("text/html"); @@ -51,7 +51,7 @@ public class TestMoreIndexingFilter { * @since NUTCH-901 */ @Test - public void testNoParts(){ + public void testNoParts() { Configuration conf = NutchConfiguration.create(); conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false); MoreIndexingFilter filter = new MoreIndexingFilter(); @@ -59,18 +59,18 @@ public class TestMoreIndexingFilter { Assert.assertNotNull(filter); NutchDocument doc = new NutchDocument(); ParseImpl parse = new ParseImpl("foo bar", new ParseData()); - - try{ - filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks()); - } - catch(Exception e){ - e.printStackTrace(); - Assert.fail(e.getMessage()); + + try { + filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), + new CrawlDatum(), new Inlinks()); + } catch (Exception e) { + e.printStackTrace(); + Assert.fail(e.getMessage()); } Assert.assertNotNull(doc); Assert.assertTrue(doc.getFieldNames().contains("type")); Assert.assertEquals(1, doc.getField("type").getValues().size()); - Assert.assertEquals("text/html", doc.getFieldValue("type")); + Assert.assertEquals("text/html", doc.getFieldValue("type")); } @Test @@ -89,8 +89,9 @@ public class TestMoreIndexingFilter { NutchDocument doc = new NutchDocument(); doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks()); - Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title")); - + Assert.assertEquals("content-disposition not detected", "filename.ext", + doc.getFieldValue("title")); + /* NUTCH-1140: do not add second title to avoid a multi-valued title field */ doc = new NutchDocument(); doc.add("title", "title"); @@ -105,15 +106,18 @@ public class TestMoreIndexingFilter { Assert.assertEquals(expected[i], parts[i]); } } - - private void assertContentType(Configuration conf, String source, String expected) throws IndexingException { + + private void assertContentType(Configuration conf, String source, + String expected) throws IndexingException { Metadata metadata = new Metadata(); metadata.add(Response.CONTENT_TYPE, source); MoreIndexingFilter filter = new MoreIndexingFilter(); filter.setConf(conf); - NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData( - new ParseStatus(), "title", new Outlink[0], metadata)), new Text( - "http://www.example.com/"), new CrawlDatum(), new Inlinks()); - Assert.assertEquals("mime type not detected", expected, doc.getFieldValue("type")); + NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl( + "text", new ParseData(new ParseStatus(), "title", new Outlink[0], + metadata)), new Text("http://www.example.com/"), new CrawlDatum(), + new Inlinks()); + Assert.assertEquals("mime type not detected", expected, + doc.getFieldValue("type")); } } Modified: nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java (original) +++ nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java Thu Jan 29 05:38:59 2015 @@ -45,11 +45,16 @@ public class StaticFieldIndexer implemen * The {@link StaticFieldIndexer} filter object which adds fields as per * configuration setting. See {@code index.static} in nutch-default.xml. * - * @param doc The {@link NutchDocument} object - * @param parse The relevant {@link Parse} object passing through the filter - * @param url URL to be filtered for anchor text - * @param datum The {@link CrawlDatum} entry - * @param inlinks The {@link Inlinks} containing anchor text + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered for anchor text + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, @@ -66,17 +71,19 @@ public class StaticFieldIndexer implemen } /** - * Populate a HashMap from a list of fieldname:fieldcontent. - * See {@index.static} in nutch-default.xml. + * Populate a HashMap from a list of fieldname:fieldcontent. See + * {@index.static} in nutch-default.xml. * - * @param fieldsString string containing field:value pairs + * @param fieldsString + * string containing field:value pairs * @return HashMap of fields and their corresponding values */ private HashMap<String, String[]> parseFields(String fieldsString) { HashMap<String, String[]> fields = new HashMap<String, String[]>(); - /* The format is very easy, it's a comma-separated list of fields in the - form <name>:<value> + /* + * The format is very easy, it's a comma-separated list of fields in the + * form <name>:<value> */ for (String field : fieldsString.split(",")) { String[] entry = field.split(":"); Modified: nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java (original) +++ nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java Thu Jan 29 05:38:59 2015 @@ -28,11 +28,10 @@ import org.junit.Before; import org.junit.Test; /** - * JUnit test case which tests - * 1. that static data fields are added to a document - * 2. that empty {@code index.static} does not add anything to the document - * 3. that valid field:value pairs are added to the document - * 4. that fields and values added to the document are trimmed + * JUnit test case which tests 1. that static data fields are added to a + * document 2. that empty {@code index.static} does not add anything to the + * document 3. that valid field:value pairs are added to the document 4. that + * fields and values added to the document are trimmed * * @author tejasp */ @@ -59,7 +58,8 @@ public class TestStaticFieldIndexerTest /** * Test that empty {@code index.static} does not add anything to the document - * @throws Exception + * + * @throws Exception */ @Test public void testEmptyIndexStatic() throws Exception { @@ -77,12 +77,14 @@ public class TestStaticFieldIndexerTest } Assert.assertNotNull(doc); - Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty()); + Assert.assertTrue("tests if no field is set for empty index.static", doc + .getFieldNames().isEmpty()); } /** * Test that valid field:value pairs are added to the document - * @throws Exception + * + * @throws Exception */ @Test public void testNormalScenario() throws Exception { @@ -102,13 +104,15 @@ public class TestStaticFieldIndexerTest } Assert.assertNotNull(doc); - Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty()); - Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size()); - Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues() - .contains("val1")); - Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues() - .contains("val2")); - Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues() - .contains("val4")); + Assert.assertFalse("test if doc is not empty", doc.getFieldNames() + .isEmpty()); + Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames() + .size()); + Assert.assertTrue("test if doc has field1", doc.getField("field1") + .getValues().contains("val1")); + Assert.assertTrue("test if doc has field2", doc.getField("field2") + .getValues().contains("val2")); + Assert.assertTrue("test if doc has field4", doc.getField("field4") + .getValues().contains("val4")); } } Modified: nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java (original) +++ nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java Thu Jan 29 05:38:59 2015 @@ -35,12 +35,13 @@ import org.slf4j.LoggerFactory; * and add. */ public class DummyIndexWriter implements IndexWriter { - public static final Logger LOG = LoggerFactory.getLogger(DummyIndexWriter.class); + public static final Logger LOG = LoggerFactory + .getLogger(DummyIndexWriter.class); private Configuration config; private Writer writer; private boolean delete = false; - public void open(JobConf job, String name) throws IOException { + public void open(JobConf job, String name) throws IOException { delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false); } @@ -82,19 +83,21 @@ public class DummyIndexWriter implements String path = conf.get("dummy.path"); if (path == null) { String message = "Missing path. Should be set via -Ddummy.path"; - message+="\n"+describe(); + message += "\n" + describe(); LOG.error(message); throw new RuntimeException(message); } try { writer = new BufferedWriter(new FileWriter(conf.get("dummy.path"))); - } catch (IOException e) {} + } catch (IOException e) { + } } - public String describe(){ + public String describe() { StringBuffer sb = new StringBuffer("DummyIndexWriter\n"); - sb.append("\t").append("dummy.path : Path of the file to write to (mandatory)\n"); + sb.append("\t").append( + "dummy.path : Path of the file to write to (mandatory)\n"); return sb.toString(); } } Modified: nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java (original) +++ nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java Thu Jan 29 05:38:59 2015 @@ -20,3 +20,4 @@ * text file, action is one of "add", "update", or "delete". */ package org.apache.nutch.indexwriter.dummy; + Modified: nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java (original) +++ nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>. */ package org.apache.nutch.indexwriter.elastic; + Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Thu Jan 29 05:38:59 2015 @@ -30,10 +30,10 @@ public interface SolrConstants { public static final String USERNAME = SOLR_PREFIX + "auth.username"; public static final String PASSWORD = SOLR_PREFIX + "auth.password"; - + @Deprecated public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index"; - + @Deprecated public static final String PARAMS = SOLR_PREFIX + "params"; Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Thu Jan 29 05:38:59 2015 @@ -39,169 +39,175 @@ import org.slf4j.LoggerFactory; public class SolrIndexWriter implements IndexWriter { - public static final Logger LOG = LoggerFactory - .getLogger(SolrIndexWriter.class); + public static final Logger LOG = LoggerFactory + .getLogger(SolrIndexWriter.class); - private SolrServer solr; - private SolrMappingReader solrMapping; - private ModifiableSolrParams params; - - private Configuration config; - - private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>(); - - private int batchSize; - private int numDeletes = 0; - private boolean delete = false; - - public void open(JobConf job, String name) throws IOException { - SolrServer server = SolrUtils.getCommonsHttpSolrServer(job); - init(server, job); - } - - // package protected for tests - void init(SolrServer server, JobConf job) throws IOException { - solr = server; - batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000); - solrMapping = SolrMappingReader.getInstance(job); - delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false); - // parse optional params - params = new ModifiableSolrParams(); - String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS); - if (paramString != null) { - String[] values = paramString.split("&"); - for (String v : values) { - String[] kv = v.split("="); - if (kv.length < 2) { - continue; - } - params.add(kv[0], kv[1]); - } - } - } - - public void delete(String key) throws IOException { - if (delete) { - try { - solr.deleteById(key); - numDeletes++; - } catch (final SolrServerException e) { - throw makeIOException(e); - } - } - } - - @Override - public void update(NutchDocument doc) throws IOException { - write(doc); - } - - public void write(NutchDocument doc) throws IOException { - final SolrInputDocument inputDoc = new SolrInputDocument(); - for (final Entry<String, NutchField> e : doc) { - for (final Object val : e.getValue().getValues()) { - // normalise the string representation for a Date - Object val2 = val; - - if (val instanceof Date) { - val2 = DateUtil.getThreadLocalDateFormat().format(val); - } - - if (e.getKey().equals("content") || e.getKey().equals("title")) { - val2 = SolrUtils.stripNonCharCodepoints((String) val); - } - - inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e - .getValue().getWeight()); - String sCopy = solrMapping.mapCopyKey(e.getKey()); - if (sCopy != e.getKey()) { - inputDoc.addField(sCopy, val); - } - } - } - - inputDoc.setDocumentBoost(doc.getWeight()); - inputDocs.add(inputDoc); - if (inputDocs.size() + numDeletes >= batchSize) { - try { - LOG.info("Indexing " + Integer.toString(inputDocs.size()) - + " documents"); - LOG.info("Deleting " + Integer.toString(numDeletes) - + " documents"); - numDeletes = 0; - UpdateRequest req = new UpdateRequest(); - req.add(inputDocs); - req.setParams(params); - req.process(solr); - } catch (final SolrServerException e) { - throw makeIOException(e); - } - inputDocs.clear(); - } - } + private SolrServer solr; + private SolrMappingReader solrMapping; + private ModifiableSolrParams params; + + private Configuration config; + + private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>(); + + private int batchSize; + private int numDeletes = 0; + private boolean delete = false; + + public void open(JobConf job, String name) throws IOException { + SolrServer server = SolrUtils.getCommonsHttpSolrServer(job); + init(server, job); + } + + // package protected for tests + void init(SolrServer server, JobConf job) throws IOException { + solr = server; + batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000); + solrMapping = SolrMappingReader.getInstance(job); + delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false); + // parse optional params + params = new ModifiableSolrParams(); + String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS); + if (paramString != null) { + String[] values = paramString.split("&"); + for (String v : values) { + String[] kv = v.split("="); + if (kv.length < 2) { + continue; + } + params.add(kv[0], kv[1]); + } + } + } + + public void delete(String key) throws IOException { + if (delete) { + try { + solr.deleteById(key); + numDeletes++; + } catch (final SolrServerException e) { + throw makeIOException(e); + } + } + } + + @Override + public void update(NutchDocument doc) throws IOException { + write(doc); + } + + public void write(NutchDocument doc) throws IOException { + final SolrInputDocument inputDoc = new SolrInputDocument(); + for (final Entry<String, NutchField> e : doc) { + for (final Object val : e.getValue().getValues()) { + // normalise the string representation for a Date + Object val2 = val; + + if (val instanceof Date) { + val2 = DateUtil.getThreadLocalDateFormat().format(val); + } + + if (e.getKey().equals("content") || e.getKey().equals("title")) { + val2 = SolrUtils.stripNonCharCodepoints((String) val); + } + + inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue() + .getWeight()); + String sCopy = solrMapping.mapCopyKey(e.getKey()); + if (sCopy != e.getKey()) { + inputDoc.addField(sCopy, val); + } + } + } + + inputDoc.setDocumentBoost(doc.getWeight()); + inputDocs.add(inputDoc); + if (inputDocs.size() + numDeletes >= batchSize) { + try { + LOG.info("Indexing " + Integer.toString(inputDocs.size()) + + " documents"); + LOG.info("Deleting " + Integer.toString(numDeletes) + " documents"); + numDeletes = 0; + UpdateRequest req = new UpdateRequest(); + req.add(inputDocs); + req.setParams(params); + req.process(solr); + } catch (final SolrServerException e) { + throw makeIOException(e); + } + inputDocs.clear(); + } + } + + public void close() throws IOException { + try { + if (!inputDocs.isEmpty()) { + LOG.info("Indexing " + Integer.toString(inputDocs.size()) + + " documents"); + if (numDeletes > 0) { + LOG.info("Deleting " + Integer.toString(numDeletes) + " documents"); + } + UpdateRequest req = new UpdateRequest(); + req.add(inputDocs); + req.setParams(params); + req.process(solr); + inputDocs.clear(); + } + } catch (final SolrServerException e) { + throw makeIOException(e); + } + } + + @Override + public void commit() throws IOException { + try { + solr.commit(); + } catch (SolrServerException e) { + throw makeIOException(e); + } + } + + public static IOException makeIOException(SolrServerException e) { + final IOException ioe = new IOException(); + ioe.initCause(e); + return ioe; + } + + @Override + public Configuration getConf() { + return config; + } + + @Override + public void setConf(Configuration conf) { + config = conf; + String serverURL = conf.get(SolrConstants.SERVER_URL); + if (serverURL == null) { + String message = "Missing SOLR URL. Should be set via -D " + + SolrConstants.SERVER_URL; + message += "\n" + describe(); + LOG.error(message); + throw new RuntimeException(message); + } + } + + public String describe() { + StringBuffer sb = new StringBuffer("SOLRIndexWriter\n"); + sb.append("\t").append(SolrConstants.SERVER_URL) + .append(" : URL of the SOLR instance (mandatory)\n"); + sb.append("\t").append(SolrConstants.COMMIT_SIZE) + .append(" : buffer size when sending to SOLR (default 1000)\n"); + sb.append("\t") + .append(SolrConstants.MAPPING_FILE) + .append( + " : name of the mapping file for fields (default solrindex-mapping.xml)\n"); + sb.append("\t").append(SolrConstants.USE_AUTH) + .append(" : use authentication (default false)\n"); + sb.append("\t").append(SolrConstants.USERNAME) + .append(" : username for authentication\n"); + sb.append("\t").append(SolrConstants.PASSWORD) + .append(" : password for authentication\n"); + return sb.toString(); + } - public void close() throws IOException { - try { - if (!inputDocs.isEmpty()) { - LOG.info("Indexing " + Integer.toString(inputDocs.size()) - + " documents"); - if (numDeletes > 0) { - LOG.info("Deleting " + Integer.toString(numDeletes) - + " documents"); - } - UpdateRequest req = new UpdateRequest(); - req.add(inputDocs); - req.setParams(params); - req.process(solr); - inputDocs.clear(); - } - } catch (final SolrServerException e) { - throw makeIOException(e); - } - } - - @Override - public void commit() throws IOException { - try { - solr.commit(); - } catch (SolrServerException e) { - throw makeIOException(e); - } - } - - public static IOException makeIOException(SolrServerException e) { - final IOException ioe = new IOException(); - ioe.initCause(e); - return ioe; - } - - @Override - public Configuration getConf() { - return config; - } - - @Override - public void setConf(Configuration conf) { - config = conf; - String serverURL = conf.get(SolrConstants.SERVER_URL); - if (serverURL == null) { - String message = "Missing SOLR URL. Should be set via -D " - + SolrConstants.SERVER_URL; - message+="\n"+describe(); - LOG.error(message); - throw new RuntimeException(message); - } - } - - public String describe(){ - StringBuffer sb = new StringBuffer("SOLRIndexWriter\n"); - sb.append("\t").append(SolrConstants.SERVER_URL).append(" : URL of the SOLR instance (mandatory)\n"); - sb.append("\t").append(SolrConstants.COMMIT_SIZE).append(" : buffer size when sending to SOLR (default 1000)\n"); - sb.append("\t").append(SolrConstants.MAPPING_FILE).append(" : name of the mapping file for fields (default solrindex-mapping.xml)\n"); - sb.append("\t").append(SolrConstants.USE_AUTH).append(" : use authentication (default false)\n"); - sb.append("\t").append(SolrConstants.USERNAME).append(" : username for authentication\n"); - sb.append("\t").append(SolrConstants.PASSWORD).append(" : password for authentication\n"); - return sb.toString(); - } - } Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Thu Jan 29 05:38:59 2015 @@ -38,16 +38,17 @@ import org.xml.sax.SAXException; public class SolrMappingReader { public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class); - + private Configuration conf; - + private Map<String, String> keyMap = new HashMap<String, String>(); private Map<String, String> copyMap = new HashMap<String, String>(); private String uniqueKey = "id"; - + public static synchronized SolrMappingReader getInstance(Configuration conf) { ObjectCache cache = ObjectCache.get(conf); - SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName()); + SolrMappingReader instance = (SolrMappingReader) cache + .getObject(SolrMappingReader.class.getName()); if (instance == null) { instance = new SolrMappingReader(conf); cache.setObject(SolrMappingReader.class.getName(), instance); @@ -60,9 +61,10 @@ public class SolrMappingReader { parseMapping(); } - private void parseMapping() { + private void parseMapping() { InputStream ssInputStream = null; - ssInputStream = conf.getConfResourceAsInputStream(conf.get(SolrConstants.MAPPING_FILE, "solrindex-mapping.xml")); + ssInputStream = conf.getConfResourceAsInputStream(conf.get( + SolrConstants.MAPPING_FILE, "solrindex-mapping.xml")); InputSource inputSource = new InputSource(ssInputStream); try { @@ -74,48 +76,50 @@ public class SolrMappingReader { if (fieldList.getLength() > 0) { for (int i = 0; i < fieldList.getLength(); i++) { Element element = (Element) fieldList.item(i); - LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); - keyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + LOG.info("source: " + element.getAttribute("source") + " dest: " + + element.getAttribute("dest")); + keyMap.put(element.getAttribute("source"), + element.getAttribute("dest")); } } NodeList copyFieldList = rootElement.getElementsByTagName("copyField"); if (copyFieldList.getLength() > 0) { for (int i = 0; i < copyFieldList.getLength(); i++) { Element element = (Element) copyFieldList.item(i); - LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest")); - copyMap.put(element.getAttribute("source"), element.getAttribute("dest")); + LOG.info("source: " + element.getAttribute("source") + " dest: " + + element.getAttribute("dest")); + copyMap.put(element.getAttribute("source"), + element.getAttribute("dest")); } } NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey"); if (uniqueKeyItem.getLength() > 1) { LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'"); uniqueKey = "id"; - } - else if (uniqueKeyItem.getLength() == 0) { + } else if (uniqueKeyItem.getLength() == 0) { LOG.warn("No unique key definition found in solr index mapping using, default 'id'"); - } - else{ - uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue(); + } else { + uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue(); } } catch (MalformedURLException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (SAXException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (IOException e) { - LOG.warn(e.toString()); + LOG.warn(e.toString()); } catch (ParserConfigurationException e) { - LOG.warn(e.toString()); - } + LOG.warn(e.toString()); + } } - + public Map<String, String> getKeyMap() { return keyMap; } - + public Map<String, String> getCopyMap() { return copyMap; } - + public String getUniqueKey() { return uniqueKey; } @@ -128,14 +132,14 @@ public class SolrMappingReader { } public String mapKey(String key) throws IOException { - if(keyMap.containsKey(key)) { + if (keyMap.containsKey(key)) { key = (String) keyMap.get(key); } return key; } public String mapCopyKey(String key) throws IOException { - if(copyMap.containsKey(key)) { + if (copyMap.containsKey(key)) { key = (String) copyMap.get(key); } return key; Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Thu Jan 29 05:38:59 2015 @@ -31,8 +31,9 @@ public class SolrUtils { public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class); - public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) throws MalformedURLException { - HttpClient client=new HttpClient(); + public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) + throws MalformedURLException { + HttpClient client = new HttpClient(); // Check for username/password if (job.getBoolean(SolrConstants.USE_AUTH, false)) { @@ -40,9 +41,13 @@ public class SolrUtils { LOG.info("Authenticating as: " + username); - AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); + AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, + AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); - client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD))); + client.getState().setCredentials( + scope, + new UsernamePasswordCredentials(username, job + .get(SolrConstants.PASSWORD))); HttpClientParams params = client.getParams(); params.setAuthenticationPreemptive(true); @@ -51,7 +56,7 @@ public class SolrUtils { } String serverURL = job.get(SolrConstants.SERVER_URL); - + return new CommonsHttpSolrServer(serverURL, client); } @@ -62,8 +67,10 @@ public class SolrUtils { for (int i = 0; i < input.length(); i++) { ch = input.charAt(i); - // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] - // and non-printable control characters except tabulator, new line and carriage return + // Strip all non-characters + // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] + // and non-printable control characters except tabulator, new line and + // carriage return if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000 ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>. */ package org.apache.nutch.indexwriter.solr; + Modified: nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original) +++ nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Thu Jan 29 05:38:59 2015 @@ -41,289 +41,280 @@ import org.w3c.dom.Node; public class HTMLLanguageParser implements HtmlParseFilter { - public static final Logger LOG = LoggerFactory - .getLogger(HTMLLanguageParser.class); + public static final Logger LOG = LoggerFactory + .getLogger(HTMLLanguageParser.class); - private int detect = -1, identify = -1; + private int detect = -1, identify = -1; - private int contentMaxlength = -1; + private int contentMaxlength = -1; - private boolean onlyCertain = false; + private boolean onlyCertain = false; - /* A static Map of ISO-639 language codes */ - private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>(); - static { - try { - Properties p = new Properties(); - p.load(HTMLLanguageParser.class - .getResourceAsStream("langmappings.properties")); - Enumeration<?> keys = p.keys(); - while (keys.hasMoreElements()) { - String key = (String) keys.nextElement(); - String[] values = p.getProperty(key).split(",", -1); - LANGUAGES_MAP.put(key, key); - for (int i = 0; i < values.length; i++) { - LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key); - } - } - } catch (Exception e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.toString()); - } - } + /* A static Map of ISO-639 language codes */ + private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>(); + static { + try { + Properties p = new Properties(); + p.load(HTMLLanguageParser.class + .getResourceAsStream("langmappings.properties")); + Enumeration<?> keys = p.keys(); + while (keys.hasMoreElements()) { + String key = (String) keys.nextElement(); + String[] values = p.getProperty(key).split(",", -1); + LANGUAGES_MAP.put(key, key); + for (int i = 0; i < values.length; i++) { + LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key); + } + } + } catch (Exception e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.toString()); + } } + } - private Configuration conf; + private Configuration conf; - /** - * Scan the HTML document looking at possible indications of content - * language<br> - * <li>1. html lang attribute - * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta - * dc.language - * (http://dublincore.org/documents/2000/07/16/usageguide/qualified - * -html.shtml#language) <li>3. meta http-equiv (content-language) - * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br> - */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - String lang = null; - - Parse parse = parseResult.get(content.getUrl()); - - if (detect >= 0 && identify < 0) { - lang = detectLanguage(parse, doc); - } else if (detect < 0 && identify >= 0) { - lang = identifyLanguage(parse); - } else if (detect < identify) { - lang = detectLanguage(parse, doc); - if (lang == null) { - lang = identifyLanguage(parse); - } - } else if (identify < detect) { - lang = identifyLanguage(parse); - if (lang == null) { - lang = detectLanguage(parse, doc); - } - } else { - LOG.warn("No configuration for language extraction policy is provided"); - return parseResult; - } + /** + * Scan the HTML document looking at possible indications of content language<br> + * <li>1. html lang attribute + * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta + * dc.language + * (http://dublincore.org/documents/2000/07/16/usageguide/qualified + * -html.shtml#language) <li>3. meta http-equiv (content-language) + * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br> + */ + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + String lang = null; + + Parse parse = parseResult.get(content.getUrl()); + + if (detect >= 0 && identify < 0) { + lang = detectLanguage(parse, doc); + } else if (detect < 0 && identify >= 0) { + lang = identifyLanguage(parse); + } else if (detect < identify) { + lang = detectLanguage(parse, doc); + if (lang == null) { + lang = identifyLanguage(parse); + } + } else if (identify < detect) { + lang = identifyLanguage(parse); + if (lang == null) { + lang = detectLanguage(parse, doc); + } + } else { + LOG.warn("No configuration for language extraction policy is provided"); + return parseResult; + } - if (lang != null) { - parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); - return parseResult; - } + if (lang != null) { + parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang); + return parseResult; + } - return parseResult; + return parseResult; + } + + /** Try to find the document's language from page headers and metadata */ + private String detectLanguage(Parse page, DocumentFragment doc) { + String lang = getLanguageFromMetadata(page.getData().getParseMeta()); + if (lang == null) { + LanguageParser parser = new LanguageParser(doc); + lang = parser.getLanguage(); } - /** Try to find the document's language from page headers and metadata */ - private String detectLanguage(Parse page, DocumentFragment doc) { - String lang = getLanguageFromMetadata(page.getData().getParseMeta()); - if (lang == null) { - LanguageParser parser = new LanguageParser(doc); - lang = parser.getLanguage(); - } + if (lang != null) { + return lang; + } - if (lang != null) { - return lang; - } + lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); - lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); + return lang; + } - return lang; + /** Use statistical language identification to extract page language */ + private String identifyLanguage(Parse parse) { + StringBuilder text = new StringBuilder(); + if (parse == null) + return null; + + String title = parse.getData().getTitle(); + if (title != null) { + text.append(title.toString()); } - /** Use statistical language identification to extract page language */ - private String identifyLanguage(Parse parse) { - StringBuilder text = new StringBuilder(); - if (parse == null) - return null; + String content = parse.getText(); + if (content != null) { + text.append(" ").append(content.toString()); + } - String title = parse.getData().getTitle(); - if (title != null) { - text.append(title.toString()); - } + // trim content? + String titleandcontent = text.toString(); - String content = parse.getText(); - if (content != null) { - text.append(" ").append(content.toString()); - } + if (this.contentMaxlength != -1 + && titleandcontent.length() > this.contentMaxlength) + titleandcontent = titleandcontent.substring(0, contentMaxlength); - // trim content? - String titleandcontent = text.toString(); + LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent); - if (this.contentMaxlength != -1 - && titleandcontent.length() > this.contentMaxlength) - titleandcontent = titleandcontent.substring(0, contentMaxlength); - - LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent); - - if (onlyCertain) { - if (identifier.isReasonablyCertain()) - return identifier.getLanguage(); - else - return null; - } + if (onlyCertain) { + if (identifier.isReasonablyCertain()) return identifier.getLanguage(); + else + return null; } + return identifier.getLanguage(); + } - // Check in the metadata whether the language has already been stored there - // by Tika - private static String getLanguageFromMetadata(Metadata meta) { - if (meta == null) - return null; - // dublin core - String lang = meta.get("dc.language"); - if (lang != null) - return lang; - // meta content-language - lang = meta.get("content-language"); - if (lang != null) - return lang; - // lang attribute - return meta.get("lang"); - } - - static class LanguageParser { - - private String dublinCore = null; - private String htmlAttribute = null; - private String httpEquiv = null; - private String language = null; - - LanguageParser(Node node) { - parse(node); - if (htmlAttribute != null) { - language = htmlAttribute; - } else if (dublinCore != null) { - language = dublinCore; - } else { - language = httpEquiv; - } - } - - String getLanguage() { - return language; - } - - void parse(Node node) { - - NodeWalker walker = new NodeWalker(node); - while (walker.hasNext()) { + // Check in the metadata whether the language has already been stored there + // by Tika + private static String getLanguageFromMetadata(Metadata meta) { + if (meta == null) + return null; + // dublin core + String lang = meta.get("dc.language"); + if (lang != null) + return lang; + // meta content-language + lang = meta.get("content-language"); + if (lang != null) + return lang; + // lang attribute + return meta.get("lang"); + } + + static class LanguageParser { + + private String dublinCore = null; + private String htmlAttribute = null; + private String httpEquiv = null; + private String language = null; + + LanguageParser(Node node) { + parse(node); + if (htmlAttribute != null) { + language = htmlAttribute; + } else if (dublinCore != null) { + language = dublinCore; + } else { + language = httpEquiv; + } + } - Node currentNode = walker.nextNode(); - String nodeName = currentNode.getNodeName(); - short nodeType = currentNode.getNodeType(); + String getLanguage() { + return language; + } - if (nodeType == Node.ELEMENT_NODE) { + void parse(Node node) { - // Check for the lang HTML attribute - if (htmlAttribute == null) { - htmlAttribute = parseLanguage(((Element) currentNode) - .getAttribute("lang")); - } + NodeWalker walker = new NodeWalker(node); + while (walker.hasNext()) { - // Check for Meta - if ("meta".equalsIgnoreCase(nodeName)) { - NamedNodeMap attrs = currentNode.getAttributes(); - - // Check for the dc.language Meta - if (dublinCore == null) { - for (int i = 0; i < attrs.getLength(); i++) { - Node attrnode = attrs.item(i); - if ("name".equalsIgnoreCase(attrnode - .getNodeName())) { - if ("dc.language".equalsIgnoreCase(attrnode - .getNodeValue())) { - Node valueattr = attrs - .getNamedItem("content"); - if (valueattr != null) { - dublinCore = parseLanguage(valueattr - .getNodeValue()); - } - } - } - } - } - - // Check for the http-equiv content-language - if (httpEquiv == null) { - for (int i = 0; i < attrs.getLength(); i++) { - Node attrnode = attrs.item(i); - if ("http-equiv".equalsIgnoreCase(attrnode - .getNodeName())) { - if ("content-language".equals(attrnode - .getNodeValue().toLowerCase())) { - Node valueattr = attrs - .getNamedItem("content"); - if (valueattr != null) { - httpEquiv = parseLanguage(valueattr - .getNodeValue()); - } - } - } - } - } + Node currentNode = walker.nextNode(); + String nodeName = currentNode.getNodeName(); + short nodeType = currentNode.getNodeType(); + + if (nodeType == Node.ELEMENT_NODE) { + + // Check for the lang HTML attribute + if (htmlAttribute == null) { + htmlAttribute = parseLanguage(((Element) currentNode) + .getAttribute("lang")); + } + + // Check for Meta + if ("meta".equalsIgnoreCase(nodeName)) { + NamedNodeMap attrs = currentNode.getAttributes(); + + // Check for the dc.language Meta + if (dublinCore == null) { + for (int i = 0; i < attrs.getLength(); i++) { + Node attrnode = attrs.item(i); + if ("name".equalsIgnoreCase(attrnode.getNodeName())) { + if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) { + Node valueattr = attrs.getNamedItem("content"); + if (valueattr != null) { + dublinCore = parseLanguage(valueattr.getNodeValue()); } + } } + } + } - if ((dublinCore != null) && (htmlAttribute != null) - && (httpEquiv != null)) { - return; + // Check for the http-equiv content-language + if (httpEquiv == null) { + for (int i = 0; i < attrs.getLength(); i++) { + Node attrnode = attrs.item(i); + if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) { + if ("content-language".equals(attrnode.getNodeValue() + .toLowerCase())) { + Node valueattr = attrs.getNamedItem("content"); + if (valueattr != null) { + httpEquiv = parseLanguage(valueattr.getNodeValue()); + } + } } + } } + } } - /** - * Parse a language string and return an ISO 639 primary code, or - * <code>null</code> if something wrong occurs, or if no language is - * found. - */ - final static String parseLanguage(String lang) { - - if (lang == null) { - return null; - } - - String code = null; - String language = null; + if ((dublinCore != null) && (htmlAttribute != null) + && (httpEquiv != null)) { + return; + } + } + } - // First, split multi-valued values - String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1); + /** + * Parse a language string and return an ISO 639 primary code, or + * <code>null</code> if something wrong occurs, or if no language is found. + */ + final static String parseLanguage(String lang) { - int i = 0; - while ((language == null) && (i < langs.length)) { - // Then, get the primary code - code = langs[i].split("-")[0]; - code = code.split("_")[0]; - // Find the ISO 639 code - language = (String) LANGUAGES_MAP.get(code.toLowerCase()); - i++; - } - - return language; - } + if (lang == null) { + return null; + } + + String code = null; + String language = null; + + // First, split multi-valued values + String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1); + + int i = 0; + while ((language == null) && (i < langs.length)) { + // Then, get the primary code + code = langs[i].split("-")[0]; + code = code.split("_")[0]; + // Find the ISO 639 code + language = (String) LANGUAGES_MAP.get(code.toLowerCase()); + i++; + } + return language; } - public void setConf(Configuration conf) { - this.conf = conf; - contentMaxlength = conf.getInt("lang.analyze.max.length", -1); - onlyCertain = conf.getBoolean("lang.identification.only.certain", false); - String[] policy = conf.getStrings("lang.extraction.policy"); - for (int i = 0; i < policy.length; i++) { - if (policy[i].equals("detect")) { - detect = i; - } else if (policy[i].equals("identify")) { - identify = i; - } - } - } + } - public Configuration getConf() { - return this.conf; + public void setConf(Configuration conf) { + this.conf = conf; + contentMaxlength = conf.getInt("lang.analyze.max.length", -1); + onlyCertain = conf.getBoolean("lang.identification.only.certain", false); + String[] policy = conf.getStrings("lang.extraction.policy"); + for (int i = 0; i < policy.length; i++) { + if (policy[i].equals("detect")) { + detect = i; + } else if (policy[i].equals("identify")) { + identify = i; + } } + } + + public Configuration getConf() { + return this.conf; + } } \ No newline at end of file Modified: nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original) +++ nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Thu Jan 29 05:38:59 2015 @@ -16,7 +16,6 @@ */ package org.apache.nutch.analysis.lang; - // Nutch imports import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; @@ -31,29 +30,27 @@ import org.apache.nutch.net.protocols.Re // Hadoop imports import org.apache.hadoop.conf.Configuration; - /** - * An {@link org.apache.nutch.indexer.IndexingFilter} that - * add a <code>lang</code> (language) field to the document. - * + * An {@link org.apache.nutch.indexer.IndexingFilter} that add a + * <code>lang</code> (language) field to the document. + * * It tries to find the language of the document by: * <ul> - * <li>First, checking if {@link HTMLLanguageParser} add some language - * information</li> - * <li>Then, checking if a <code>Content-Language</code> HTTP header can be - * found</li> - * <li>Finaly by analyzing the document content</li> + * <li>First, checking if {@link HTMLLanguageParser} add some language + * information</li> + * <li>Then, checking if a <code>Content-Language</code> HTTP header can be + * found</li> + * <li>Finaly by analyzing the document content</li> * </ul> - * + * * @author Sami Siren * @author Jerome Charron */ public class LanguageIndexingFilter implements IndexingFilter { - private Configuration conf; -/** + /** * Constructs a new Language Indexing Filter. */ public LanguageIndexingFilter() { @@ -61,15 +58,15 @@ public class LanguageIndexingFilter impl } // Inherited JavaDoc - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) - throws IndexingException { + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { // check if LANGUAGE found, possibly put there by HTMLLanguageParser String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE); // check if HTTP-header tels us the language if (lang == null) { - lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); + lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE); } if (lang == null || lang.length() == 0) { Modified: nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original) +++ nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Thu Jan 29 05:38:59 2015 @@ -19,8 +19,6 @@ package org.apache.nutch.analysis.lang; import java.io.BufferedReader; import java.io.InputStreamReader; - - // Nutch imports import org.apache.nutch.metadata.Metadata; import org.apache.nutch.parse.Parse; @@ -31,7 +29,6 @@ import org.apache.tika.language.Language import org.junit.Assert; import org.junit.Test; - public class TestHTMLLanguageParser { private static String URL = "http://foo.bar/"; @@ -47,7 +44,7 @@ public class TestHTMLLanguageParser { String metalanguages[] = { "fi", "en", "en" }; /** - * Test parsing of language identifiers from html + * Test parsing of language identifiers from html **/ @Test public void testMetaHTMLParsing() { @@ -58,7 +55,8 @@ public class TestHTMLLanguageParser { for (int t = 0; t < docs.length; t++) { Content content = getContent(docs[t]); Parse parse = parser.parse(content).get(content.getUrl()); - Assert.assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE)); + Assert.assertEquals(metalanguages[t], (String) parse.getData() + .getParseMeta().get(Metadata.LANGUAGE)); } } catch (Exception e) { e.printStackTrace(System.out); @@ -70,65 +68,38 @@ public class TestHTMLLanguageParser { /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */ @Test public void testParseLanguage() { - String tests[][] = { - { "(SCHEME=ISO.639-1) sv", "sv" }, - { "(SCHEME=RFC1766) sv-FI", "sv" }, - { "(SCHEME=Z39.53) SWE", "sv" }, - { "EN_US, SV, EN, EN_UK", "en" }, - { "English Swedish", "en" }, - { "English, swedish", "en" }, - { "English,Swedish", "en" }, - { "Other (Svenska)", "sv" }, - { "SE", "se" }, - { "SV", "sv" }, - { "SV charset=iso-8859-1", "sv" }, - { "SV-FI", "sv" }, - { "SV; charset=iso-8859-1", "sv" }, - { "SVE", "sv" }, - { "SW", "sw" }, - { "SWE", "sv" }, - { "SWEDISH", "sv" }, - { "Sv", "sv" }, - { "Sve", "sv" }, - { "Svenska", "sv" }, - { "Swedish", "sv" }, - { "Swedish, svenska", "sv" }, - { "en, sv", "en" }, - { "sv", "sv" }, - { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, - { "sv,en", "sv" }, - { "sv-FI", "sv" }, - { "sv-SE", "sv" }, - { "sv-en", "sv" }, - { "sv-fi", "sv" }, - { "sv-se", "sv" }, - { "sv; Content-Language: sv", "sv" }, - { "sv_SE", "sv" }, - { "sve", "sv" }, - { "svenska, swedish, engelska, english", "sv" }, - { "sw", "sw" }, - { "swe", "sv" }, - { "swe.SPR.", "sv" }, - { "sweden", "sv" }, - { "swedish", "sv" }, - { "swedish,", "sv" }, - { "text/html; charset=sv-SE", "sv" }, - { "text/html; sv", "sv" }, - { "torp, stuga, uthyres, bed & breakfast", null } - }; - - for (int i=0; i<44; i++) { - Assert.assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0])); + String tests[][] = { { "(SCHEME=ISO.639-1) sv", "sv" }, + { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" }, + { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" }, + { "English, swedish", "en" }, { "English,Swedish", "en" }, + { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" }, + { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" }, + { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" }, + { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" }, + { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" }, + { "en, sv", "en" }, { "sv", "sv" }, + { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" }, + { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" }, + { "sv-fi", "sv" }, { "sv-se", "sv" }, + { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" }, + { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" }, + { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" }, + { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" }, + { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" }, + { "torp, stuga, uthyres, bed & breakfast", null } }; + + for (int i = 0; i < 44; i++) { + Assert.assertEquals(tests[i][1], + HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0])); } } - - + private Content getContent(String text) { Metadata meta = new Metadata(); meta.add("Content-Type", "text/html"); - return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create()); + return new Content(URL, BASE, text.getBytes(), "text/html", meta, + NutchConfiguration.create()); } - @Test public void testLanguageIndentifier() { Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java Thu Jan 29 05:38:59 2015 @@ -18,7 +18,7 @@ package org.apache.nutch.protocol.http.api; public class BlockedException extends HttpException { - + public BlockedException(String msg) { super(msg); }
