p...

lewismc Wed, 28 Jan 2015 21:40:10 -0800

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Thu Jan 29 05:38:59 2015
@@ -55,25 +55,28 @@ import org.apache.commons.lang.time.Date
  * Add (or reset) a few metaData properties as respective fields (if they are
  * available), so that they can be accurately used within the search index.
  * 
- * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length from the HTTP
- * header, 'type' field is indexed to support query by type and finally the 'title' field is an attempt 
- * to reset the title if a content-disposition hint exists. The logic is that such a presence is indicative 
- * that the content provider wants the filename therein to be used as the title.
- *
+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains
+ * content length from the HTTP header, 'type' field is indexed to support query
+ * by type and finally the 'title' field is an attempt to reset the title if a
+ * content-disposition hint exists. The logic is that such a presence is
+ * indicative that the content provider wants the filename therein to be used as
+ * the title.
+ * 
  * Still need to make content-length searchable!
- *
+ * 
  * @author John Xing
  */
 
 public class MoreIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(MoreIndexingFilter.class);
 
   /** Get the MimeTypes resolver instance. */
   private MimeUtil MIME;
   private Tika tika = new Tika();
 
   /** Map for mime-type substitution */
-  private HashMap<String,String> mimeMap = null;
+  private HashMap<String, String> mimeMap = null;
   private boolean mapMimes = false;
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
@@ -89,23 +92,24 @@ public class MoreIndexingFilter implemen
     return doc;
   }
 
-  // Add time related meta info.  Add last-modified if present.  Index date as
+  // Add time related meta info. Add last-modified if present. Index date as
   // last-modified, or, if that's not present, use fetch time.
-  private NutchDocument addTime(NutchDocument doc, ParseData data,
-                           String url, CrawlDatum datum) {
+  private NutchDocument addTime(NutchDocument doc, ParseData data, String url,
+      CrawlDatum datum) {
     long time = -1;
 
     String lastModified = data.getMeta(Metadata.LAST_MODIFIED);
-    if (lastModified != null) {                   // try parse last-modified
-      time = getTime(lastModified,url);           // use as time
-                                                  // store as string
+    if (lastModified != null) { // try parse last-modified
+      time = getTime(lastModified, url); // use as time
+                                         // store as string
       doc.add("lastModified", new Date(time));
     }
 
-    if (time == -1) {                             // if no last-modified specified in HTTP header
-      time = datum.getModifiedTime();             // use value in CrawlDatum
-      if (time <= 0) {                            // if also unset
-        time = datum.getFetchTime();              // use time the fetch took place (fetchTime of fetchDatum)
+    if (time == -1) { // if no last-modified specified in HTTP header
+      time = datum.getModifiedTime(); // use value in CrawlDatum
+      if (time <= 0) { // if also unset
+        time = datum.getFetchTime(); // use time the fetch took place (fetchTime
+                                     // of fetchDatum)
       }
     }
 
@@ -119,43 +123,29 @@ public class MoreIndexingFilter implemen
     try {
       time = HttpDateFormat.toLong(date);
     } catch (ParseException e) {
-  // try to parse it as date in alternative format
-  try {
-      Date parsedDate = DateUtils.parseDate(date,
-      new String [] {
-          "EEE MMM dd HH:mm:ss yyyy",
-          "EEE MMM dd HH:mm:ss yyyy zzz",
-          "EEE MMM dd HH:mm:ss zzz yyyy",
-          "EEE, MMM dd HH:mm:ss yyyy zzz",
-          "EEE, dd MMM yyyy HH:mm:ss zzz",
-          "EEE,dd MMM yyyy HH:mm:ss zzz",
-          "EEE, dd MMM yyyy HH:mm:sszzz",
-          "EEE, dd MMM yyyy HH:mm:ss",
-          "EEE, dd-MMM-yy HH:mm:ss zzz",
-          "yyyy/MM/dd HH:mm:ss.SSS zzz",
-          "yyyy/MM/dd HH:mm:ss.SSS",
-          "yyyy/MM/dd HH:mm:ss zzz",
-          "yyyy/MM/dd",
-          "yyyy.MM.dd HH:mm:ss",
-          "yyyy-MM-dd HH:mm",
-          "MMM dd yyyy HH:mm:ss. zzz",
-          "MMM dd yyyy HH:mm:ss zzz",
-          "dd.MM.yyyy HH:mm:ss zzz",
-          "dd MM yyyy HH:mm:ss zzz",
-          "dd.MM.yyyy; HH:mm:ss",
-          "dd.MM.yyyy HH:mm:ss",
-          "dd.MM.yyyy zzz",
-          "yyyy-MM-dd'T'HH:mm:ss'Z'"
-      });
-      time = parsedDate.getTime();
-            // if (LOG.isWarnEnabled()) {
-      //   LOG.warn(url + ": parsed date: " + date +" to:"+time);
-            // }
-  } catch (Exception e2) {
-            if (LOG.isWarnEnabled()) {
-        LOG.warn(url + ": can't parse erroneous date: " + date);
-            }
-  }
+      // try to parse it as date in alternative format
+      try {
+        Date parsedDate = DateUtils.parseDate(date, new String[] {
+            "EEE MMM dd HH:mm:ss yyyy", "EEE MMM dd HH:mm:ss yyyy zzz",
+            "EEE MMM dd HH:mm:ss zzz yyyy", "EEE, MMM dd HH:mm:ss yyyy zzz",
+            "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE,dd MMM yyyy HH:mm:ss zzz",
+            "EEE, dd MMM yyyy HH:mm:sszzz", "EEE, dd MMM yyyy HH:mm:ss",
+            "EEE, dd-MMM-yy HH:mm:ss zzz", "yyyy/MM/dd HH:mm:ss.SSS zzz",
+            "yyyy/MM/dd HH:mm:ss.SSS", "yyyy/MM/dd HH:mm:ss zzz", "yyyy/MM/dd",
+            "yyyy.MM.dd HH:mm:ss", "yyyy-MM-dd HH:mm",
+            "MMM dd yyyy HH:mm:ss. zzz", "MMM dd yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy HH:mm:ss zzz", "dd MM yyyy HH:mm:ss zzz",
+            "dd.MM.yyyy; HH:mm:ss", "dd.MM.yyyy HH:mm:ss", "dd.MM.yyyy zzz",
+            "yyyy-MM-dd'T'HH:mm:ss'Z'" });
+        time = parsedDate.getTime();
+        // if (LOG.isWarnEnabled()) {
+        // LOG.warn(url + ": parsed date: " + date +" to:"+time);
+        // }
+      } catch (Exception e2) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn(url + ": can't parse erroneous date: " + date);
+        }
+      }
     }
     return time;
   }
@@ -187,7 +177,7 @@ public class MoreIndexingFilter implemen
    * all case insensitive. The query filter is implemented in
    * {@link TypeQueryFilter}.
    * </p>
-   *
+   * 
    * @param doc
    * @param data
    * @param url
@@ -212,9 +202,9 @@ public class MoreIndexingFilter implemen
       // (using MimeTypes.getMimeType(byte[], String), but I don't know
       // which field it is?
       // if (MAGIC) {
-      //   contentType = MIME.getMimeType(url, content);
+      // contentType = MIME.getMimeType(url, content);
       // } else {
-      //   contentType = MIME.getMimeType(url);
+      // contentType = MIME.getMimeType(url);
       // }
 
       mimeType = tika.detect(url);
@@ -243,20 +233,20 @@ public class MoreIndexingFilter implemen
     if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
       String[] parts = getParts(contentType);
 
-      for(String part: parts) {
+      for (String part : parts) {
         doc.add("type", part);
       }
     }
 
     // leave this for future improvement
-    //MimeTypeParameterList parameterList = mimeType.getParameters()
+    // MimeTypeParameterList parameterList = mimeType.getParameters()
 
     return doc;
   }
 
-
   /**
    * Utility method for splitting mime type into type and subtype.
+   * 
    * @param mimeType
    * @return
    */
@@ -273,15 +263,13 @@ public class MoreIndexingFilter implemen
   // Content-Disposition: inline; filename="foo.ppt"
   private Configuration conf;
 
-  static Pattern patterns[] = {null, null};
+  static Pattern patterns[] = { null, null };
 
   static {
     try {
       // order here is important
-      patterns[0] =
-        Pattern.compile("\\bfilename=['\"](.+)['\"]");
-      patterns[1] =
-        Pattern.compile("\\bfilename=(\\S+)\\b");
+      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
     } catch (PatternSyntaxException e) {
       // just ignore
     }
@@ -292,7 +280,7 @@ public class MoreIndexingFilter implemen
     if (contentDisposition == null || doc.getFieldValue("title") != null)
       return doc;
 
-    for (int i=0; i<patterns.length; i++) {
+    for (int i = 0; i < patterns.length; i++) {
       Matcher matcher = patterns[i].matcher(contentDisposition);
       if (matcher.find()) {
         doc.add("title", matcher.group(1));
@@ -324,11 +312,12 @@ public class MoreIndexingFilter implemen
   }
 
   private void readConfiguration() throws IOException {
-    BufferedReader reader = new BufferedReader(conf.getConfResourceAsReader("contenttype-mapping.txt"));
+    BufferedReader reader = new BufferedReader(
+        conf.getConfResourceAsReader("contenttype-mapping.txt"));
     String line;
     String parts[];
 
-    mimeMap = new HashMap<String,String>();
+    mimeMap = new HashMap<String, String>();
 
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {


Modified: 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
 Thu Jan 29 05:38:59 2015
@@ -40,7 +40,7 @@ public class TestMoreIndexingFilter {
     assertContentType(conf, "text/html", "text/html");
     assertContentType(conf, "text/html; charset=UTF-8", "text/html");
   }
-  
+
   @Test
   public void testGetParts() {
     String[] parts = MoreIndexingFilter.getParts("text/html");
@@ -51,7 +51,7 @@ public class TestMoreIndexingFilter {
    * @since NUTCH-901
    */
   @Test
-  public void testNoParts(){
+  public void testNoParts() {
     Configuration conf = NutchConfiguration.create();
     conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
     MoreIndexingFilter filter = new MoreIndexingFilter();
@@ -59,18 +59,18 @@ public class TestMoreIndexingFilter {
     Assert.assertNotNull(filter);
     NutchDocument doc = new NutchDocument();
     ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-    
-    try{
-        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
-    }
-    catch(Exception e){
-        e.printStackTrace();
-        Assert.fail(e.getMessage());
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), new Inlinks());
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
     }
     Assert.assertNotNull(doc);
     Assert.assertTrue(doc.getFieldNames().contains("type"));
     Assert.assertEquals(1, doc.getField("type").getValues().size());
-    Assert.assertEquals("text/html", doc.getFieldValue("type"));    
+    Assert.assertEquals("text/html", doc.getFieldValue("type"));
   }
 
   @Test
@@ -89,8 +89,9 @@ public class TestMoreIndexingFilter {
     NutchDocument doc = new NutchDocument();
     doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
 
-    Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
-    
+    Assert.assertEquals("content-disposition not detected", "filename.ext",
+        doc.getFieldValue("title"));
+
     /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
     doc = new NutchDocument();
     doc.add("title", "title");
@@ -105,15 +106,18 @@ public class TestMoreIndexingFilter {
       Assert.assertEquals(expected[i], parts[i]);
     }
   }
-  
-  private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
+
+  private void assertContentType(Configuration conf, String source,
+      String expected) throws IndexingException {
     Metadata metadata = new Metadata();
     metadata.add(Response.CONTENT_TYPE, source);
     MoreIndexingFilter filter = new MoreIndexingFilter();
     filter.setConf(conf);
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(
-        new ParseStatus(), "title", new Outlink[0], metadata)), new Text(
-        "http://www.example.com/"), new CrawlDatum(), new Inlinks());
-    Assert.assertEquals("mime type not detected", expected, doc.getFieldValue("type"));
+    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl(
+        "text", new ParseData(new ParseStatus(), "title", new Outlink[0],
+            metadata)), new Text("http://www.example.com/"), new CrawlDatum(),
+        new Inlinks());
+    Assert.assertEquals("mime type not detected", expected,
+        doc.getFieldValue("type"));
   }
 }

Modified: 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
 (original)
+++ 
nutch/trunk/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
 Thu Jan 29 05:38:59 2015
@@ -45,11 +45,16 @@ public class StaticFieldIndexer implemen
    * The {@link StaticFieldIndexer} filter object which adds fields as per
    * configuration setting. See {@code index.static} in nutch-default.xml.
    * 
-   * @param doc The {@link NutchDocument} object
-   * @param parse  The relevant {@link Parse} object passing through the filter
-   * @param url URL to be filtered for anchor text
-   * @param datum The {@link CrawlDatum} entry
-   * @param inlinks The {@link Inlinks} containing anchor text
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
    * @return filtered NutchDocument
    */
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
@@ -66,17 +71,19 @@ public class StaticFieldIndexer implemen
   }
 
   /**
-   * Populate a HashMap from a list of fieldname:fieldcontent.
-   * See {@index.static} in nutch-default.xml.
+   * Populate a HashMap from a list of fieldname:fieldcontent. See
+   * {@index.static} in nutch-default.xml.
    * 
-   * @param fieldsString string containing field:value pairs
+   * @param fieldsString
+   *          string containing field:value pairs
    * @return HashMap of fields and their corresponding values
    */
   private HashMap<String, String[]> parseFields(String fieldsString) {
     HashMap<String, String[]> fields = new HashMap<String, String[]>();
 
-    /* The format is very easy, it's a comma-separated list of fields in the
-       form <name>:<value>
+    /*
+     * The format is very easy, it's a comma-separated list of fields in the
+     * form <name>:<value>
      */
     for (String field : fieldsString.split(",")) {
       String[] entry = field.split(":");

Modified: 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
 (original)
+++ 
nutch/trunk/src/plugin/index-static/src/test/org/apache/nutch/indexer/staticfield/TestStaticFieldIndexerTest.java
 Thu Jan 29 05:38:59 2015
@@ -28,11 +28,10 @@ import org.junit.Before;
 import org.junit.Test;
 
 /**
- * JUnit test case which tests 
- * 1. that static data fields are added to a document
- * 2. that empty {@code index.static} does not add anything to the document
- * 3. that valid field:value pairs are added to the document
- * 4. that fields and values added to the document are trimmed 
+ * JUnit test case which tests 1. that static data fields are added to a
+ * document 2. that empty {@code index.static} does not add anything to the
+ * document 3. that valid field:value pairs are added to the document 4. that
+ * fields and values added to the document are trimmed
  * 
  * @author tejasp
  */
@@ -59,7 +58,8 @@ public class TestStaticFieldIndexerTest
 
   /**
    * Test that empty {@code index.static} does not add anything to the document
-   * @throws Exception 
+   * 
+   * @throws Exception
    */
   @Test
   public void testEmptyIndexStatic() throws Exception {
@@ -77,12 +77,14 @@ public class TestStaticFieldIndexerTest
     }
 
     Assert.assertNotNull(doc);
-    Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
+    Assert.assertTrue("tests if no field is set for empty index.static", doc
+        .getFieldNames().isEmpty());
   }
 
   /**
    * Test that valid field:value pairs are added to the document
-   * @throws Exception 
+   * 
+   * @throws Exception
    */
   @Test
   public void testNormalScenario() throws Exception {
@@ -102,13 +104,15 @@ public class TestStaticFieldIndexerTest
     }
 
     Assert.assertNotNull(doc);
-    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
-    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
-    Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues()
-        .contains("val1"));
-    Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues()
-        .contains("val2"));
-    Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues()
-        .contains("val4"));
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+        .isEmpty());
+    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames()
+        .size());
+    Assert.assertTrue("test if doc has field1", doc.getField("field1")
+        .getValues().contains("val1"));
+    Assert.assertTrue("test if doc has field2", doc.getField("field2")
+        .getValues().contains("val2"));
+    Assert.assertTrue("test if doc has field4", doc.getField("field4")
+        .getValues().contains("val4"));
   }
 }

Modified: 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
 Thu Jan 29 05:38:59 2015
@@ -35,12 +35,13 @@ import org.slf4j.LoggerFactory;
  * and add.
  */
 public class DummyIndexWriter implements IndexWriter {
-  public static final Logger LOG = LoggerFactory.getLogger(DummyIndexWriter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(DummyIndexWriter.class);
   private Configuration config;
   private Writer writer;
   private boolean delete = false;
 
-  public void open(JobConf job, String name) throws IOException {  
+  public void open(JobConf job, String name) throws IOException {
     delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
   }
 
@@ -82,19 +83,21 @@ public class DummyIndexWriter implements
     String path = conf.get("dummy.path");
     if (path == null) {
       String message = "Missing path. Should be set via -Ddummy.path";
-      message+="\n"+describe();
+      message += "\n" + describe();
       LOG.error(message);
       throw new RuntimeException(message);
     }
 
     try {
       writer = new BufferedWriter(new FileWriter(conf.get("dummy.path")));
-    } catch (IOException e) {}
+    } catch (IOException e) {
+    }
   }
 
-  public String describe(){
+  public String describe() {
     StringBuffer sb = new StringBuffer("DummyIndexWriter\n");
-    sb.append("\t").append("dummy.path : Path of the file to write to (mandatory)\n");
+    sb.append("\t").append(
+        "dummy.path : Path of the file to write to (mandatory)\n");
     return sb.toString();
   }
 }

Modified: 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * text file, action is one of "add", "update", or "delete".
  */
 package org.apache.nutch.indexwriter.dummy;
+

Modified: 
nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/package-info.java
 Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Index writer plugin for <a href="http://www.elasticsearch.org/">Elasticsearch</a>.
  */
 package org.apache.nutch.indexwriter.elastic;
+

Modified: 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java
 Thu Jan 29 05:38:59 2015
@@ -30,10 +30,10 @@ public interface SolrConstants {
   public static final String USERNAME = SOLR_PREFIX + "auth.username";
 
   public static final String PASSWORD = SOLR_PREFIX + "auth.password";
-  
+
   @Deprecated
   public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index";
-  
+
   @Deprecated
   public static final String PARAMS = SOLR_PREFIX + "params";
 

Modified: 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
 (original)
+++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
 Thu Jan 29 05:38:59 2015
@@ -39,169 +39,175 @@ import org.slf4j.LoggerFactory;
 
 public class SolrIndexWriter implements IndexWriter {
 
-    public static final Logger LOG = LoggerFactory
-            .getLogger(SolrIndexWriter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SolrIndexWriter.class);
 
-    private SolrServer solr;
-    private SolrMappingReader solrMapping;
-    private ModifiableSolrParams params;
-
-    private Configuration config;
-
-    private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();
-
-    private int batchSize;
-    private int numDeletes = 0;
-    private boolean delete = false;
-
-    public void open(JobConf job, String name) throws IOException {
-        SolrServer server = SolrUtils.getCommonsHttpSolrServer(job);
-        init(server, job);
-    }
-
-    // package protected for tests
-    void init(SolrServer server, JobConf job) throws IOException {
-        solr = server;
-        batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
-        solrMapping = SolrMappingReader.getInstance(job);
-        delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
-        // parse optional params
-        params = new ModifiableSolrParams();
-        String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS);
-        if (paramString != null) {
-            String[] values = paramString.split("&");
-            for (String v : values) {
-                String[] kv = v.split("=");
-                if (kv.length < 2) {
-                    continue;
-                }
-                params.add(kv[0], kv[1]);
-            }
-        }
-    }
-
-    public void delete(String key) throws IOException {
-        if (delete) {
-            try {
-                solr.deleteById(key);
-                numDeletes++;
-            } catch (final SolrServerException e) {
-                throw makeIOException(e);
-            }
-        }
-    }
-
-    @Override
-    public void update(NutchDocument doc) throws IOException {
-        write(doc);
-    }
-
-    public void write(NutchDocument doc) throws IOException {
-        final SolrInputDocument inputDoc = new SolrInputDocument();
-        for (final Entry<String, NutchField> e : doc) {
-            for (final Object val : e.getValue().getValues()) {
-                // normalise the string representation for a Date
-                Object val2 = val;
-
-                if (val instanceof Date) {
-                    val2 = DateUtil.getThreadLocalDateFormat().format(val);
-                }
-
-                if (e.getKey().equals("content") || e.getKey().equals("title")) {
-                    val2 = SolrUtils.stripNonCharCodepoints((String) val);
-                }
-
-                inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e
-                        .getValue().getWeight());
-                String sCopy = solrMapping.mapCopyKey(e.getKey());
-                if (sCopy != e.getKey()) {
-                    inputDoc.addField(sCopy, val);
-                }
-            }
-        }
-
-        inputDoc.setDocumentBoost(doc.getWeight());
-        inputDocs.add(inputDoc);
-        if (inputDocs.size() + numDeletes >= batchSize) {
-            try {
-                LOG.info("Indexing " + Integer.toString(inputDocs.size())
-                        + " documents");
-                LOG.info("Deleting " + Integer.toString(numDeletes)
-                        + " documents");
-                numDeletes = 0;
-                UpdateRequest req = new UpdateRequest();
-                req.add(inputDocs);
-                req.setParams(params);
-                req.process(solr);
-            } catch (final SolrServerException e) {
-                throw makeIOException(e);
-            }
-            inputDocs.clear();
-        }
-    }
+  private SolrServer solr;
+  private SolrMappingReader solrMapping;
+  private ModifiableSolrParams params;
+
+  private Configuration config;
+
+  private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>();
+
+  private int batchSize;
+  private int numDeletes = 0;
+  private boolean delete = false;
+
+  public void open(JobConf job, String name) throws IOException {
+    SolrServer server = SolrUtils.getCommonsHttpSolrServer(job);
+    init(server, job);
+  }
+
+  // package protected for tests
+  void init(SolrServer server, JobConf job) throws IOException {
+    solr = server;
+    batchSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
+    solrMapping = SolrMappingReader.getInstance(job);
+    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
+    // parse optional params
+    params = new ModifiableSolrParams();
+    String paramString = job.get(IndexerMapReduce.INDEXER_PARAMS);
+    if (paramString != null) {
+      String[] values = paramString.split("&");
+      for (String v : values) {
+        String[] kv = v.split("=");
+        if (kv.length < 2) {
+          continue;
+        }
+        params.add(kv[0], kv[1]);
+      }
+    }
+  }
+
+  public void delete(String key) throws IOException {
+    if (delete) {
+      try {
+        solr.deleteById(key);
+        numDeletes++;
+      } catch (final SolrServerException e) {
+        throw makeIOException(e);
+      }
+    }
+  }
+
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    write(doc);
+  }
+
+  public void write(NutchDocument doc) throws IOException {
+    final SolrInputDocument inputDoc = new SolrInputDocument();
+    for (final Entry<String, NutchField> e : doc) {
+      for (final Object val : e.getValue().getValues()) {
+        // normalise the string representation for a Date
+        Object val2 = val;
+
+        if (val instanceof Date) {
+          val2 = DateUtil.getThreadLocalDateFormat().format(val);
+        }
+
+        if (e.getKey().equals("content") || e.getKey().equals("title")) {
+          val2 = SolrUtils.stripNonCharCodepoints((String) val);
+        }
+
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue()
+            .getWeight());
+        String sCopy = solrMapping.mapCopyKey(e.getKey());
+        if (sCopy != e.getKey()) {
+          inputDoc.addField(sCopy, val);
+        }
+      }
+    }
+
+    inputDoc.setDocumentBoost(doc.getWeight());
+    inputDocs.add(inputDoc);
+    if (inputDocs.size() + numDeletes >= batchSize) {
+      try {
+        LOG.info("Indexing " + Integer.toString(inputDocs.size())
+            + " documents");
+        LOG.info("Deleting " + Integer.toString(numDeletes) + " documents");
+        numDeletes = 0;
+        UpdateRequest req = new UpdateRequest();
+        req.add(inputDocs);
+        req.setParams(params);
+        req.process(solr);
+      } catch (final SolrServerException e) {
+        throw makeIOException(e);
+      }
+      inputDocs.clear();
+    }
+  }
+
+  public void close() throws IOException {
+    try {
+      if (!inputDocs.isEmpty()) {
+        LOG.info("Indexing " + Integer.toString(inputDocs.size())
+            + " documents");
+        if (numDeletes > 0) {
+          LOG.info("Deleting " + Integer.toString(numDeletes) + " documents");
+        }
+        UpdateRequest req = new UpdateRequest();
+        req.add(inputDocs);
+        req.setParams(params);
+        req.process(solr);
+        inputDocs.clear();
+      }
+    } catch (final SolrServerException e) {
+      throw makeIOException(e);
+    }
+  }
+
+  @Override
+  public void commit() throws IOException {
+    try {
+      solr.commit();
+    } catch (SolrServerException e) {
+      throw makeIOException(e);
+    }
+  }
+
+  public static IOException makeIOException(SolrServerException e) {
+    final IOException ioe = new IOException();
+    ioe.initCause(e);
+    return ioe;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    String serverURL = conf.get(SolrConstants.SERVER_URL);
+    if (serverURL == null) {
+      String message = "Missing SOLR URL. Should be set via -D "
+          + SolrConstants.SERVER_URL;
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  public String describe() {
+    StringBuffer sb = new StringBuffer("SOLRIndexWriter\n");
+    sb.append("\t").append(SolrConstants.SERVER_URL)
+        .append(" : URL of the SOLR instance (mandatory)\n");
+    sb.append("\t").append(SolrConstants.COMMIT_SIZE)
+        .append(" : buffer size when sending to SOLR (default 1000)\n");
+    sb.append("\t")
+        .append(SolrConstants.MAPPING_FILE)
+        .append(
+            " : name of the mapping file for fields (default solrindex-mapping.xml)\n");
+    sb.append("\t").append(SolrConstants.USE_AUTH)
+        .append(" : use authentication (default false)\n");
+    sb.append("\t").append(SolrConstants.USERNAME)
+        .append(" : username for authentication\n");
+    sb.append("\t").append(SolrConstants.PASSWORD)
+        .append(" : password for authentication\n");
+    return sb.toString();
+  }
 
-    public void close() throws IOException {
-        try {
-            if (!inputDocs.isEmpty()) {
-                LOG.info("Indexing " + Integer.toString(inputDocs.size())
-                        + " documents");
-                if (numDeletes > 0) {
-                    LOG.info("Deleting " + Integer.toString(numDeletes)
-                            + " documents");
-                }
-                UpdateRequest req = new UpdateRequest();
-                req.add(inputDocs);
-                req.setParams(params);
-                req.process(solr);
-                inputDocs.clear();
-            }
-        } catch (final SolrServerException e) {
-            throw makeIOException(e);
-        }
-    }
-
-    @Override
-    public void commit() throws IOException {
-        try {
-            solr.commit();
-        } catch (SolrServerException e) {
-            throw makeIOException(e);
-        }
-    }
-
-    public static IOException makeIOException(SolrServerException e) {
-        final IOException ioe = new IOException();
-        ioe.initCause(e);
-        return ioe;
-    }
-
-    @Override
-    public Configuration getConf() {
-        return config;
-    }
-
-    @Override
-    public void setConf(Configuration conf) {
-        config = conf;
-        String serverURL = conf.get(SolrConstants.SERVER_URL);
-        if (serverURL == null) {
-            String message = "Missing SOLR URL. Should be set via -D "
-                    + SolrConstants.SERVER_URL;
-            message+="\n"+describe();
-            LOG.error(message);
-            throw new RuntimeException(message);
-        }
-    }
-
-    public String describe(){
-       StringBuffer sb = new StringBuffer("SOLRIndexWriter\n");
-       sb.append("\t").append(SolrConstants.SERVER_URL).append(" : URL of the SOLR instance (mandatory)\n");
-       sb.append("\t").append(SolrConstants.COMMIT_SIZE).append(" : buffer size when sending to SOLR (default 1000)\n");
-       sb.append("\t").append(SolrConstants.MAPPING_FILE).append(" : name of the mapping file for fields (default solrindex-mapping.xml)\n");
-       sb.append("\t").append(SolrConstants.USE_AUTH).append(" : use authentication (default false)\n");
-       sb.append("\t").append(SolrConstants.USERNAME).append(" : username for authentication\n");
-       sb.append("\t").append(SolrConstants.PASSWORD).append(" : password for authentication\n");
-       return sb.toString();
-    }
-    
 }

Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java (original)
+++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrMappingReader.java Thu Jan 29 05:38:59 2015
@@ -38,16 +38,17 @@ import org.xml.sax.SAXException;
 
 public class SolrMappingReader {
   public static Logger LOG = LoggerFactory.getLogger(SolrMappingReader.class);
-  
+
   private Configuration conf;
-  
+
   private Map<String, String> keyMap = new HashMap<String, String>();
   private Map<String, String> copyMap = new HashMap<String, String>();
   private String uniqueKey = "id";
-  
+
   public static synchronized SolrMappingReader getInstance(Configuration conf) {
     ObjectCache cache = ObjectCache.get(conf);
-    SolrMappingReader instance = (SolrMappingReader)cache.getObject(SolrMappingReader.class.getName());
+    SolrMappingReader instance = (SolrMappingReader) cache
+        .getObject(SolrMappingReader.class.getName());
     if (instance == null) {
       instance = new SolrMappingReader(conf);
       cache.setObject(SolrMappingReader.class.getName(), instance);
@@ -60,9 +61,10 @@ public class SolrMappingReader {
     parseMapping();
   }
 
-  private void parseMapping() {    
+  private void parseMapping() {
     InputStream ssInputStream = null;
-    ssInputStream = conf.getConfResourceAsInputStream(conf.get(SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
+    ssInputStream = conf.getConfResourceAsInputStream(conf.get(
+        SolrConstants.MAPPING_FILE, "solrindex-mapping.xml"));
 
     InputSource inputSource = new InputSource(ssInputStream);
     try {
@@ -74,48 +76,50 @@ public class SolrMappingReader {
       if (fieldList.getLength() > 0) {
         for (int i = 0; i < fieldList.getLength(); i++) {
           Element element = (Element) fieldList.item(i);
-          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
-          keyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          keyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
         }
       }
       NodeList copyFieldList = rootElement.getElementsByTagName("copyField");
       if (copyFieldList.getLength() > 0) {
         for (int i = 0; i < copyFieldList.getLength(); i++) {
           Element element = (Element) copyFieldList.item(i);
-          LOG.info("source: " + element.getAttribute("source") + " dest: " + element.getAttribute("dest"));
-          copyMap.put(element.getAttribute("source"), element.getAttribute("dest"));
+          LOG.info("source: " + element.getAttribute("source") + " dest: "
+              + element.getAttribute("dest"));
+          copyMap.put(element.getAttribute("source"),
+              element.getAttribute("dest"));
         }
       }
       NodeList uniqueKeyItem = rootElement.getElementsByTagName("uniqueKey");
       if (uniqueKeyItem.getLength() > 1) {
         LOG.warn("More than one unique key definitions found in solr index mapping, using default 'id'");
         uniqueKey = "id";
-      }
-      else if (uniqueKeyItem.getLength() == 0) {
+      } else if (uniqueKeyItem.getLength() == 0) {
         LOG.warn("No unique key definition found in solr index mapping using, default 'id'");
-      }
-      else{
-         uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
+      } else {
+        uniqueKey = uniqueKeyItem.item(0).getFirstChild().getNodeValue();
       }
     } catch (MalformedURLException e) {
-        LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (SAXException e) {
-        LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (IOException e) {
-       LOG.warn(e.toString());
+      LOG.warn(e.toString());
     } catch (ParserConfigurationException e) {
-       LOG.warn(e.toString());
-    } 
+      LOG.warn(e.toString());
+    }
   }
-         
+
   public Map<String, String> getKeyMap() {
     return keyMap;
   }
-         
+
   public Map<String, String> getCopyMap() {
     return copyMap;
   }
-         
+
   public String getUniqueKey() {
     return uniqueKey;
   }
@@ -128,14 +132,14 @@ public class SolrMappingReader {
   }
 
   public String mapKey(String key) throws IOException {
-    if(keyMap.containsKey(key)) {
+    if (keyMap.containsKey(key)) {
       key = (String) keyMap.get(key);
     }
     return key;
   }
 
   public String mapCopyKey(String key) throws IOException {
-    if(copyMap.containsKey(key)) {
+    if (copyMap.containsKey(key)) {
       key = (String) copyMap.get(key);
     }
     return key;

Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java (original)
+++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Thu Jan 29 05:38:59 2015
@@ -31,8 +31,9 @@ public class SolrUtils {
 
   public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
 
-  public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job) throws MalformedURLException {
-    HttpClient client=new HttpClient();
+  public static CommonsHttpSolrServer getCommonsHttpSolrServer(JobConf job)
+      throws MalformedURLException {
+    HttpClient client = new HttpClient();
 
     // Check for username/password
     if (job.getBoolean(SolrConstants.USE_AUTH, false)) {
@@ -40,9 +41,13 @@ public class SolrUtils {
 
       LOG.info("Authenticating as: " + username);
 
-      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
+          AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
 
-      client.getState().setCredentials(scope, new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD)));
+      client.getState().setCredentials(
+          scope,
+          new UsernamePasswordCredentials(username, job
+              .get(SolrConstants.PASSWORD)));
 
       HttpClientParams params = client.getParams();
       params.setAuthenticationPreemptive(true);
@@ -51,7 +56,7 @@ public class SolrUtils {
     }
 
     String serverURL = job.get(SolrConstants.SERVER_URL);
-    
+
     return new CommonsHttpSolrServer(serverURL, client);
   }
 
@@ -62,8 +67,10 @@ public class SolrUtils {
     for (int i = 0; i < input.length(); i++) {
       ch = input.charAt(i);
 
-      // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
-      // and non-printable control characters except tabulator, new line and carriage return
+      // Strip all non-characters
+      // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+      // and non-printable control characters except tabulator, new line and
+      // carriage return
       if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
           ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
           (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef

Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java (original)
+++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.
  */
 package org.apache.nutch.indexwriter.solr;
+

Modified: nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java (original)
+++ nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java Thu Jan 29 05:38:59 2015
@@ -41,289 +41,280 @@ import org.w3c.dom.Node;
 
 public class HTMLLanguageParser implements HtmlParseFilter {
 
-    public static final Logger LOG = LoggerFactory
-            .getLogger(HTMLLanguageParser.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(HTMLLanguageParser.class);
 
-    private int detect = -1, identify = -1;
+  private int detect = -1, identify = -1;
 
-    private int contentMaxlength = -1;
+  private int contentMaxlength = -1;
 
-    private boolean onlyCertain = false;
+  private boolean onlyCertain = false;
 
-    /* A static Map of ISO-639 language codes */
-    private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
-    static {
-        try {
-            Properties p = new Properties();
-            p.load(HTMLLanguageParser.class
-                    .getResourceAsStream("langmappings.properties"));
-            Enumeration<?> keys = p.keys();
-            while (keys.hasMoreElements()) {
-                String key = (String) keys.nextElement();
-                String[] values = p.getProperty(key).split(",", -1);
-                LANGUAGES_MAP.put(key, key);
-                for (int i = 0; i < values.length; i++) {
-                    LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
-                }
-            }
-        } catch (Exception e) {
-            if (LOG.isErrorEnabled()) {
-                LOG.error(e.toString());
-            }
-        }
+  /* A static Map of ISO-639 language codes */
+  private static Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();
+  static {
+    try {
+      Properties p = new Properties();
+      p.load(HTMLLanguageParser.class
+          .getResourceAsStream("langmappings.properties"));
+      Enumeration<?> keys = p.keys();
+      while (keys.hasMoreElements()) {
+        String key = (String) keys.nextElement();
+        String[] values = p.getProperty(key).split(",", -1);
+        LANGUAGES_MAP.put(key, key);
+        for (int i = 0; i < values.length; i++) {
+          LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key);
+        }
+      }
+    } catch (Exception e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.toString());
+      }
     }
+  }
 
-    private Configuration conf;
+  private Configuration conf;
 
-    /**
-     * Scan the HTML document looking at possible indications of content
-     * language<br>
-     * <li>1. html lang attribute
-     * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta
-     * dc.language
-     * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
-     * -html.shtml#language) <li>3. meta http-equiv (content-language)
-     * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
-     */
-    public ParseResult filter(Content content, ParseResult parseResult,
-            HTMLMetaTags metaTags, DocumentFragment doc) {
-        String lang = null;
-
-        Parse parse = parseResult.get(content.getUrl());
-
-        if (detect >= 0 && identify < 0) {
-            lang = detectLanguage(parse, doc);
-        } else if (detect < 0 && identify >= 0) {
-            lang = identifyLanguage(parse);
-        } else if (detect < identify) {
-            lang = detectLanguage(parse, doc);
-            if (lang == null) {
-                lang = identifyLanguage(parse);
-            }
-        } else if (identify < detect) {
-            lang = identifyLanguage(parse);
-            if (lang == null) {
-                lang = detectLanguage(parse, doc);
-            }
-        } else {
-            LOG.warn("No configuration for language extraction policy is provided");
-            return parseResult;
-        }
+  /**
+   * Scan the HTML document looking at possible indications of content language<br>
+   * <li>1. html lang attribute
+   * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta
+   * dc.language
+   * (http://dublincore.org/documents/2000/07/16/usageguide/qualified
+   * -html.shtml#language) <li>3. meta http-equiv (content-language)
+   * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br>
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    String lang = null;
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    if (detect >= 0 && identify < 0) {
+      lang = detectLanguage(parse, doc);
+    } else if (detect < 0 && identify >= 0) {
+      lang = identifyLanguage(parse);
+    } else if (detect < identify) {
+      lang = detectLanguage(parse, doc);
+      if (lang == null) {
+        lang = identifyLanguage(parse);
+      }
+    } else if (identify < detect) {
+      lang = identifyLanguage(parse);
+      if (lang == null) {
+        lang = detectLanguage(parse, doc);
+      }
+    } else {
+      LOG.warn("No configuration for language extraction policy is provided");
+      return parseResult;
+    }
 
-        if (lang != null) {
-            parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
-            return parseResult;
-        }
+    if (lang != null) {
+      parse.getData().getParseMeta().set(Metadata.LANGUAGE, lang);
+      return parseResult;
+    }
 
-        return parseResult;
+    return parseResult;
+  }
+
+  /** Try to find the document's language from page headers and metadata */
+  private String detectLanguage(Parse page, DocumentFragment doc) {
+    String lang = getLanguageFromMetadata(page.getData().getParseMeta());
+    if (lang == null) {
+      LanguageParser parser = new LanguageParser(doc);
+      lang = parser.getLanguage();
     }
 
-    /** Try to find the document's language from page headers and metadata */
-    private String detectLanguage(Parse page, DocumentFragment doc) {
-        String lang = getLanguageFromMetadata(page.getData().getParseMeta());
-        if (lang == null) {
-            LanguageParser parser = new LanguageParser(doc);
-            lang = parser.getLanguage();
-        }
+    if (lang != null) {
+      return lang;
+    }
 
-        if (lang != null) {
-            return lang;
-        }
+    lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
 
-        lang = page.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
+    return lang;
+  }
 
-        return lang;
+  /** Use statistical language identification to extract page language */
+  private String identifyLanguage(Parse parse) {
+    StringBuilder text = new StringBuilder();
+    if (parse == null)
+      return null;
+
+    String title = parse.getData().getTitle();
+    if (title != null) {
+      text.append(title.toString());
     }
 
-    /** Use statistical language identification to extract page language */
-    private String identifyLanguage(Parse parse) {
-        StringBuilder text = new StringBuilder();
-        if (parse == null)
-            return null;
+    String content = parse.getText();
+    if (content != null) {
+      text.append(" ").append(content.toString());
+    }
 
-        String title = parse.getData().getTitle();
-        if (title != null) {
-            text.append(title.toString());
-        }
+    // trim content?
+    String titleandcontent = text.toString();
 
-        String content = parse.getText();
-        if (content != null) {
-            text.append(" ").append(content.toString());
-        }
+    if (this.contentMaxlength != -1
+        && titleandcontent.length() > this.contentMaxlength)
+      titleandcontent = titleandcontent.substring(0, contentMaxlength);
 
-        // trim content?
-        String titleandcontent = text.toString();
+    LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);
 
-        if (this.contentMaxlength != -1
-                && titleandcontent.length() > this.contentMaxlength)
-            titleandcontent = titleandcontent.substring(0, contentMaxlength);
-
-        LanguageIdentifier identifier = new LanguageIdentifier(titleandcontent);
-
-        if (onlyCertain) {
-            if (identifier.isReasonablyCertain())
-                return identifier.getLanguage();
-            else
-                return null;
-        }
+    if (onlyCertain) {
+      if (identifier.isReasonablyCertain())
         return identifier.getLanguage();
+      else
+        return null;
     }
+    return identifier.getLanguage();
+  }
 
-    // Check in the metadata whether the language has already been stored there
-    // by Tika
-    private static String getLanguageFromMetadata(Metadata meta) {
-        if (meta == null)
-            return null;
-        // dublin core
-        String lang = meta.get("dc.language");
-        if (lang != null)
-            return lang;
-        // meta content-language
-        lang = meta.get("content-language");
-        if (lang != null)
-            return lang;
-        // lang attribute
-        return meta.get("lang");
-    }
-
-    static class LanguageParser {
-
-        private String dublinCore = null;
-        private String htmlAttribute = null;
-        private String httpEquiv = null;
-        private String language = null;
-
-        LanguageParser(Node node) {
-            parse(node);
-            if (htmlAttribute != null) {
-                language = htmlAttribute;
-            } else if (dublinCore != null) {
-                language = dublinCore;
-            } else {
-                language = httpEquiv;
-            }
-        }
-
-        String getLanguage() {
-            return language;
-        }
-
-        void parse(Node node) {
-
-            NodeWalker walker = new NodeWalker(node);
-            while (walker.hasNext()) {
+  // Check in the metadata whether the language has already been stored there
+  // by Tika
+  private static String getLanguageFromMetadata(Metadata meta) {
+    if (meta == null)
+      return null;
+    // dublin core
+    String lang = meta.get("dc.language");
+    if (lang != null)
+      return lang;
+    // meta content-language
+    lang = meta.get("content-language");
+    if (lang != null)
+      return lang;
+    // lang attribute
+    return meta.get("lang");
+  }
+
+  static class LanguageParser {
+
+    private String dublinCore = null;
+    private String htmlAttribute = null;
+    private String httpEquiv = null;
+    private String language = null;
+
+    LanguageParser(Node node) {
+      parse(node);
+      if (htmlAttribute != null) {
+        language = htmlAttribute;
+      } else if (dublinCore != null) {
+        language = dublinCore;
+      } else {
+        language = httpEquiv;
+      }
+    }
 
-                Node currentNode = walker.nextNode();
-                String nodeName = currentNode.getNodeName();
-                short nodeType = currentNode.getNodeType();
+    String getLanguage() {
+      return language;
+    }
 
-                if (nodeType == Node.ELEMENT_NODE) {
+    void parse(Node node) {
 
-                    // Check for the lang HTML attribute
-                    if (htmlAttribute == null) {
-                        htmlAttribute = parseLanguage(((Element) currentNode)
-                                .getAttribute("lang"));
-                    }
+      NodeWalker walker = new NodeWalker(node);
+      while (walker.hasNext()) {
 
-                    // Check for Meta
-                    if ("meta".equalsIgnoreCase(nodeName)) {
-                        NamedNodeMap attrs = currentNode.getAttributes();
-
-                        // Check for the dc.language Meta
-                        if (dublinCore == null) {
-                            for (int i = 0; i < attrs.getLength(); i++) {
-                                Node attrnode = attrs.item(i);
-                                if ("name".equalsIgnoreCase(attrnode
-                                        .getNodeName())) {
-                                    if ("dc.language".equalsIgnoreCase(attrnode
-                                            .getNodeValue())) {
-                                        Node valueattr = attrs
-                                                .getNamedItem("content");
-                                        if (valueattr != null) {
-                                            dublinCore = parseLanguage(valueattr
-                                                    .getNodeValue());
-                                        }
-                                    }
-                                }
-                            }
-                        }
-
-                        // Check for the http-equiv content-language
-                        if (httpEquiv == null) {
-                            for (int i = 0; i < attrs.getLength(); i++) {
-                                Node attrnode = attrs.item(i);
-                                if ("http-equiv".equalsIgnoreCase(attrnode
-                                        .getNodeName())) {
-                                    if ("content-language".equals(attrnode
-                                            .getNodeValue().toLowerCase())) {
-                                        Node valueattr = attrs
-                                                .getNamedItem("content");
-                                        if (valueattr != null) {
-                                            httpEquiv = parseLanguage(valueattr
-                                                    .getNodeValue());
-                                        }
-                                    }
-                                }
-                            }
-                        }
+        Node currentNode = walker.nextNode();
+        String nodeName = currentNode.getNodeName();
+        short nodeType = currentNode.getNodeType();
+
+        if (nodeType == Node.ELEMENT_NODE) {
+
+          // Check for the lang HTML attribute
+          if (htmlAttribute == null) {
+            htmlAttribute = parseLanguage(((Element) currentNode)
+                .getAttribute("lang"));
+          }
+
+          // Check for Meta
+          if ("meta".equalsIgnoreCase(nodeName)) {
+            NamedNodeMap attrs = currentNode.getAttributes();
+
+            // Check for the dc.language Meta
+            if (dublinCore == null) {
+              for (int i = 0; i < attrs.getLength(); i++) {
+                Node attrnode = attrs.item(i);
+                if ("name".equalsIgnoreCase(attrnode.getNodeName())) {
+                  if ("dc.language".equalsIgnoreCase(attrnode.getNodeValue())) {
+                    Node valueattr = attrs.getNamedItem("content");
+                    if (valueattr != null) {
+                      dublinCore = parseLanguage(valueattr.getNodeValue());
                     }
+                  }
                 }
+              }
+            }
 
-                if ((dublinCore != null) && (htmlAttribute != null)
-                        && (httpEquiv != null)) {
-                    return;
+            // Check for the http-equiv content-language
+            if (httpEquiv == null) {
+              for (int i = 0; i < attrs.getLength(); i++) {
+                Node attrnode = attrs.item(i);
+                if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) {
+                  if ("content-language".equals(attrnode.getNodeValue()
+                      .toLowerCase())) {
+                    Node valueattr = attrs.getNamedItem("content");
+                    if (valueattr != null) {
+                      httpEquiv = parseLanguage(valueattr.getNodeValue());
+                    }
+                  }
                 }
+              }
             }
+          }
         }
 
-        /**
-         * Parse a language string and return an ISO 639 primary code, or
-         * <code>null</code> if something wrong occurs, or if no language is
-         * found.
-         */
-        final static String parseLanguage(String lang) {
-
-            if (lang == null) {
-                return null;
-            }
-
-            String code = null;
-            String language = null;
+        if ((dublinCore != null) && (htmlAttribute != null)
+            && (httpEquiv != null)) {
+          return;
+        }
+      }
+    }
 
-            // First, split multi-valued values
-            String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+    /**
+     * Parse a language string and return an ISO 639 primary code, or
+     * <code>null</code> if something wrong occurs, or if no language is found.
+     */
+    final static String parseLanguage(String lang) {
 
-            int i = 0;
-            while ((language == null) && (i < langs.length)) {
-                // Then, get the primary code
-                code = langs[i].split("-")[0];
-                code = code.split("_")[0];
-                // Find the ISO 639 code
-                language = (String) LANGUAGES_MAP.get(code.toLowerCase());
-                i++;
-            }
-
-            return language;
-        }
+      if (lang == null) {
+        return null;
+      }
+
+      String code = null;
+      String language = null;
+
+      // First, split multi-valued values
+      String langs[] = lang.split(",| |;|\\.|\\(|\\)|=", -1);
+
+      int i = 0;
+      while ((language == null) && (i < langs.length)) {
+        // Then, get the primary code
+        code = langs[i].split("-")[0];
+        code = code.split("_")[0];
+        // Find the ISO 639 code
+        language = (String) LANGUAGES_MAP.get(code.toLowerCase());
+        i++;
+      }
 
+      return language;
     }
 
-    public void setConf(Configuration conf) {
-        this.conf = conf;
-        contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
-        onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
-        String[] policy = conf.getStrings("lang.extraction.policy");
-        for (int i = 0; i < policy.length; i++) {
-            if (policy[i].equals("detect")) {
-                detect = i;
-            } else if (policy[i].equals("identify")) {
-                identify = i;
-            }
-        }
-    }
+  }
 
-    public Configuration getConf() {
-        return this.conf;
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
+    onlyCertain = conf.getBoolean("lang.identification.only.certain", false);
+    String[] policy = conf.getStrings("lang.extraction.policy");
+    for (int i = 0; i < policy.length; i++) {
+      if (policy[i].equals("detect")) {
+        detect = i;
+      } else if (policy[i].equals("identify")) {
+        identify = i;
+      }
     }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
 
 }
\ No newline at end of file

Modified: nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.analysis.lang;
 
-
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
@@ -31,29 +30,27 @@ import org.apache.nutch.net.protocols.Re
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
-
 /**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that 
- * add a <code>lang</code> (language) field to the document.
- *
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that add a
+ * <code>lang</code> (language) field to the document.
+ * 
  * It tries to find the language of the document by:
  * <ul>
- *   <li>First, checking if {@link HTMLLanguageParser} add some language
- *       information</li>
- *   <li>Then, checking if a <code>Content-Language</code> HTTP header can be
- *       found</li>
- *   <li>Finaly by analyzing the document content</li>
+ * <li>First, checking if {@link HTMLLanguageParser} add some language
+ * information</li>
+ * <li>Then, checking if a <code>Content-Language</code> HTTP header can be
+ * found</li>
+ * <li>Finaly by analyzing the document content</li>
  * </ul>
- *   
+ * 
  * @author Sami Siren
  * @author Jerome Charron
  */
 public class LanguageIndexingFilter implements IndexingFilter {
-  
 
   private Configuration conf;
 
-/**
+  /**
    * Constructs a new Language Indexing Filter.
    */
   public LanguageIndexingFilter() {
@@ -61,15 +58,15 @@ public class LanguageIndexingFilter impl
   }
 
   // Inherited JavaDoc
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-    throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
     // check if LANGUAGE found, possibly put there by HTMLLanguageParser
     String lang = parse.getData().getParseMeta().get(Metadata.LANGUAGE);
 
     // check if HTTP-header tels us the language
     if (lang == null) {
-        lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
+      lang = parse.getData().getContentMeta().get(Response.CONTENT_LANGUAGE);
     }
 
     if (lang == null || lang.length() == 0) {

Modified: 
nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 (original)
+++ 
nutch/trunk/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
 Thu Jan 29 05:38:59 2015
@@ -19,8 +19,6 @@ package org.apache.nutch.analysis.lang;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 
-
-
 // Nutch imports
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
@@ -31,7 +29,6 @@ import org.apache.tika.language.Language
 import org.junit.Assert;
 import org.junit.Test;
 
-
 public class TestHTMLLanguageParser {
 
   private static String URL = "http://foo.bar/";
@@ -47,7 +44,7 @@ public class TestHTMLLanguageParser {
   String metalanguages[] = { "fi", "en", "en" };
 
   /**
-   * Test parsing of language identifiers from html 
+   * Test parsing of language identifiers from html
    **/
   @Test
   public void testMetaHTMLParsing() {
@@ -58,7 +55,8 @@ public class TestHTMLLanguageParser {
       for (int t = 0; t < docs.length; t++) {
         Content content = getContent(docs[t]);
         Parse parse = parser.parse(content).get(content.getUrl());
-        Assert.assertEquals(metalanguages[t], (String) parse.getData().getParseMeta().get(Metadata.LANGUAGE));
+        Assert.assertEquals(metalanguages[t], (String) parse.getData()
+            .getParseMeta().get(Metadata.LANGUAGE));
       }
     } catch (Exception e) {
       e.printStackTrace(System.out);
@@ -70,65 +68,38 @@ public class TestHTMLLanguageParser {
   /** Test of <code>LanguageParser.parseLanguage(String)</code> method. */
   @Test
   public void testParseLanguage() {
-    String tests[][] = {
-      { "(SCHEME=ISO.639-1) sv", "sv" },
-      { "(SCHEME=RFC1766) sv-FI", "sv" },
-      { "(SCHEME=Z39.53) SWE", "sv" },
-      { "EN_US, SV, EN, EN_UK", "en" },
-      { "English Swedish", "en" },
-      { "English, swedish", "en" },
-      { "English,Swedish", "en" },
-      { "Other (Svenska)", "sv" },
-      { "SE", "se" },
-      { "SV", "sv" },
-      { "SV charset=iso-8859-1", "sv" },
-      { "SV-FI", "sv" },
-      { "SV; charset=iso-8859-1", "sv" },
-      { "SVE", "sv" },
-      { "SW", "sw" },
-      { "SWE", "sv" },
-      { "SWEDISH", "sv" },
-      { "Sv", "sv" },
-      { "Sve", "sv" },
-      { "Svenska", "sv" },
-      { "Swedish", "sv" },
-      { "Swedish, svenska", "sv" },
-      { "en, sv", "en" },
-      { "sv", "sv" },
-      { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" },
-      { "sv,en", "sv" },
-      { "sv-FI", "sv" },
-      { "sv-SE", "sv" },
-      { "sv-en", "sv" },
-      { "sv-fi", "sv" },
-      { "sv-se", "sv" },
-      { "sv; Content-Language: sv", "sv" },
-      { "sv_SE", "sv" },
-      { "sve", "sv" },
-      { "svenska, swedish, engelska, english", "sv" },
-      { "sw", "sw" },
-      { "swe", "sv" },
-      { "swe.SPR.", "sv" },
-      { "sweden", "sv" },
-      { "swedish", "sv" },
-      { "swedish,", "sv" },
-      { "text/html; charset=sv-SE", "sv" },
-      { "text/html; sv", "sv" },
-      { "torp, stuga, uthyres, bed & breakfast", null }
-    };
-    
-    for (int i=0; i<44; i++) {
-      Assert.assertEquals(tests[i][1], HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
+    String tests[][] = { { "(SCHEME=ISO.639-1) sv", "sv" },
+        { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },
+        { "EN_US, SV, EN, EN_UK", "en" }, { "English Swedish", "en" },
+        { "English, swedish", "en" }, { "English,Swedish", "en" },
+        { "Other (Svenska)", "sv" }, { "SE", "se" }, { "SV", "sv" },
+        { "SV charset=iso-8859-1", "sv" }, { "SV-FI", "sv" },
+        { "SV; charset=iso-8859-1", "sv" }, { "SVE", "sv" }, { "SW", "sw" },
+        { "SWE", "sv" }, { "SWEDISH", "sv" }, { "Sv", "sv" }, { "Sve", "sv" },
+        { "Svenska", "sv" }, { "Swedish", "sv" }, { "Swedish, svenska", "sv" },
+        { "en, sv", "en" }, { "sv", "sv" },
+        { "sv, be, dk, de, fr, no, pt, ch, fi, en", "sv" }, { "sv,en", "sv" },
+        { "sv-FI", "sv" }, { "sv-SE", "sv" }, { "sv-en", "sv" },
+        { "sv-fi", "sv" }, { "sv-se", "sv" },
+        { "sv; Content-Language: sv", "sv" }, { "sv_SE", "sv" },
+        { "sve", "sv" }, { "svenska, swedish, engelska, english", "sv" },
+        { "sw", "sw" }, { "swe", "sv" }, { "swe.SPR.", "sv" },
+        { "sweden", "sv" }, { "swedish", "sv" }, { "swedish,", "sv" },
+        { "text/html; charset=sv-SE", "sv" }, { "text/html; sv", "sv" },
+        { "torp, stuga, uthyres, bed & breakfast", null } };
+
+    for (int i = 0; i < 44; i++) {
+      Assert.assertEquals(tests[i][1],
+          HTMLLanguageParser.LanguageParser.parseLanguage(tests[i][0]));
     }
   }
-  
-  
+
   private Content getContent(String text) {
     Metadata meta = new Metadata();
     meta.add("Content-Type", "text/html");
-    return new Content(URL, BASE, text.getBytes(), "text/html", meta, NutchConfiguration.create());
+    return new Content(URL, BASE, text.getBytes(), "text/html", meta,
+        NutchConfiguration.create());
   }
-  
 
   @Test
   public void testLanguageIndentifier() {

Modified: 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
 Thu Jan 29 05:38:59 2015
@@ -18,7 +18,7 @@
 package org.apache.nutch.protocol.http.api;
 
 public class BlockedException extends HttpException {
-  
+
   public BlockedException(String msg) {
     super(msg);
   }

svn commit: r1655526 [14/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Reply via email to