Author: jerome
Date: Wed Aug 31 08:17:11 2005
New Revision: 265503

URL: http://svn.apache.org/viewcvs?rev=265503&view=rev
Log:
Merged 0.7 branch changes 240321:240453 into trunk

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
    lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java
    lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java
    lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java
    lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java
    lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java
    
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
    
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
    
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
    
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/clustering/OnlineClusterer.java 
Wed Aug 31 08:17:11 2005
@@ -23,8 +23,8 @@
  * algorithms.
  *
  * <p>By the term <b>online</b> search results clustering we will understand
- * a clusterer that works on a set of {@link Hit}s retrieved for a user's query
- * and produces a set of {@link Clusters} that can be displayed to help
+ * a clusterer that works on a set of {@link HitDetails} retrieved for a user's
+ * query and produces a set of {@link HitsCluster} that can be displayed to help
  * the user gain insight in the topics found in the result.</p>
  *
  * <p>Other clustering options include predefined categories and off-line

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fs/NutchFileSystem.java Wed 
Aug 31 08:17:11 2005
@@ -80,8 +80,8 @@
       return getNamed(NutchConf.get().get("fs.default.name", "local"));
     }
 
-    /** Returns a name for this filesystem, suitable to pass to {@link
-     * NutchFileSystem#getNamed(String).*/
+    /** Returns a name for this filesystem, suitable to pass to
+     * {@link NutchFileSystem#getNamed(String)}.*/
     public abstract String getName();
   
     /** Returns a named filesystem.  Names are either the string "local" or a

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/FileSplit.java Wed 
Aug 31 08:17:11 2005
@@ -25,9 +25,12 @@
 import org.apache.nutch.io.UTF8;
 import org.apache.nutch.fs.NutchFileSystem;
 
-/** A section of an input file.  Returned by {@link
- * InputFormat#getSplits(File[], int)} and passed to
- * InputFormat#getRecordReader(FileSplit). */
+/**
+ * A section of an input file.
+ * Returned by {@link InputFormat#getSplits(NutchFileSystem, JobConf, int)}
+ * and passed to
+ * {@link InputFormat#getRecordReader(NutchFileSystem, FileSplit, JobConf)}.
+ */
+ */
 public class FileSplit implements Writable {
   private File file;
   private long start;

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/MapOutputFile.java 
Wed Aug 31 08:17:11 2005
@@ -42,7 +42,7 @@
 
   /** Create a local reduce input file name.
    * @param mapTaskId a map task id
-   * @param partition a reduce partition
+   * @param reduceTaskId a reduce task id
    */
   public static File getInputFile(String mapTaskId, String reduceTaskId) {
     File taskDir = new File(LOCAL_DIR, reduceTaskId);

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/RecordReader.java 
Wed Aug 31 08:17:11 2005
@@ -22,7 +22,7 @@
 import org.apache.nutch.io.WritableComparable;
 import org.apache.nutch.io.Writable;
 
-/** Reads key/value pairs from an input file {@link InputFormat.Split}.
+/** Reads key/value pairs from an input file {@link FileSplit}.
  * Implemented by {@link InputFormat} implementations. */
 public interface RecordReader {
   /** Reads the next key/value pair.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapReduce/package.html Wed Aug 
31 08:17:11 2005
@@ -6,7 +6,7 @@
 
 <p>Applications implement {@link org.apache.nutch.mapReduce.Mapper} and
 {@link org.apache.nutch.mapReduce.Reducer} interfaces.  These are submitted
-as a {@link org.apache.nutch.mapReduce.MapReduceJob} and are applied to data
+as a MapReduceJob and are applied to data
 stored in a {@link org.apache.nutch.fs.NutchFileSystem}.</p>
 
 <p>See <a href="http://labs.google.com/papers/mapreduce.html">Google's

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java Wed Aug 31 
08:17:11 2005
@@ -17,7 +17,7 @@
 package org.apache.nutch.parse;
 
 /** The result of parsing a page's raw content.
- * @see Parser#getParse(FetcherOutput,Content)
+ * @see Parser#getParse(Content)
  */
 public interface Parse {
   

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Wed Aug 
31 08:17:11 2005
@@ -114,7 +114,8 @@
   public void setContent(byte[] content) { this.content = content; }
 
   /** The media type of the retrieved content.
-   * @see http://www.iana.org/assignments/media-types/
+   * @see <a href="http://www.iana.org/assignments/media-types/">
+   *      http://www.iana.org/assignments/media-types/</a>
    */
   public String getContentType() { return contentType; }
   public void setContentType(String contentType) {

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ProtocolException.java 
Wed Aug 31 08:17:11 2005
@@ -18,7 +18,6 @@
 
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)}.*/
 public class ProtocolException extends Exception {
 
   public ProtocolException() {

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceGone.java Wed 
Aug 31 08:17:11 2005
@@ -19,7 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} is invalid.*/
+/** Thrown when a resource is invalid. */
 public class ResourceGone extends ProtocolException {
   private URL url;
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/ResourceMoved.java 
Wed Aug 31 08:17:11 2005
@@ -19,8 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} no longer
- * exists.*/
+/** Thrown when a resource no longer exists.*/
 public class ResourceMoved extends IOException {
   private URL oldUrl;
   private URL newUrl;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/RetryLater.java Wed 
Aug 31 08:17:11 2005
@@ -19,8 +19,7 @@
 import java.io.IOException;
 import java.net.URL;
 
-/** Thrown by {@link Protocol#getContent(String)} when a {@link URL} should be
- * retried later.*/
+/** Thrown when a resource should be retried later.*/
 public class RetryLater extends ProtocolException {
   private URL url;
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java Wed Aug 31 
08:17:11 2005
@@ -44,14 +44,14 @@
   }
 
   /** Returns the total number of hits for this query.  This may be an estimate
-   * when (@link totalIsExact()} is false. */
+   * when (@link #totalIsExact()} is false. */
   public long getTotal() { return total; }
 
-  /** True if {@link getTotal()} gives the exact number of hits, or false if
+  /** True if {@link #getTotal()} gives the exact number of hits, or false if
    * it is only an estimate of the total number of hits. */
   public boolean totalIsExact() { return totalIsExact; }
 
-  /** Set {@link totalIsExact()}. */
+  /** Set {@link #totalIsExact()}. */
   public void setTotalIsExact(boolean isExact) { totalIsExact = isExact; }
 
   /** Returns the number of hits included in this current listing. */

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed 
Aug 31 08:17:11 2005
@@ -179,7 +179,8 @@
    * @param withParseText if true, fix parse_text, otherwise ignore it
    * @param withParseData if true, fix parse_data, otherwise ignore it
    * @param dryrun if true, only show what would be done without performing 
any actions
-   * @return
+   * @return <code>true</code> if segment was fixed successfully, otherwise
+   *         return <code>false</code>.
    */
   public static boolean fixSegment(NutchFileSystem nfs, File dir, 
           boolean withContent, boolean withParseText, boolean withParseData,

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/Daemon.java Wed Aug 31 
08:17:11 2005
@@ -16,7 +16,7 @@
 
 package org.apache.nutch.util;
 
-/** A thread that has called {@link Thread#SetDaemon(boolean) } with true.*/
+/** A thread that has called {@link Thread#setDaemon(boolean) } with true.*/
 public class Daemon extends Thread {
 
   {

Modified: 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
 Wed Aug 31 08:17:11 2005
@@ -20,6 +20,7 @@
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.util.List;
@@ -48,9 +49,13 @@
 
 
 /**
+ * Identify the language of a content, based on statistical analysis.
+ *
+ * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+ *      Language Codes</a>
  * 
  * @author Sami Siren
- * @author Jerome Charron
+ * @author J&eacute;r&ocirc;me Charron
  */
 public class LanguageIdentifier {
   
@@ -59,8 +64,8 @@
   
   private final static float SCORE_THRESOLD = 0.00F;
 
-  public final static Logger LOG = 
LogFormatter.getLogger(LanguageIdentifier.class.getName());
-
+  private final static Logger LOG =
+          LogFormatter.getLogger(LanguageIdentifier.class.getName());
   
   private ArrayList languages = new ArrayList();
 
@@ -168,7 +173,8 @@
   }
 
   /**
-   * return handle to singleton instance
+   * Get a LanguageIdentifier instance.
+   * @return the LanguageIdentifier singleton instance.
    */
   public static LanguageIdentifier getInstance() {
     if (identifier == null) {
@@ -182,13 +188,25 @@
   }
 
   /**
-   * main method used for testing
-   * 
-   * @param args
+   * Main method used for command line process.
+   * <br/>Usage is:
+   * <pre>
+   * LanguageIdentifier [-identifyrows filename maxlines]
+   *                    [-identifyfile charset filename]
+   *                    [-identifyfileset charset files]
+   *                    [-identifytext text]
+   *                    [-identifyurl url]
+   * </pre>
+   * @param args arguments.
    */
   public static void main(String args[]) {
 
-    String usage = "Usage: LanguageIdentifier [-identifyrows filename 
maxlines] [-identifyfile filename] [-identifyfileset files] [-identifytext 
text] [-identifyurl url]";
+    String usage = "Usage: LanguageIdentifier "            +
+                      "[-identifyrows filename maxlines] " +
+                      "[-identifyfile charset filename] "  +
+                      "[-identifyfileset charset files] "  +
+                      "[-identifytext text] "              +
+                      "[-identifyurl url]";
     int command = 0;
 
     final int IDFILE = 1;
@@ -199,6 +217,7 @@
 
     Vector fileset = new Vector();
     String filename = "";
+    String charset = "";
     String url = "";
     String text = "";
     int max = 0;
@@ -211,6 +230,7 @@
     for (int i = 0; i < args.length; i++) { // parse command line
       if (args[i].equals("-identifyfile")) {
         command = IDFILE;
+        charset = args[++i];
         filename = args[++i];
       }
 
@@ -233,6 +253,7 @@
 
       if (args[i].equals("-identifyfileset")) {
         command = IDFILESET;
+        charset = args[++i];
         for (i++; i < args.length; i++) {
           File[] files = null;
           File f = new File(args[i]);
@@ -264,7 +285,7 @@
         case IDFILE:
           f = new File(filename);
           fis = new FileInputStream(f);
-          lang = idfr.identify(fis);
+          lang = idfr.identify(fis, charset);
           fis.close();
           break;
 
@@ -302,7 +323,7 @@
               filename = (String) i.next();
               f = new File(filename);
               fis = new FileInputStream(f);
-              lang = idfr.identify(fis);
+              lang = idfr.identify(fis, charset);
               fis.close();
             } catch (Exception e) {
               System.out.println(e);
@@ -349,22 +370,26 @@
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
-  public String identify(String text) {
-    return identify(new StringBuffer(text));
+  public String identify(String content) {
+    return identify(new StringBuffer(content));
   }
 
   /**
-   * Identify language based on submitted content
+   * Identify language of a content.
    * 
-   * @param text to analyze
-   * @return 2 letter ISO639 code of language (en, fi, sv...) , or null if
-   *         unknown
+   * @param content is the content to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the specified content.
    */
   public String identify(StringBuffer content) {
 
@@ -405,26 +430,48 @@
   }
 
   /**
-   * Identify language from inputstream
-   * 
-   * @param is
-   * @return language code
-   * @throws IOException
+   * Identify language from input stream.
+   * This method uses the platform default encoding to read the input stream.
+   * For using a specific encoding, use the
+   * {@link #identify(InputStream, String)} method.
+   *
+   * @param is is the input stream to analyze.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
    */
   public String identify(InputStream is) throws IOException {
+    return identify(is, null);
+  }
+  
+  /**
+   * Identify language from input stream.
+   * 
+   * @param is is the input stream to analyze.
+   * @param charset is the charset to use to read the input stream.
+   * @return The 2 letter
+   *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
+   *         language code</a> (en, fi, sv, ...) of the language that best
+   *         matches the content of the specified input stream.
+   * @throws IOException if something wrong occurs on the input stream.
+   */
+  public String identify(InputStream is, String charset) throws IOException {
 
-    StringBuffer text = new StringBuffer();
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
     byte[] buffer = new byte[2048];
     int len = 0;
 
     while (((len = is.read(buffer)) != -1) &&
-           ((analyzeLength == 0) || (text.length() < analyzeLength))) {
+           ((analyzeLength == 0) || (out.size() < analyzeLength))) {
       if (analyzeLength != 0) {
-          len = Math.min(len, analyzeLength - text.length());
+          len = Math.min(len, analyzeLength - out.size());
       }
-      text.append(new String(buffer, 0, len, "UTF-8"));
+      out.write(buffer, 0, len);
     }
-    return identify(text);
+    return identify((charset == null) ? out.toString()
+                                      : out.toString(charset));
   }
 
 }

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
 Wed Aug 31 08:17:11 2005
@@ -42,7 +42,6 @@
  * This class takes SAX events (in addition to some extra events
  * that SAX doesn't handle yet) and adds the result to a document
  * or document fragment.
- * @xsl.usage general
  */
 public class DOMBuilder
         implements ContentHandler, LexicalHandler

Modified: 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
 Wed Aug 31 08:17:11 2005
@@ -27,7 +27,6 @@
 /**
  * Class used to verify whether the specified <var>ch</var> 
  * conforms to the XML 1.0 definition of whitespace. 
- * @xsl.usage internal
  */
 public class XMLCharacterRecognizer
 {
@@ -90,7 +89,7 @@
   /**
    * Tell if the string is whitespace.
    *
-   * @param buf StringBuffer to check as XML whitespace.
+   * @param s String to check as XML whitespace.
    * @return True if characters in buffer are XML whitespace, false otherwise
    */
   public static boolean isWhiteSpace(String s)

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java?rev=265503&r1=265502&r2=265503&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
 Wed Aug 31 08:17:11 2005
@@ -58,7 +58,7 @@
   }
 
   /**
-   * @see 
SecureProtocolSocketFactory#createSocket(java.lang.String,int,java.net.InetAddress,int)
+   * @see 
org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int,InetAddress,int)
    */
   public Socket createSocket(String host, int port, InetAddress clientHost, 
int clientPort) throws IOException,
           UnknownHostException {
@@ -79,8 +79,8 @@
    * 
    * @param host the host name/IP
    * @param port the port on the host
-   * @param clientHost the local host name/IP to bind the socket to
-   * @param clientPort the port on the local machine
+   * @param localAddress the local host name/IP to bind the socket to
+   * @param localPort the port on the local machine
   * @param params {@link HttpConnectionParams Http connection parameters}
    * 
    * @return Socket a new socket
@@ -104,14 +104,14 @@
   }
 
   /**
-   * @see SecureProtocolSocketFactory#createSocket(java.lang.String,int)
+   * @see 
org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(String,int)
    */
   public Socket createSocket(String host, int port) throws IOException, 
UnknownHostException {
     return getSSLContext().getSocketFactory().createSocket(host, port);
   }
 
   /**
-   * @see 
SecureProtocolSocketFactory#createSocket(java.net.Socket,java.lang.String,int,boolean)
+   * @see 
org.apache.commons.httpclient.protocol.SecureProtocolSocketFactory#createSocket(Socket,String,int,boolean)
    */
   public Socket createSocket(Socket socket, String host, int port, boolean 
autoClose) throws IOException,
           UnknownHostException {


Reply via email to