[jira] [Commented] (NUTCH-2394) Possible bugs in the source code

ASF GitHub Bot (JIRA) Wed, 25 Oct 2017 08:01:41 -0700

    [ 
https://issues.apache.org/jira/browse/NUTCH-2394?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16218839#comment-16218839
 ]


ASF GitHub Bot commented on NUTCH-2394:
---------------------------------------

sebastian-nagel closed pull request #234: NUTCH-2394 Fix of bugs detected by 
static code analysis
URL: https://github.com/apache/nutch/pull/234
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java 
b/src/java/org/apache/nutch/crawl/URLPartitioner.java
index fcf5f70aa..48ea79998 100644
--- a/src/java/org/apache/nutch/crawl/URLPartitioner.java
+++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java
@@ -63,23 +63,27 @@ public void configure(JobConf job) {
   public void close() {
   }
 
-  /** Hash by domain name. */
+  /** Hash by host or domain name or IP address. */
   public int getPartition(Text key, Writable value, int numReduceTasks) {
     String urlString = key.toString();
     URL url = null;
-    int hashCode = urlString.hashCode();
+    int hashCode = 0;
     try {
       urlString = normalizers.normalize(urlString,
           URLNormalizers.SCOPE_PARTITION);
       url = new URL(urlString);
-      hashCode = url.getHost().hashCode();
     } catch (MalformedURLException e) {
       LOG.warn("Malformed URL: '" + urlString + "'");
     }
 
-    if (mode.equals(PARTITION_MODE_DOMAIN) && url != null)
+    if (url == null) {
+      // failed to parse URL, must take URL string as fall-back
+      hashCode = urlString.hashCode();
+    } else if (mode.equals(PARTITION_MODE_HOST)) {
+      hashCode = url.getHost().hashCode();
+    } else if (mode.equals(PARTITION_MODE_DOMAIN)) {
       hashCode = URLUtil.getDomainName(url).hashCode();
-    else if (mode.equals(PARTITION_MODE_IP)) {
+    } else if (mode.equals(PARTITION_MODE_IP)) {
       try {
         InetAddress address = InetAddress.getByName(url.getHost());
         hashCode = address.getHostAddress().hashCode();
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java 
b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 533715686..107ec1c52 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -38,6 +38,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
@@ -342,7 +343,8 @@ public void dump(File outputDir, File segmentRootDir, File 
linkdb, boolean gzip,
                   .createFileName(md5Ofurl, baseName, extensionName);
               outputFullPath = String.format("%s/%s", fullDir, filename);
 
-              String[] fullPathLevels = fullDir.split(File.separator);
+              String[] fullPathLevels = fullDir
+                  .split(Pattern.quote(File.separator));
               String firstLevelDirName = fullPathLevels[fullPathLevels.length
                   - 2];
               String secondLevelDirName = fullPathLevels[fullPathLevels.length
diff --git 
a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 
b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 53fe34475..26a7df2c6 100644
--- 
a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ 
b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -324,7 +324,7 @@ private void readConfiguration() throws IOException {
 
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
+        line = line.trim();
         parts = line.split("\t");
 
         // Must be at least two parts
diff --git 
a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
 
b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index dfcc01cf4..6cb9cf8d1 100644
--- 
a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ 
b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -114,7 +114,7 @@ protected Client makeClient(Configuration conf) throws 
IOException {
     String parts[];
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
+        line = line.trim();
         parts = line.split("=");
 
         if (parts.length == 2) {
diff --git 
a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
 
b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index 9b5e3ebd9..86f58e4ac 100644
--- 
a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -76,7 +76,7 @@ private synchronized void readConfiguration(Reader 
configReader)
 
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
+        line = line.trim();
         delimiterIndex = line.indexOf(" ");
 
         host = line.substring(0, delimiterIndex);
diff --git 
a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
 
b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index 1faa04113..73067462d 100644
--- 
a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -75,7 +75,7 @@ private synchronized void readConfiguration(Reader 
configReader) throws IOExcept
 
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
+        line = line.trim();
         delimiterIndex = line.indexOf(" ");
         // try tabulator
         if (delimiterIndex == -1) {
diff --git 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
 
b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index c96013f38..c4f68ef20 100644
--- 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ 
b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -77,7 +77,7 @@ private synchronized void readConfiguration(Reader 
configReader) throws IOExcept
 
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line.trim();
+        line = line.trim();
         delimiterIndex = line.indexOf(" ");
         // try tabulator
         if (delimiterIndex == -1) {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> Possible bugs in the source code
> --------------------------------
>
>                 Key: NUTCH-2394
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2394
>             Project: Nutch
>          Issue Type: Bug
>    Affects Versions: 1.13
>            Reporter: AppChecker
>              Labels: appchecker, static-analysis
>             Fix For: 1.14
>
>
> Hi!
> I've checked your project with the static analyzer 
> [AppChecker|https://npo-echelon.ru/en/solutions/appchecker.php] and it found 
> several suspicious code fragments:
> 1) 
> [src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java#L56]
> {code:java}
> heading.trim();
> {code}
> heading is not changed, because java.lang.String.trim returns a new string.
> Probably, it should be:
> {code:java}
> heading = heading.trim();
> {code}
> see also:
> * 
> [src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java#L78|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java#L78]
> * 
> [src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java#L115|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java#L115]
> * 
> [src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java#L76|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java#L76]
> * 
> [src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java#L78|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java#L78]
> * 
> [src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java#L326|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java#L326]
> 2) 
> [src/java/org/apache/nutch/crawl/URLPartitioner.java#L84|https://github.com/apache/nutch/blob/2b93a66f0472e93223c69053d5482dcbef26de6d/src/java/org/apache/nutch/crawl/URLPartitioner.java#L84]
> {code:java}
> if (mode.equals(PARTITION_MODE_DOMAIN) && url != null)
>   ...
> else if ..
>   ...
>   InetAddress address = InetAddress.getByName(url.getHost());
>   ...
> {code}
> if url is null, the method url.getHost() will be invoked, so a NullPointerException 
> will be thrown
> 3) 
> [src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java#L346|https://github.com/apache/nutch/blob/e53b34b2322f2d071981a72577644a225642ecbc/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java#L346]
> {code:java}
> String[] fullPathLevels = fullDir.split(File.separator);
> {code}
> Using File.separator in regular expressions may throw 
> java.util.regex.PatternSyntaxException, because it is "\" on 
> Windows-based systems.
> Possible correction:
> {code:java}
> String[] fullPathLevels = fullDir.split(Pattern.quote(File.separator));
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (NUTCH-2394) Possible bugs in the source code

Reply via email to