This is an automated email from the ASF dual-hosted git repository.

mdrob pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 05c8f4f  SOLR-15613: Address error-prone URLEqualsHashCode warning 
(#333)
05c8f4f is described below

commit 05c8f4fcec4d65e5fb4e8e5b65454be01d2f40e2
Author: Collins Abanda <[email protected]>
AuthorDate: Thu Oct 7 22:21:21 2021 -0400

    SOLR-15613: Address error-prone URLEqualsHashCode warning (#333)
---
 gradle/validation/error-prone.gradle               |  1 -
 solr/CHANGES.txt                                   |  2 +
 .../java/org/apache/solr/util/SimplePostTool.java  | 73 ++++++++++++----------
 .../function/TestMinMaxOnMultiValuedField.java     |  4 +-
 .../org/apache/solr/util/SimplePostToolTest.java   | 24 +++----
 5 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/gradle/validation/error-prone.gradle 
b/gradle/validation/error-prone.gradle
index 6302784..54817be 100644
--- a/gradle/validation/error-prone.gradle
+++ b/gradle/validation/error-prone.gradle
@@ -140,7 +140,6 @@ allprojects { prj ->
             '-Xep:ThreadPriorityCheck:OFF',
             '-Xep:TypeParameterShadowing:OFF',
             '-Xep:TypeParameterUnusedInFormals:OFF',
-            '-Xep:URLEqualsHashCode:OFF',
             '-Xep:UndefinedEquals:OFF',
             '-Xep:UnescapedEntity:OFF',
             '-Xep:UnnecessaryParentheses:OFF',
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 87e0bbf..d30c343 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -44,6 +44,8 @@ New Features
 * SOLR-7642: opt-in support to create ZK chroot on startup (Timothy Potter, 
Shawn Heisey, Mark Miller, Tomas Eduardo Fernandez Lobbe,
   Jan Høydahl, Steve Molloy, Isabelle Giguere, David Eric Pugh, Gus Heck, 
Christine Poerschke, Houston Putman)
 
+* SOLR-15613: Enforce error-prone checks to catch all URLEqualsHashCode 
violations and use java.net.URI instead (Collins Abanda)
+
 Improvements
 ----------------------
 * LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via 
Anshum Gupta)
diff --git a/solr/core/src/java/org/apache/solr/util/SimplePostTool.java 
b/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
index 743fdf8..eedcb17 100644
--- a/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
+++ b/solr/core/src/java/org/apache/solr/util/SimplePostTool.java
@@ -36,6 +36,8 @@ import java.io.OutputStream;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.ProtocolException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URL;
 import java.net.URLEncoder;
 import java.nio.BufferOverflowException;
@@ -118,8 +120,8 @@ public class SimplePostTool {
   static HashMap<String,String> mimeMap;
   FileFilter fileFilter;
   // Backlog for crawling
-  List<LinkedHashSet<URL>> backlog = new ArrayList<>();
-  Set<URL> visited = new HashSet<>();
+  List<LinkedHashSet<URI>> backlog = new ArrayList<>();
+  Set<URI> visited = new HashSet<>();
 
   static final Set<String> DATA_MODES = new HashSet<>();
   static final String USAGE_STRING_SHORT =
@@ -557,7 +559,8 @@ public class SimplePostTool {
 
   /**
    * This method takes as input a list of start URL strings for crawling,
-   * adds each one to a backlog and then starts crawling
+   * converts the URL strings to URI strings
+   * and adds each one to a backlog and then starts crawling
    * @param args the raw input args from main()
    * @param startIndexInArgs offset for where to start
    * @param out outputStream to write results to
@@ -565,16 +568,16 @@ public class SimplePostTool {
    */
   public int postWebPages(String[] args, int startIndexInArgs, OutputStream 
out) {
     reset();
-    LinkedHashSet<URL> s = new LinkedHashSet<>();
+    LinkedHashSet<URI> s = new LinkedHashSet<>();
     for (int j = startIndexInArgs; j < args.length; j++) {
       try {
-        URL u = new URL(normalizeUrlEnding(args[j]));
-        s.add(u);
-      } catch(MalformedURLException e) {
+        URI uri = new URI(normalizeUrlEnding(args[j]));
+        s.add(uri);
+      } catch(URISyntaxException e) {
         warn("Skipping malformed input URL: "+args[j]);
       }
     }
-    // Add URLs to level 0 of the backlog and start recursive crawling
+    // Add URIs to level 0 of the backlog and start recursive crawling
     backlog.add(s);
     return webCrawl(0, out);
   }
@@ -607,40 +610,41 @@ public class SimplePostTool {
    */
   protected int webCrawl(int level, OutputStream out) {
     int numPages = 0;
-    LinkedHashSet<URL> stack = backlog.get(level);
+    LinkedHashSet<URI> stack = backlog.get(level);
     int rawStackSize = stack.size();
     stack.removeAll(visited);
     int stackSize = stack.size();
-    LinkedHashSet<URL> subStack = new LinkedHashSet<>();
+    LinkedHashSet<URI> subStack = new LinkedHashSet<>();
     info("Entering crawl at level "+level+" ("+rawStackSize+" links total, 
"+stackSize+" new)");
-    for(URL u : stack) {
+    for(URI uri : stack) {
       try {
-        visited.add(u);
-        PageFetcherResult result = pageFetcher.readPageFromUrl(u);
+        visited.add(uri);
+        URL url = uri.toURL();
+        PageFetcherResult result = pageFetcher.readPageFromUrl(url);
         if(result.httpStatus == 200) {
-          u = (result.redirectUrl != null) ? result.redirectUrl : u;
+          url = (result.redirectUrl != null) ? result.redirectUrl : url;
           URL postUrl = new URL(appendParam(solrUrl.toString(),
-              "literal.id="+URLEncoder.encode(u.toString(),"UTF-8") +
-              "&literal.url="+URLEncoder.encode(u.toString(),"UTF-8")));
+              "literal.id="+URLEncoder.encode(url.toString(),"UTF-8") +
+              "&literal.url="+URLEncoder.encode(url.toString(),"UTF-8")));
           ByteBuffer content = result.content;
           boolean success = postData(new ByteArrayInputStream(content.array(), 
content.arrayOffset(), content.limit()), null, out, result.contentType, 
postUrl);
           if (success) {
-            info("POSTed web resource "+u+" (depth: "+level+")");
+            info("POSTed web resource "+url+" (depth: "+level+")");
             Thread.sleep(delay * 1000);
             numPages++;
             // Pull links from HTML pages only
             if(recursive > level && result.contentType.equals("text/html")) {
-              Set<URL> children = pageFetcher.getLinksFromWebPage(u, new 
ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), 
result.contentType, postUrl);
+              Set<URI> children = pageFetcher.getLinksFromWebPage(url, new 
ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), 
result.contentType, postUrl);
               subStack.addAll(children);
             }
           } else {
-            warn("An error occurred while posting "+u);
+            warn("An error occurred while posting "+uri);
           }
         } else {
-          warn("The URL "+u+" returned a HTTP result status of 
"+result.httpStatus);
+          warn("The URL "+uri+" returned a HTTP result status of 
"+result.httpStatus);
         }
-      } catch (IOException e) {
-        warn("Caught exception when trying to open connection to "+u+": 
"+e.getMessage());
+      } catch (IOException | URISyntaxException e) {
+        warn("Caught exception when trying to open connection to "+uri+": 
"+e.getMessage());
       } catch (InterruptedException e) {
         throw new RuntimeException(e);
       }
@@ -852,7 +856,7 @@ public class SimplePostTool {
   }
 
   /**
-   * Guesses the type of a file, based on file name suffix
+   * Guesses the type of file, based on file name suffix
    * Returns "application/octet-stream" if no corresponding mimeMap type.
    * @param file the file
    * @return the content-type guessed
@@ -1127,13 +1131,14 @@ public class SimplePostTool {
       robotsCache = new HashMap<>();
     }
 
-    public PageFetcherResult readPageFromUrl(URL u) {
+    public PageFetcherResult readPageFromUrl(URL u) throws URISyntaxException {
       PageFetcherResult res = new PageFetcherResult();
       try {
         if (isDisallowedByRobots(u)) {
           warn("The URL "+u+" is disallowed by robots.txt and will not be 
crawled.");
           res.httpStatus = 403;
-          visited.add(u);
+          URI uri = u.toURI();
+          visited.add(uri);
           return res;
         }
         res.httpStatus = 404;
@@ -1146,7 +1151,8 @@ public class SimplePostTool {
           info("The URL "+u+" caused a redirect to "+conn.getURL());
           u = conn.getURL();
           res.redirectUrl = u;
-          visited.add(u);
+          URI uri = u.toURI();
+          visited.add(uri);
         }
         if(res.httpStatus == 200) {
           // Raw content type of form "text/html; encoding=utf-8"
@@ -1235,10 +1241,10 @@ public class SimplePostTool {
      * @param is the input stream of the page
      * @param type the content-type
      * @param postUrl the URL (typically /solr/extract) in order to pull out 
links
-     * @return a set of URLs parsed from the page
+     * @return a set of URIs parsed from the page
      */
-    protected Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, 
URL postUrl) {
-      Set<URL> l = new HashSet<>();
+    protected Set<URI> getLinksFromWebPage(URL u, InputStream is, String type, 
URL postUrl) {
+      Set<URI> l = new HashSet<>();
       URL url = null;
       try {
         ByteArrayOutputStream os = new ByteArrayOutputStream();
@@ -1254,10 +1260,10 @@ public class SimplePostTool {
             link = computeFullUrl(u, link);
             if(link == null)
               continue;
-            url = new URL(link);
-            if(url.getAuthority() == null || 
!url.getAuthority().equals(u.getAuthority()))
-              continue;
-            l.add(url);
+            URI newUri = new URI(link);
+            if(newUri.getAuthority() == null || 
!newUri.getAuthority().equals(u.getAuthority())){
+              l.add(newUri);
+            }
           }
         }
       } catch (MalformedURLException e) {
@@ -1267,6 +1273,7 @@ public class SimplePostTool {
       } catch (Exception e) {
         throw new RuntimeException(e);
       }
+
       return l;
     }
   }
diff --git 
a/solr/core/src/test/org/apache/solr/search/function/TestMinMaxOnMultiValuedField.java
 
b/solr/core/src/test/org/apache/solr/search/function/TestMinMaxOnMultiValuedField.java
index 8993b25..5a84393 100644
--- 
a/solr/core/src/test/org/apache/solr/search/function/TestMinMaxOnMultiValuedField.java
+++ 
b/solr/core/src/test/org/apache/solr/search/function/TestMinMaxOnMultiValuedField.java
@@ -895,8 +895,8 @@ public class TestMinMaxOnMultiValuedField extends 
SolrTestCaseJ4 {
   /**
    * Given a (multivalued) field name and an (ascending) sorted list of 
values, this method will generate a List of Solr Documents of the same size 
such that:
    * <ul>
-   *  <li>For each non-null value in the original list, the corrisponding 
document in the result will have that value in the specified field.</li>
-   *  <li>For each null value in the original list, the corrisponding document 
in teh result will have <em>NO</em> values in the specified field.</li>
+   *  <li>For each non-null value in the original list, the corresponding 
document in the result will have that value in the specified field.</li>
+   *  <li>For each null value in the original list, the corresponding document 
in the result will have <em>NO</em> values in the specified field.</li>
    *  <li>If a document has a value in the field, then some random number of 
values that come <em>after</em> that value in the original list may also be 
included in the specified field.</li>
    *  <li>Every document in the result will have a randomly asssigned 'id', 
unique realitive to all other documents in the result.</li>
    * </ul>
diff --git a/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java 
b/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java
index 39036ef..bc07bfc 100644
--- a/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java
+++ b/solr/core/src/test/org/apache/solr/util/SimplePostToolTest.java
@@ -21,6 +21,8 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
@@ -187,9 +189,9 @@ public class SimplePostToolTest extends SolrTestCaseJ4 {
 
   static class MockPageFetcher extends PageFetcher {
     HashMap<String,String> htmlMap = new HashMap<>();
-    HashMap<String,Set<URL>> linkMap = new HashMap<>();
+    HashMap<String,Set<URI>> linkMap = new HashMap<>();
     
-    public MockPageFetcher() throws IOException {
+    public MockPageFetcher() throws IOException, URISyntaxException {
       (new SimplePostTool()).super();
       htmlMap.put("http://[ff01::114]", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
       htmlMap.put("http://[ff01::114]/index.html", "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>");
@@ -199,19 +201,19 @@ public class SimplePostToolTest extends SolrTestCaseJ4 {
       htmlMap.put("http://[ff01::114]/page2", "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>");
       htmlMap.put("http://[ff01::114]/disallowed", "<html><body><a href=\"http://[ff01::114]/\"></body></html>");
 
-      Set<URL> s = new HashSet<>();
-      s.add(new URL("http://[ff01::114]/page1"));
-      s.add(new URL("http://[ff01::114]/page2"));
+      Set<URI> s = new HashSet<>();
+      s.add(new URI("http://[ff01::114]/page1"));
+      s.add(new URI("http://[ff01::114]/page2"));
       linkMap.put("http://[ff01::114]", s);
       linkMap.put("http://[ff01::114]/index.html", s);
       s = new HashSet<>();
-      s.add(new URL("http://[ff01::114]/page1/foo"));
+      s.add(new URI("http://[ff01::114]/page1/foo"));
       linkMap.put("http://[ff01::114]/page1", s);
       s = new HashSet<>();
-      s.add(new URL("http://[ff01::114]/page1/foo/bar"));
+      s.add(new URI("http://[ff01::114]/page1/foo/bar"));
       linkMap.put("http://[ff01::114]/page1/foo", s);
       s = new HashSet<>();
-      s.add(new URL("http://[ff01::114]/disallowed"));
+      s.add(new URI("http://[ff01::114]/disallowed"));
       linkMap.put("http://[ff01::114]/page2", s);
       
       // Simulate a robots.txt file with comments and a few disallows
@@ -239,11 +241,11 @@ public class SimplePostToolTest extends SolrTestCaseJ4 {
     }
     
     @Override
-    public Set<URL> getLinksFromWebPage(URL u, InputStream is, String type, 
URL postUrl) {
-      Set<URL> s = 
linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString()));
+    public Set<URI> getLinksFromWebPage(URL u, InputStream is, String type, 
URL postUrl) {
+      Set<URI> s = 
linkMap.get(SimplePostTool.normalizeUrlEnding(u.toString()));
       if(s == null)
         s = new HashSet<>();
       return s;
     }
   }
-}
+}
\ No newline at end of file

Reply via email to