This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
commit 8a427ebc606d7967bcaaef30a9449bc0bf61b25b
Author: Eric Pugh <[email protected]>
AuthorDate: Mon Feb 19 11:46:47 2024 -0500

SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)

* Copied unit tests from SimplePostToolTest to PostToolTest
* Add a --dry-run mode that simulates sending documents to Solr
* Removed -commit from a few more places where it is no longer needed now that SOLR-17147 is complete
* Clean up the code base to have fewer code-quality warnings.
* Update SolrCloudExampleTest to use the PostTool instead of simulating its usage.
* Make the long form of -url an explicit --solr-update-url, to be clear about what it's for.
---
solr/CHANGES.txt | 3 +
.../src/java/org/apache/solr/cli/PostTool.java | 1166 +++++++++++++++++++-
.../java/org/apache/solr/cli/RunExampleTool.java | 30 +-
.../src/test/org/apache/solr/cli/PostToolTest.java | 220 +++-
.../apache/solr/cloud/SolrCloudExampleTest.java | 57 +-
solr/packaging/test/test_post.bats | 22 +-
6 files changed, 1408 insertions(+), 90 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index ab43eb599a7..54c96bc87bb 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -26,6 +26,9 @@ Improvements * SOLR-17145: The INSTALLSHARDDATA API now includes a 'requestid' field when run asynchronously (Jason Gerlowski) +* SOLR-17159: bin/solr post now has proper unit testing. Users can specify a --dry-run option to + simulate posting documents without sending them to Solr. (Eric Pugh) + Optimizations --------------------- * SOLR-17144: Close searcherExecutor thread per core after 1 minute (Pierre Salagnac, Christine Poerschke) diff --git a/solr/core/src/java/org/apache/solr/cli/PostTool.java b/solr/core/src/java/org/apache/solr/cli/PostTool.java index de716c131cb..0e3bc6b77c1 100644 --- a/solr/core/src/java/org/apache/solr/cli/PostTool.java +++ b/solr/core/src/java/org/apache/solr/cli/PostTool.java @@ -16,15 +16,144 @@ */ package org.apache.solr.cli; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.ProtocolException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; +import java.net.URLEncoder; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.security.GeneralSecurityException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import java.util.zip.GZIPInputStream; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import 
javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; +import org.apache.solr.client.api.util.SolrVersion; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.common.util.Utils; +import org.apache.solr.util.RTimer; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; public class PostTool extends ToolBase { + public static final String DEFAULT_FILE_TYPES = + "xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log"; + static final String DATA_MODE_FILES = "files"; + static final String DATA_MODE_ARGS = "args"; + static final String DATA_MODE_STDIN = "stdin"; + static final String DEFAULT_DATA_MODE = DATA_MODE_FILES; + static final String FORMAT_SOLR = "solr"; + static final String DATA_MODE_WEB = "web"; + + private static final int DEFAULT_WEB_DELAY = 10; + private static final int MAX_WEB_DEPTH = 10; + public static final String DEFAULT_CONTENT_TYPE = "application/json"; + + // Input args + int recursive = 0; + int delay = 0; + String fileTypes = PostTool.DEFAULT_FILE_TYPES; + URL solrUpdateUrl; + String credentials; + OutputStream out = null; + String type; + String format; + String mode = DEFAULT_DATA_MODE; + boolean commit; + boolean optimize; + boolean dryRun; // Avoids actual network traffic to Solr + + String[] args; + + boolean auto = true; + private int currentDepth; + + static HashMap<String, String> mimeMap; + FileFilter fileFilter; + // Backlog for crawling + List<LinkedHashSet<URI>> backlog = new ArrayList<>(); + Set<URI> visited = new HashSet<>(); + + static final Set<String> DATA_MODES = new HashSet<>(); + + PostTool.PageFetcher pageFetcher = new PostTool.PageFetcher(); + + static { + DATA_MODES.add(DATA_MODE_FILES); + DATA_MODES.add(DATA_MODE_ARGS); + DATA_MODES.add(DATA_MODE_STDIN); + DATA_MODES.add(DATA_MODE_WEB); + + mimeMap = new HashMap<>(); + mimeMap.put("xml", "application/xml"); + mimeMap.put("csv", "text/csv"); + mimeMap.put("json", "application/json"); + mimeMap.put("jsonl", "application/jsonl"); + mimeMap.put("pdf", "application/pdf"); + mimeMap.put("rtf", "text/rtf"); + mimeMap.put("html", "text/html"); + mimeMap.put("htm", "text/html"); + mimeMap.put("doc", "application/msword"); + mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + mimeMap.put("ppt", "application/vnd.ms-powerpoint"); + mimeMap.put( + "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + mimeMap.put("xls", "application/vnd.ms-excel"); + mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + mimeMap.put("odt", "application/vnd.oasis.opendocument.text"); + mimeMap.put("ott", "application/vnd.oasis.opendocument.text"); + mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation"); + mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation"); + mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet"); + mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet"); + mimeMap.put("txt", "text/plain"); + mimeMap.put("log", "text/plain"); + } + public PostTool() { this(CLIO.getOutStream()); } @@ -43,9 +172,10 @@ public class PostTool extends ToolBase { return List.of( Option.builder("url") .argName("url") + .longOpt("solr-update-url") .hasArg() .required(false) 
- .desc("<base Solr update URL>") + .desc("Solr Update URL, the full url to the update handler, including the /update.") .build(), Option.builder("c") .longOpt("name") @@ -66,7 +196,8 @@ public class PostTool extends ToolBase { .argName("mode") .hasArg(true) .required(false) - .desc("Files crawls files, web crawls website. default: files.") + .desc( + "Files crawls files, web crawls website, args processes input args, and stdin reads a command from standard in. default: files.") .build(), Option.builder("recursive") .argName("recursive") @@ -85,13 +216,13 @@ public class PostTool extends ToolBase { .argName("content-type") .hasArg(true) .required(false) - .desc("default: application/json") + .desc("Specify a specific mimetype to use, such as application/json.") .build(), Option.builder("filetypes") .argName("<type>[,<type>,...]") .hasArg(true) .required(false) - .desc("default: " + SimplePostTool.DEFAULT_FILE_TYPES) + .desc("default: " + DEFAULT_FILE_TYPES) .build(), Option.builder("params") .argName("<key>=<value>[&<key>=<value>...]") @@ -107,6 +238,12 @@ public class PostTool extends ToolBase { .required(false) .desc( "sends application/json content as Solr commands to /update instead of /update/json/docs.") + .build(), + Option.builder() + .longOpt("dry-run") + .required(false) + .desc( + "Performs a dry run of the posting process without actually sending documents to Solr. Only works with files mode.") .build()); } @@ -114,52 +251,1027 @@ public class PostTool extends ToolBase { public void runImpl(CommandLine cli) throws Exception { SolrCLI.raiseLogLevelUnlessVerbose(cli); - URL solrUrl = null; + solrUpdateUrl = null; if (cli.hasOption("url")) { String url = cli.getOptionValue("url"); - solrUrl = new URL(url); + solrUpdateUrl = new URL(url); } else if (cli.hasOption("c")) { String url = SolrCLI.getDefaultSolrUrl() + "/solr/" + cli.getOptionValue("c") + "/update"; - solrUrl = new URL(url); + solrUpdateUrl = new URL(url); } else { throw new IllegalArgumentException( "Must specify either -url or -c parameter to post documents."); } - String mode = SimplePostTool.DEFAULT_DATA_MODE; if (cli.hasOption("mode")) { mode = cli.getOptionValue("mode"); } - boolean auto = true; - String type = null; + + if (cli.hasOption("dry-run")) { + dryRun = true; + } + if (cli.hasOption("type")) { type = cli.getOptionValue("type"); + // Turn off automatically looking up the mimetype in favour of what is passed in. + auto = false; } - String format = - cli.hasOption("format") - ? SimplePostTool.FORMAT_SOLR - : ""; // i.e not solr formatted json commands + format = cli.hasOption("format") ? FORMAT_SOLR : ""; // i.e not solr formatted json commands - String fileTypes = SimplePostTool.DEFAULT_FILE_TYPES; if (cli.hasOption("filetypes")) { fileTypes = cli.getOptionValue("filetypes"); } - int defaultDelay = (mode.equals((SimplePostTool.DATA_MODE_WEB)) ? 10 : 0); - int delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay))); - int recursive = Integer.parseInt(cli.getOptionValue("recursive", "1")); + int defaultDelay = (mode.equals((DATA_MODE_WEB)) ? 10 : 0); + delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay))); + recursive = Integer.parseInt(cli.getOptionValue("recursive", "1")); - OutputStream out = cli.hasOption("out") ? CLIO.getOutStream() : null; - boolean commit = cli.hasOption("skipcommit") ? false : true; - boolean optimize = cli.hasOption("optimize"); + out = cli.hasOption("out") ? CLIO.getOutStream() : null; + commit = cli.hasOption("skipcommit") ? 
false : true; + optimize = cli.hasOption("optimize"); - String[] args = cli.getArgs(); + args = cli.getArgs(); - SimplePostTool spt = - new SimplePostTool( - mode, solrUrl, auto, type, format, recursive, delay, fileTypes, out, commit, optimize, - args); + execute(); + } + + /** + * After initialization, call execute to start the post job. This method delegates to the correct + * mode method. + */ + public void execute() throws SolrServerException, IOException { + final RTimer timer = new RTimer(); + if (PostTool.DATA_MODE_FILES.equals(mode)) { + doFilesMode(); + } else if (DATA_MODE_ARGS.equals(mode)) { + doArgsMode(args); + } else if (PostTool.DATA_MODE_WEB.equals(mode)) { + doWebMode(); + } else if (DATA_MODE_STDIN.equals(mode)) { + doStdinMode(); + } else { + return; + } + + if (commit) { + commit(); + } + if (optimize) { + optimize(); + } + displayTiming((long) timer.getTime()); + } + + private void doFilesMode() { + currentDepth = 0; + + info( + "Posting files to [base] url " + + solrUpdateUrl + + (!auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "") + + "..."); + if (auto) { + info("Entering auto mode. File endings considered are " + fileTypes); + } + if (recursive > 0) { + info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s"); + } + fileFilter = getFileFilterFromFileTypes(fileTypes); + int numFilesPosted = postFiles(args, 0, out, type); + if (dryRun) { + info("Dry run complete. " + numFilesPosted + " would have been indexed."); + } else { + info(numFilesPosted + " files indexed."); + } + } + + private void doArgsMode(String[] args) { + info("POSTing args to " + solrUpdateUrl + "..."); + for (String a : args) { + postData(stringToStream(a), null, out, type, solrUpdateUrl); + } + } + + private void doWebMode() { + reset(); + int numPagesPosted = 0; + try { + if (type != null) { + throw new IllegalArgumentException( + "Specifying content-type with \"-Ddata=web\" is not supported"); + } + + // Set Extracting handler as default + solrUpdateUrl = appendUrlPath(solrUpdateUrl, "/extract"); + + info("Posting web pages to Solr url " + solrUpdateUrl); + auto = true; + info( + "Entering auto mode. 
Indexing pages with content-types corresponding to file endings " + + fileTypes); + if (recursive > 0) { + if (recursive > MAX_WEB_DEPTH) { + recursive = MAX_WEB_DEPTH; + warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "..."); + } + if (delay < DEFAULT_WEB_DELAY) { + warn( + "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked"); + } + info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s"); + } + numPagesPosted = postWebPages(args, 0, out); + info(numPagesPosted + " web pages indexed."); + + } catch (MalformedURLException e) { + warn("Wrong URL trying to append /extract to " + solrUpdateUrl); + } + } + + private void doStdinMode() { + info("POSTing stdin to " + solrUpdateUrl + "..."); + postData(System.in, null, out, type, solrUpdateUrl); + } + + private void reset() { + backlog = new ArrayList<>(); + visited = new HashSet<>(); + } + + /** + * Pretty prints the number of milliseconds taken to post the content to Solr + * + * @param millis the time in milliseconds + */ + private void displayTiming(long millis) { + SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault()); + df.setTimeZone(TimeZone.getTimeZone("UTC")); + CLIO.out("Time spent: " + df.format(new Date(millis))); + } + + private boolean checkIsValidPath(File srcFile) { + return Files.exists(srcFile.toPath()); + } + + /** + * Post all filenames provided in args + * + * @param args array of file names + * @param startIndexInArgs offset to start + * @param out output stream to post data to + * @param type default content-type to use when posting (may be overridden in auto mode) + * @return number of files posted + */ + public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) { + reset(); + int filesPosted = 0; + for (int j = startIndexInArgs; j < args.length; j++) { + File srcFile = new File(args[j]); + filesPosted = getFilesPosted(out, type, srcFile); + } + return filesPosted; + } + + private int getFilesPosted(final OutputStream out, final String type, final File srcFile) { + int filesPosted = 0; + boolean isValidPath = checkIsValidPath(srcFile); + if (isValidPath && srcFile.isDirectory() && srcFile.canRead()) { + filesPosted += postDirectory(srcFile, out, type); + } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) { + filesPosted += postFiles(new File[] {srcFile}, out, type); + } else { + filesPosted += handleGlob(srcFile, out, type); + } + return filesPosted; + } + + /** + * Posts a whole directory + * + * @return number of files posted total + */ + private int postDirectory(File dir, OutputStream out, String type) { + if (dir.isHidden() && !dir.getName().equals(".")) { + return (0); + } + info( + "Indexing directory " + + dir.getPath() + + " (" + + dir.listFiles(fileFilter).length + + " files, depth=" + + currentDepth + + ")"); + int posted = 0; + posted += postFiles(dir.listFiles(fileFilter), out, type); + if (recursive > currentDepth) { + for (File d : dir.listFiles()) { + if (d.isDirectory()) { + currentDepth++; + posted += postDirectory(d, out, type); + currentDepth--; + } + } + } + return posted; + } + + /** + * Posts a list of file names + * + * @return number of files posted + */ + int postFiles(File[] files, OutputStream out, String type) { + int filesPosted = 0; + for (File srcFile : files) { + try { + if (!srcFile.isFile() || srcFile.isHidden()) { + continue; + } + postFile(srcFile, out, type); + Thread.sleep(delay * 1000L); + filesPosted++; + } catch 
(InterruptedException | MalformedURLException e) { + throw new RuntimeException(e); + } + } + return filesPosted; + } + + /** + * This only handles file globs not full path globbing. + * + * @param globFile file holding glob path + * @param out outputStream to write results to + * @param type default content-type to use when posting (may be overridden in auto mode) + * @return number of files posted + */ + int handleGlob(File globFile, OutputStream out, String type) { + int filesPosted = 0; + File parent = globFile.getParentFile(); + if (parent == null) { + parent = new File("."); + } + String fileGlob = globFile.getName(); + PostTool.GlobFileFilter ff = new PostTool.GlobFileFilter(fileGlob, false); + File[] fileList = parent.listFiles(ff); + if (fileList == null || fileList.length == 0) { + warn("No files or directories matching " + globFile); + } else { + filesPosted = postFiles(fileList, out, type); + } + return filesPosted; + } + + /** + * This method takes as input a list of start URL strings for crawling, converts the URL strings + * to URI strings and adds each one to a backlog and then starts crawling + * + * @param args the raw input args from main() + * @param startIndexInArgs offset for where to start + * @param out outputStream to write results to + * @return the number of web pages posted + */ + public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) { + reset(); + LinkedHashSet<URI> s = new LinkedHashSet<>(); + for (int j = startIndexInArgs; j < args.length; j++) { + try { + URI uri = new URI(normalizeUrlEnding(args[j])); + s.add(uri); + } catch (URISyntaxException e) { + warn("Skipping malformed input URL: " + args[j]); + } + } + // Add URIs to level 0 of the backlog and start recursive crawling + backlog.add(s); + return webCrawl(0, out); + } + + /** + * Normalizes a URL string by removing anchor part and trailing slash + * + * @return the normalized URL string + */ + protected static String normalizeUrlEnding(String link) { + if (link.contains("#")) { + link = link.substring(0, link.indexOf('#')); + } + if (link.endsWith("?")) { + link = link.substring(0, link.length() - 1); + } + if (link.endsWith("/")) { + link = link.substring(0, link.length() - 1); + } + return link; + } + + /** + * A very simple crawler, pulling URLs to fetch from a backlog and then recurses N levels deep if + * recursive>0. Links are parsed from HTML through first getting an XHTML version using + * SolrCell with extractOnly, and followed if they are local. The crawler pauses for a default + * delay of 10 seconds between each fetch, this can be configured in the delay variable. This is + * only meant for test purposes, as it does not respect robots or anything else fancy :) + * + * @param level which level to crawl + * @param out output stream to write to + * @return number of pages crawled on this level and below + */ + protected int webCrawl(int level, OutputStream out) { + int numPages = 0; + LinkedHashSet<URI> stack = backlog.get(level); + int rawStackSize = stack.size(); + stack.removeAll(visited); + int stackSize = stack.size(); + LinkedHashSet<URI> subStack = new LinkedHashSet<>(); + info( + "Entering crawl at level " + + level + + " (" + + rawStackSize + + " links total, " + + stackSize + + " new)"); + for (URI uri : stack) { + try { + visited.add(uri); + URL url = uri.toURL(); + PostTool.PageFetcherResult result = pageFetcher.readPageFromUrl(url); + if (result.httpStatus == 200) { + url = (result.redirectUrl != null) ? 
result.redirectUrl : url; + URL postUrl = + new URL( + appendParam( + solrUpdateUrl.toString(), + "literal.id=" + + URLEncoder.encode(url.toString(), UTF_8) + + "&literal.url=" + + URLEncoder.encode(url.toString(), UTF_8))); + ByteBuffer content = result.content; + boolean success = + postData( + new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), + null, + out, + result.contentType, + postUrl); + if (success) { + info("POSTed web resource " + url + " (depth: " + level + ")"); + Thread.sleep(delay * 1000L); + numPages++; + // Pull links from HTML pages only + if (recursive > level && result.contentType.equals("text/html")) { + Set<URI> children = + pageFetcher.getLinksFromWebPage( + url, + new ByteArrayInputStream( + content.array(), content.arrayOffset(), content.limit()), + result.contentType, + postUrl); + subStack.addAll(children); + } + } else { + warn("An error occurred while posting " + uri); + } + } else { + warn("The URL " + uri + " returned a HTTP result status of " + result.httpStatus); + } + } catch (IOException | URISyntaxException e) { + warn("Caught exception when trying to open connection to " + uri + ": " + e.getMessage()); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + if (!subStack.isEmpty()) { + backlog.add(subStack); + numPages += webCrawl(level + 1, out); + } + return numPages; + } + + /** + * Computes the full URL based on a base url and a possibly relative link found in the href param + * of an HTML anchor. + * + * @param baseUrl the base url from where the link was found + * @param link the absolute or relative link + * @return the string version of the full URL + */ + protected String computeFullUrl(URL baseUrl, String link) { + if (link == null || link.length() == 0) { + return null; + } + if (!link.startsWith("http")) { + if (link.startsWith("/")) { + link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link; + } else { + if (link.contains(":")) { + return null; // Skip non-relative URLs + } + String path = baseUrl.getPath(); + if (!path.endsWith("/")) { + int sep = path.lastIndexOf('/'); + String file = path.substring(sep + 1); + if (file.contains(".") || file.contains("?")) { + path = path.substring(0, sep); + } + } + link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link; + } + } + link = normalizeUrlEnding(link); + String l = link.toLowerCase(Locale.ROOT); + // Simple brute force skip images + if (l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) { + return null; // Skip images + } + return link; + } + + /** + * Uses the mime-type map to reverse lookup whether the file ending for our type is supported by + * the fileTypes option + * + * @param type what content-type to lookup + * @return true if this is a supported content type + */ + protected boolean typeSupported(String type) { + for (Map.Entry<String, String> entry : mimeMap.entrySet()) { + if (entry.getValue().equals(type)) { + if (fileTypes.contains(entry.getKey())) { + return true; + } + } + } + return false; + } + + static void warn(String msg) { + CLIO.err("PostTool: WARNING: " + msg); + } + + static void info(String msg) { + CLIO.out(msg); + } + + /** Does a simple commit operation */ + public void commit() throws IOException, SolrServerException { + info("COMMITting Solr index changes to " + solrUpdateUrl + "..."); + String url = solrUpdateUrl.toString(); + url = url.substring(0, url.lastIndexOf("/update")); + try (final SolrClient client = 
SolrCLI.getSolrClient(url, credentials)) { + client.commit(); + } + } + + /** Does a simple optimize operation */ + public void optimize() throws IOException, SolrServerException { + info("Performing an OPTIMIZE to " + solrUpdateUrl + "..."); + String url = solrUpdateUrl.toString(); + url = url.substring(0, url.lastIndexOf("/update")); + try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) { + client.optimize(); + } + } + + /** + * Appends a URL query parameter to a URL + * + * @param url the original URL + * @param param the parameter(s) to append, separated by "&" + * @return the string version of the resulting URL + */ + public static String appendParam(String url, String param) { + String[] pa = param.split("&"); + for (String p : pa) { + if (p.trim().length() == 0) { + continue; + } + String[] kv = p.split("="); + if (kv.length == 2) { + url = url + (url.contains("?") ? "&" : "?") + kv[0] + "=" + kv[1]; + } else { + warn("Skipping param " + p + " which is not on form key=value"); + } + } + return url; + } + + /** Opens the file and posts its contents to the solrUrl, writes to response to output. */ + public void postFile(File file, OutputStream output, String type) throws MalformedURLException { + InputStream is = null; + + URL url = solrUpdateUrl; + String suffix = ""; + if (auto) { + if (type == null) { + type = guessType(file); + } + // TODO: Add a flag that disables /update and sends all to /update/extract, to avoid CSV, + // JSON, and XML files + // TODO: from being interpreted as Solr documents internally + if (type.equals("application/json") && !PostTool.FORMAT_SOLR.equals(format)) { + suffix = "/json/docs"; + String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString(); + url = new URL(urlStr); + } else if (type.equals("application/xml") + || type.equals("text/csv") + || type.equals("application/json")) { + // Default handler + } else { + // SolrCell + suffix = "/extract"; + String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString(); + if (!urlStr.contains("resource.name")) { + urlStr = + appendParam( + urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8)); + } + if (!urlStr.contains("literal.id")) { + urlStr = + appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8)); + } + url = new URL(urlStr); + } + } else { + if (type == null) { + type = DEFAULT_CONTENT_TYPE; + } + } + if (dryRun) { + info( + "DRY RUN of POSTing file " + + file.getName() + + (auto ? " (" + type + ")" : "") + + " to [base]" + + suffix); + } else { + try { + info( + "POSTing file " + + file.getName() + + (auto ? " (" + type + ")" : "") + + " to [base]" + + suffix); + is = new FileInputStream(file); + postData(is, file.length(), output, type, url); + } catch (IOException e) { + warn("Can't open/read file: " + file); + } finally { + try { + if (is != null) { + is.close(); + } + } catch (IOException e) { + warn("IOException while closing file: " + e); + } + } + } + } + + /** + * Appends to the path of the URL + * + * @param url the URL + * @param append the path to append + * @return the final URL version + */ + protected static URL appendUrlPath(URL url, String append) throws MalformedURLException { + return new URL( + url.getProtocol() + + "://" + + url.getAuthority() + + url.getPath() + + append + + (url.getQuery() != null ? "?" + url.getQuery() : "")); + } + + /** + * Guesses the type of file, based on file name suffix Returns "application/octet-stream" if no + * corresponding mimeMap type. 
+ * + * @param file the file + * @return the content-type guessed + */ + protected static String guessType(File file) { + String name = file.getName(); + String suffix = name.substring(name.lastIndexOf('.') + 1); + String type = mimeMap.get(suffix.toLowerCase(Locale.ROOT)); + return (type != null) ? type : "application/octet-stream"; + } + + /** + * Reads data from the data stream and posts it to solr, writes to the response to output + * + * @return true if success + */ + public boolean postData( + InputStream data, Long length, OutputStream output, String type, URL url) { + if (dryRun) { + return true; + } + + boolean success = true; + if (type == null) { + type = DEFAULT_CONTENT_TYPE; + } + HttpURLConnection urlConnection = null; + try { + try { + urlConnection = (HttpURLConnection) url.openConnection(); + try { + urlConnection.setRequestMethod("POST"); + } catch (ProtocolException e) { + warn("Shouldn't happen: HttpURLConnection doesn't support POST??" + e); + } + urlConnection.setDoOutput(true); + urlConnection.setDoInput(true); + urlConnection.setUseCaches(false); + urlConnection.setAllowUserInteraction(false); + urlConnection.setRequestProperty("Content-type", type); + basicAuth(urlConnection); + if (null != length) { + urlConnection.setFixedLengthStreamingMode(length); + } else { + urlConnection.setChunkedStreamingMode(-1); // use JDK default chunkLen, 4k in Java 8. + } + urlConnection.connect(); + } catch (IOException e) { + warn("Connection error (is Solr running at " + solrUpdateUrl + " ?): " + e); + success = false; + } catch (Exception e) { + warn("POST failed with error " + e.getMessage()); + } + + try (final OutputStream out = urlConnection.getOutputStream()) { + pipe(data, out); + } catch (IOException e) { + warn("IOException while posting data: " + e); + } + + try { + success &= checkResponseCode(urlConnection); + try (final InputStream in = urlConnection.getInputStream()) { + pipe(in, output); + } + } catch (IOException e) { + warn("IOException while reading response: " + e); + success = false; + } catch (GeneralSecurityException e) { + warn( + "Looks like Solr is secured and would not let us in. Try with another user in '-u' parameter"); + } + } finally { + if (urlConnection != null) { + urlConnection.disconnect(); + } + } + return success; + } + + private void basicAuth(HttpURLConnection urlc) throws Exception { + if (urlc.getURL().getUserInfo() != null) { + String encoding = + Base64.getEncoder().encodeToString(urlc.getURL().getUserInfo().getBytes(US_ASCII)); + urlc.setRequestProperty("Authorization", "Basic " + encoding); + } else if (credentials != null) { + if (!credentials.contains(":")) { + throw new Exception("credentials '" + credentials + "' must be of format user:pass"); + } + urlc.setRequestProperty( + "Authorization", + "Basic " + Base64.getEncoder().encodeToString(credentials.getBytes(UTF_8))); + } + } + + private static boolean checkResponseCode(HttpURLConnection urlc) + throws IOException, GeneralSecurityException { + if (urlc.getResponseCode() >= 400) { + warn( + "Solr returned an error #" + + urlc.getResponseCode() + + " (" + + urlc.getResponseMessage() + + ") for url: " + + urlc.getURL()); + Charset charset = StandardCharsets.ISO_8859_1; + final String contentType = urlc.getContentType(); + // code cloned from ContentStreamBase, but post.jar should be standalone! 
+ if (contentType != null) { + int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset="); + if (idx > 0) { + charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim()); + } + } + // Print the response returned by Solr + try (InputStream errStream = urlc.getErrorStream()) { + if (errStream != null) { + BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset)); + final StringBuilder response = new StringBuilder("Response: "); + int ch; + while ((ch = br.read()) != -1) { + response.append((char) ch); + } + warn(response.toString().trim()); + } + } + if (urlc.getResponseCode() == 401) { + throw new GeneralSecurityException( + "Solr requires authentication (response 401). Please try again with '-u' option"); + } + if (urlc.getResponseCode() == 403) { + throw new GeneralSecurityException( + "You are not authorized to perform this action against Solr. (response 403)"); + } + return false; + } + return true; + } + + /** + * Converts a string to an input stream + * + * @param s the string + * @return the input stream + */ + public static InputStream stringToStream(String s) { + return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); + } + + /** + * Pipes everything from the source to the dest. If dest is null, then everything is read from + * source and thrown away. + */ + private static void pipe(InputStream source, OutputStream dest) throws IOException { + byte[] buf = new byte[1024]; + int read = 0; + while ((read = source.read(buf)) >= 0) { + if (null != dest) { + dest.write(buf, 0, read); + } + } + if (null != dest) { + dest.flush(); + } + } + + public FileFilter getFileFilterFromFileTypes(String fileTypes) { + String glob; + if (fileTypes.equals("*")) { + glob = ".*"; + } else { + glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$"; + } + return new PostTool.GlobFileFilter(glob, true); + } + + // + // Utility methods for XPath handing + // + + /** Gets all nodes matching an XPath */ + public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException { + XPathFactory factory = XPathFactory.newInstance(); + XPath xp = factory.newXPath(); + XPathExpression expr = xp.compile(xpath); + return (NodeList) expr.evaluate(n, XPathConstants.NODESET); + } + + /** + * Gets the string content of the matching an XPath + * + * @param n the node (or doc) + * @param xpath the xpath string + * @param concatAll if true, text from all matching nodes will be concatenated, else only the + * first returned + */ + public static String getXP(Node n, String xpath, boolean concatAll) + throws XPathExpressionException { + NodeList nodes = getNodesFromXP(n, xpath); + StringBuilder sb = new StringBuilder(); + if (nodes.getLength() > 0) { + for (int i = 0; i < nodes.getLength(); i++) { + sb.append(nodes.item(i).getNodeValue()).append(' '); + if (!concatAll) { + break; + } + } + return sb.toString().trim(); + } else return ""; + } + + /** Takes a string as input and returns a DOM */ + public static Document makeDom(byte[] in) + throws SAXException, IOException, ParserConfigurationException { + InputStream is = new ByteArrayInputStream(in); + Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is); + return dom; + } + + /** Inner class to filter files based on glob wildcards */ + static class GlobFileFilter implements FileFilter { + private final Pattern p; + + public GlobFileFilter(String pattern, boolean isRegex) { + String _pattern = pattern; + if (!isRegex) { + _pattern = + _pattern + .replace("^", 
"\\^") + .replace("$", "\\$") + .replace(".", "\\.") + .replace("(", "\\(") + .replace(")", "\\)") + .replace("+", "\\+") + .replace("*", ".*") + .replace("?", "."); + _pattern = "^" + _pattern + "$"; + } + + try { + p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Invalid type list " + pattern + ". " + e.getDescription()); + } + } + + @Override + public boolean accept(File file) { + return p.matcher(file.getName()).find(); + } + } + + // + // Simple crawler class which can fetch a page and check for robots.txt + // + class PageFetcher { + Map<String, List<String>> robotsCache; + static final String DISALLOW = "Disallow:"; + + public PageFetcher() { + robotsCache = new HashMap<>(); + } + + public PageFetcherResult readPageFromUrl(URL u) throws URISyntaxException { + PostTool.PageFetcherResult res = new PostTool.PageFetcherResult(); + try { + if (isDisallowedByRobots(u)) { + warn("The URL " + u + " is disallowed by robots.txt and will not be crawled."); + res.httpStatus = 403; + URI uri = u.toURI(); + visited.add(uri); + return res; + } + res.httpStatus = 404; + HttpURLConnection conn = (HttpURLConnection) u.openConnection(); + conn.setRequestProperty( + "User-Agent", + "PostTool-crawler/" + SolrVersion.LATEST_STRING + " (https://solr.apache.org/)"); + conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); + conn.connect(); + res.httpStatus = conn.getResponseCode(); + if (!normalizeUrlEnding(conn.getURL().toString()) + .equals(normalizeUrlEnding(u.toString()))) { + info("The URL " + u + " caused a redirect to " + conn.getURL()); + u = conn.getURL(); + res.redirectUrl = u; + URI uri = u.toURI(); + visited.add(uri); + } + if (res.httpStatus == 200) { + // Raw content type of form "text/html; encoding=utf-8" + String rawContentType = conn.getContentType(); + String type = rawContentType.split(";")[0]; + if (typeSupported(type) || "*".equals(fileTypes)) { + String encoding = conn.getContentEncoding(); + InputStream is; + if (encoding != null && encoding.equalsIgnoreCase("gzip")) { + is = new GZIPInputStream(conn.getInputStream()); + } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { + is = new InflaterInputStream(conn.getInputStream(), new Inflater(true)); + } else { + is = conn.getInputStream(); + } + + // Read into memory, so that we later can pull links from the page without re-fetching + res.content = Utils.toByteArray(is); + is.close(); + } else { + warn("Skipping URL with unsupported type " + type); + res.httpStatus = 415; + } + } + } catch (IOException e) { + warn("IOException when reading page from url " + u + ": " + e.getMessage()); + } + return res; + } + + public boolean isDisallowedByRobots(URL url) { + String host = url.getHost(); + String strRobot = url.getProtocol() + "://" + host + "/robots.txt"; + List<String> disallows = robotsCache.get(host); + if (disallows == null) { + disallows = new ArrayList<>(); + URL urlRobot; + try { + urlRobot = new URL(strRobot); + disallows = parseRobotsTxt(urlRobot.openStream()); + } catch (MalformedURLException e) { + return true; // We cannot trust this robots URL, should not happen + } catch (IOException e) { + // There is no robots.txt, will cache an empty disallow list + } + } + + robotsCache.put(host, disallows); + + String strURL = url.getFile(); + for (String path : disallows) { + if (path.equals("/") || strURL.indexOf(path) == 0) return true; + } + return false; + } + + /** + * Very simple robots.txt parser which obeys all 
Disallow lines regardless of user agent or + * whether there are valid Allow: lines. + * + * @param is Input stream of the robots.txt file + * @return a list of disallow paths + * @throws IOException if problems reading the stream + */ + protected List<String> parseRobotsTxt(InputStream is) throws IOException { + List<String> disallows = new ArrayList<>(); + BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + String l; + while ((l = r.readLine()) != null) { + String[] arr = l.split("#"); + if (arr.length == 0) continue; + l = arr[0].trim(); + if (l.startsWith(DISALLOW)) { + l = l.substring(DISALLOW.length()).trim(); + if (l.length() == 0) continue; + disallows.add(l); + } + } + is.close(); + return disallows; + } + + /** + * Finds links on a web page, using /extract?extractOnly=true + * + * @param url the URL of the web page + * @param is the input stream of the page + * @param type the content-type + * @param postUrl the URL (typically /solr/extract) in order to pull out links + * @return a set of URIs parsed from the page + */ + protected Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) { + Set<URI> linksFromPage = new HashSet<>(); + + try { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true")); + extractUrl = new URL(appendParam(extractUrl.toString(), "wt=xml")); + boolean success = postData(is, null, os, type, extractUrl); + if (success) { + Document d = makeDom(os.toByteArray()); + String innerXml = getXP(d, "/response/str/text()[1]", false); + d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8)); + NodeList links = getNodesFromXP(d, "/html/body//a/@href"); + for (int i = 0; i < links.getLength(); i++) { + String link = links.item(i).getTextContent(); + link = computeFullUrl(url, link); + if (link == null) { + continue; + } + URI newUri = new URI(link); + if (newUri.getAuthority() == null + || !newUri.getAuthority().equals(url.getAuthority())) { + linksFromPage.add(newUri); + } + } + } + } catch (MalformedURLException e) { + warn("Malformed URL " + url); + } catch (IOException e) { + warn("IOException opening URL " + url + ": " + e.getMessage()); + } catch (Exception e) { + throw new RuntimeException(e); + } + + return linksFromPage; + } + } - spt.execute(); + /** Utility class to hold the result form a page fetch */ + public static class PageFetcherResult { + int httpStatus = 200; + String contentType = "text/html"; + URL redirectUrl = null; + ByteBuffer content; } } diff --git a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java index 908990db22d..680a879372e 100644 --- a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java +++ b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java @@ -315,22 +315,20 @@ public class RunExampleTool extends ToolBase { String updateUrl = String.format(Locale.ROOT, "%s/%s/update", solrUrl, collectionName); echo("Indexing tech product example docs from " + exampledocsDir.getAbsolutePath()); - String currentPropVal = System.getProperty("url"); - System.setProperty("url", updateUrl); - String currentTypeVal = System.getProperty("type"); - // We assume that example docs are always in XML. 
- System.setProperty("type", "application/xml"); - SimplePostTool.main(new String[] {exampledocsDir.getAbsolutePath() + "/*.xml"}); - if (currentPropVal != null) { - System.setProperty("url", currentPropVal); // reset - } else { - System.clearProperty("url"); - } - if (currentTypeVal != null) { - System.setProperty("type", currentTypeVal); // reset - } else { - System.clearProperty("type"); - } + String[] args = + new String[] { + "post", + "-url", + updateUrl, + "-type", + "application/xml", + exampledocsDir.getAbsolutePath() + "/*.xml" + }; + PostTool postTool = new PostTool(); + CommandLine postToolCli = + SolrCLI.parseCmdLine(postTool.getName(), args, postTool.getOptions()); + postTool.runTool(postToolCli); + } else { echo( "exampledocs directory not found, skipping indexing step for the techproducts example"); diff --git a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java index 88639c7cefc..e11c11884f1 100644 --- a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java +++ b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java @@ -20,10 +20,22 @@ package org.apache.solr.cli; import static org.apache.solr.cli.SolrCLI.findTool; import static org.apache.solr.cli.SolrCLI.parseCmdLine; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.solrj.request.CollectionAdminRequest; @@ -33,6 +45,11 @@ import org.apache.solr.common.util.Utils; import org.junit.BeforeClass; import org.junit.Test; +/** + * NOTE: do *not* use real hostnames, not even "example.com", in the webcrawler tests. + * + * <p>A MockPageFetcher is used to prevent real HTTP requests from being executed. 
+ */ @SolrTestCaseJ4.SuppressSSL public class PostToolTest extends SolrCloudTestCase { @@ -58,7 +75,7 @@ public class PostToolTest extends SolrCloudTestCase { String[] args = { "post", - "-url", + "--solr-update-url", cluster.getJettySolrRunner(0).getBaseUrl() + "/" + collection + "/update", jsonDoc.getAbsolutePath() }; @@ -90,4 +107,205 @@ public class PostToolTest extends SolrCloudTestCase { CommandLine cli = parseCmdLine(tool.getName(), args, tool.getOptions()); return tool.runTool(cli); } + + @Test + public void testNormalizeUrlEnding() { + assertEquals("http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/")); + assertEquals( + "http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz")); + assertEquals( + "http://[ff01::114]/index.html", + PostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello")); + } + + @Test + public void testComputeFullUrl() throws IOException { + + PostTool webPostTool = new PostTool(); + + assertEquals( + "http://[ff01::114]/index.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html")); + assertEquals( + "http://[ff01::114]/index.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html")); + assertEquals( + "http://[ff01::114]/fil.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html")); + // TODO: How to know what is the base if URL path ends with "foo"?? + // assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new + // URL("http://[ff01::114]/foo?baz#hello"), "fil.html")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:[email protected]")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file")); + } + + @Test + public void testTypeSupported() { + PostTool postTool = new PostTool(); + + assertTrue(postTool.typeSupported("application/pdf")); + assertTrue(postTool.typeSupported("application/xml")); + assertFalse(postTool.typeSupported("text/foo")); + + postTool.fileTypes = "doc,xls,ppt"; + postTool.fileFilter = postTool.getFileFilterFromFileTypes(postTool.fileTypes); + assertFalse(postTool.typeSupported("application/pdf")); + assertTrue(postTool.typeSupported("application/msword")); + } + + @Test + public void testAppendParam() { + assertEquals( + "http://[ff01::114]?foo=bar", PostTool.appendParam("http://[ff01::114]", "foo=bar")); + assertEquals( + "http://[ff01::114]/?a=b&foo=bar", + PostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar")); + } + + @Test + public void testAppendUrlPath() throws MalformedURLException { + assertEquals( + new URL("http://[ff01::114]/a?foo=bar"), + PostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a")); + } + + @Test + public void testGuessType() { + File f = new File("foo.doc"); + assertEquals("application/msword", PostTool.guessType(f)); + f = new File("foobar"); + assertEquals("application/octet-stream", PostTool.guessType(f)); + f = new File("foo.json"); + assertEquals("application/json", PostTool.guessType(f)); + } + + @Test + public void testDoFilesMode() throws MalformedURLException { + PostTool postTool = new PostTool(); + postTool.recursive = 0; + postTool.dryRun = true; + postTool.solrUpdateUrl = new URL("http://localhost:8983/solr/fake/update"); + File dir = getFile("exampledocs"); + int num = postTool.postFiles(new String[] {dir.toString()}, 0, null, null); + assertEquals(2, num); + } + 
+ @Test + public void testDoWebMode() throws IOException, URISyntaxException { + PostTool postTool = new PostTool(); + postTool.pageFetcher = new MockPageFetcher(); + postTool.dryRun = true; + postTool.solrUpdateUrl = new URL("http://user:password@localhost:5150/solr/fake/update"); + + // Uses mock pageFetcher + postTool.delay = 0; + postTool.recursive = 5; + int num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); + assertEquals(5, num); + + postTool.recursive = 1; + num = postTool.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null); + assertEquals(3, num); + + // Without respecting robots.txt + postTool.pageFetcher.robotsCache.put("[ff01::114]", Collections.emptyList()); + postTool.recursive = 5; + num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); + assertEquals(6, num); + } + + @Test + public void testRobotsExclusion() throws IOException, URISyntaxException { + PostTool postTool = new PostTool(); + postTool.pageFetcher = new MockPageFetcher(); + postTool.dryRun = true; + + assertFalse(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/"))); + assertTrue(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed"))); + assertEquals( + "There should be two entries parsed from robots.txt", + 2, + postTool.pageFetcher.robotsCache.get("[ff01::114]").size()); + } + + static class MockPageFetcher extends PostTool.PageFetcher { + HashMap<String, String> htmlMap = new HashMap<>(); + HashMap<String, Set<URI>> linkMap = new HashMap<>(); + + public MockPageFetcher() throws IOException, URISyntaxException { + (new PostTool()).super(); + htmlMap.put( + "http://[ff01::114]", + "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); + htmlMap.put( + "http://[ff01::114]/index.html", + "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1", + "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1/foo", + "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1/foo/bar", + "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page2", + "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>"); + htmlMap.put( + "http://[ff01::114]/disallowed", + "<html><body><a href=\"http://[ff01::114]/\"></body></html>"); + + Set<URI> s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1")); + s.add(new URI("http://[ff01::114]/page2")); + linkMap.put("http://[ff01::114]", s); + linkMap.put("http://[ff01::114]/index.html", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1/foo")); + linkMap.put("http://[ff01::114]/page1", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1/foo/bar")); + linkMap.put("http://[ff01::114]/page1/foo", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/disallowed")); + linkMap.put("http://[ff01::114]/page2", s); + + // Simulate a robots.txt file with comments and a few disallows + StringBuilder sb = new StringBuilder(); + sb.append( + "# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n"); + sb.append("User-agent: * # match all bots\n"); + sb.append("Disallow: # This is void\n"); + 
sb.append("Disallow: /disallow # Disallow this path\n"); + sb.append("Disallow: /nonexistentpath # Disallow this path\n"); + this.robotsCache.put( + "[ff01::114]", + super.parseRobotsTxt( + new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)))); + } + + @Override + public PostTool.PageFetcherResult readPageFromUrl(URL u) { + PostTool.PageFetcherResult res = new PostTool.PageFetcherResult(); + if (isDisallowedByRobots(u)) { + res.httpStatus = 403; + return res; + } + res.httpStatus = 200; + res.contentType = "text/html"; + res.content = ByteBuffer.wrap(htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8)); + return res; + } + + @Override + public Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) { + Set<URI> s = linkMap.get(PostTool.normalizeUrlEnding(url.toString())); + if (s == null) { + s = new HashSet<>(); + } + return s; + } + } } diff --git a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java index b6c63148a4e..a40903c1f0a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java @@ -20,22 +20,15 @@ import java.io.File; import java.lang.invoke.MethodHandles; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Random; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.cli.CommandLine; import org.apache.solr.cli.CreateCollectionTool; import org.apache.solr.cli.DeleteTool; import org.apache.solr.cli.HealthcheckTool; +import org.apache.solr.cli.PostTool; import org.apache.solr.cli.SolrCLI; import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.request.StreamingUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.util.ExternalPaths; @@ -115,52 +108,36 @@ public class SolrCloudExampleTest extends AbstractFullDistribZkTestBase { invalidToolExitStatus, tool.runTool(cli)); - // now index docs like bin/solr post would, but we can't use SimplePostTool because it uses - // System.exit when it encounters an error, which JUnit doesn't like ... + // now index docs ... 
log.info("Created collection, now posting example docs!"); Path exampleDocsDir = Path.of(ExternalPaths.SOURCE_HOME, "example", "exampledocs"); assertTrue(exampleDocsDir.toAbsolutePath() + " not found!", Files.isDirectory(exampleDocsDir)); - List<Path> xmlFiles; - try (Stream<Path> stream = Files.walk(exampleDocsDir, 1)) { - xmlFiles = - stream - .filter(path -> path.getFileName().toString().endsWith(".xml")) - // don't rely on File.compareTo, it's behavior varies by OS - .sorted(Comparator.comparing(path -> path.getFileName().toString())) - // be explicit about the collection type because we will shuffle it later - .collect(Collectors.toCollection(ArrayList::new)); - } + String[] argsForPost = + new String[] { + "--solr-update-url", + solrUrl + "/" + testCollectionName + "/update", + "-filetypes", + "xml", + exampleDocsDir.toAbsolutePath().toString() + }; - // force a deterministic random ordering of the files so seeds reproduce regardless of - // platform/filesystem - Collections.shuffle(xmlFiles, new Random(random().nextLong())); + PostTool postTool = new PostTool(); + CommandLine postCli = + SolrCLI.processCommandLineArgs(postTool.getName(), postTool.getOptions(), argsForPost); + postTool.runTool(postCli); - // if you add/remove example XML docs, you'll have to fix these expected values - int expectedXmlFileCount = 14; int expectedXmlDocCount = 32; - assertEquals( - "Unexpected # of example XML files in " + exampleDocsDir.toAbsolutePath(), - expectedXmlFileCount, - xmlFiles.size()); - - for (Path xml : xmlFiles) { - if (log.isInfoEnabled()) { - log.info("POSTing {}", xml.toAbsolutePath()); - } - cloudClient.request( - new StreamingUpdateRequest("/update", xml, "application/xml"), testCollectionName); - } - cloudClient.commit(testCollectionName); - int numFound = 0; // give the update a chance to take effect. for (int idx = 0; idx < 100; ++idx) { QueryResponse qr = cloudClient.query(testCollectionName, new SolrQuery("*:*")); numFound = (int) qr.getResults().getNumFound(); - if (numFound == expectedXmlDocCount) break; + if (numFound == expectedXmlDocCount) { + break; + } Thread.sleep(100); } assertEquals("*:* found unexpected number of documents", expectedXmlDocCount, numFound); diff --git a/solr/packaging/test/test_post.bats b/solr/packaging/test/test_post.bats index 34f39cfad87..1dcb561afa8 100644 --- a/solr/packaging/test/test_post.bats +++ b/solr/packaging/test/test_post.bats @@ -78,7 +78,7 @@ teardown() { solr create_collection -c monitors_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update -commit ${SOLR_TIP}/example/exampledocs/monitor.xml + run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update ${SOLR_TIP}/example/exampledocs/monitor.xml assert_output --partial '1 files indexed.' refute_output --partial 'ERROR' @@ -87,7 +87,7 @@ teardown() { solr create_collection -c books_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.json + run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update ${SOLR_TIP}/example/exampledocs/books.json assert_output --partial '1 files indexed.' 
refute_output --partial 'ERROR' @@ -96,7 +96,7 @@ teardown() { solr create_collection -c books_csv_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.csv + run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update ${SOLR_TIP}/example/exampledocs/books.csv assert_output --partial '1 files indexed.' refute_output --partial 'ERROR' @@ -104,12 +104,22 @@ teardown() { assert_output --partial '"numFound":10' } +@test "crawling a directory as a dry-run" { + + # We filter to xml,json,and csv as we don't want to invoke the Extract handler, and are running it as a dry run + run solr post --dry-run -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/foobar/update -skipcommit ${SOLR_TIP}/example/exampledocs + + assert_output --partial 'Dry run complete. 16 would have been indexed.' + refute_output --partial '16 files indexed.' + refute_output --partial 'ERROR' +} + @test "crawling a directory" { solr create_collection -c mixed_content -d _default # We filter to xml,json,and csv as we don't want to invoke the Extract handler. - run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update -commit ${SOLR_TIP}/example/exampledocs + run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update ${SOLR_TIP}/example/exampledocs assert_output --partial '16 files indexed.' refute_output --partial 'ERROR' @@ -129,7 +139,7 @@ teardown() { } }' "http://localhost:${SOLR_PORT}/solr/webcrawl/config" - run solr post -mode web -url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org + run solr post -mode web --solr-update-url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org assert_output --partial 'Entering crawl at level 0' } @@ -152,7 +162,7 @@ teardown() { run solr create_collection -c test_args -d _default assert_output --partial "Created collection 'test_args'" - run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out -commit "<delete><query>*:*</query></delete>" + run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out "<delete><query>*:*</query></delete>" assert_output --partial '<int name="status">0</int>' # confirm default type
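
As a minimal sketch of how the options introduced or renamed by this commit compose on the command line (the port, collection name, and document path below are illustrative assumptions, not values taken from the commit):

# Dry run (files mode only): report what would be posted, with no network
# traffic to Solr, using the new --dry-run flag and the new long form of -url.
bin/solr post --dry-run -filetypes xml,json,csv \
  --solr-update-url http://localhost:8983/solr/techproducts/update \
  /path/to/exampledocs

# The short form -url still works; args mode posts a command directly.
# Note there is no -commit: since SOLR-17147 a commit happens by default,
# and -skipcommit opts out.
bin/solr post -url http://localhost:8983/solr/techproducts/update \
  -mode args -type application/xml "<delete><query>*:*</query></delete>"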
