This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
commit 8a427ebc606d7967bcaaef30a9449bc0bf61b25b
Author: Eric Pugh <[email protected]>
AuthorDate: Mon Feb 19 11:46:47 2024 -0500

SOLR-17159: Untangle PostTool and SimplePostTool code (#2275)

* Copied unit tests from SimplePostToolTest to PostToolTest
* Add a --dry-run mode that simulates sending documents to Solr
* Removed -commit from a few more places where it is no longer needed now that SOLR-17147 is complete
* Clean up the code base to have fewer code-quality warnings.
* Update SolrCloudExampleTest to use the PostTool instead of simulating its usage.
* Make the long form of -url an explicit --solr-update-url, to be clear about what it's for.
---
solr/CHANGES.txt | 3 +
.../src/java/org/apache/solr/cli/PostTool.java | 1166 +++++++++++++++++++-
.../java/org/apache/solr/cli/RunExampleTool.java | 30 +-
.../src/test/org/apache/solr/cli/PostToolTest.java | 220 +++-
.../apache/solr/cloud/SolrCloudExampleTest.java | 57 +-
solr/packaging/test/test_post.bats | 22 +-
6 files changed, 1408 insertions(+), 90 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index ab43eb599a7..54c96bc87bb 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -26,6 +26,9 @@ Improvements * SOLR-17145: The INSTALLSHARDDATA API now includes a 'requestid' field when run asynchronously (Jason Gerlowski) +* SOLR-17159: bin/solr post now has proper unit testing. Users can specify a --dry-run option to + simulate posting documents without sending them to Solr. (Eric Pugh) + Optimizations --------------------- * SOLR-17144: Close searcherExecutor thread per core after 1 minute (Pierre Salagnac, Christine Poerschke) diff --git a/solr/core/src/java/org/apache/solr/cli/PostTool.java b/solr/core/src/java/org/apache/solr/cli/PostTool.java index de716c131cb..0e3bc6b77c1 100644 --- a/solr/core/src/java/org/apache/solr/cli/PostTool.java +++ b/solr/core/src/java/org/apache/solr/cli/PostTool.java @@ -16,15 +16,144 @@ */ package org.apache.solr.cli; +import static java.nio.charset.StandardCharsets.US_ASCII; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.FileFilter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; +import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.ProtocolException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; +import java.net.URLEncoder; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.security.GeneralSecurityException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Base64; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; +import java.util.zip.GZIPInputStream; +import java.util.zip.Inflater; +import java.util.zip.InflaterInputStream; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpression; +import 
javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Option; +import org.apache.solr.client.api.util.SolrVersion; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.common.util.Utils; +import org.apache.solr.util.RTimer; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; public class PostTool extends ToolBase { + public static final String DEFAULT_FILE_TYPES = + "xml,json,jsonl,csv,pdf,doc,docx,ppt,pptx,xls,xlsx,odt,odp,ods,ott,otp,ots,rtf,htm,html,txt,log"; + static final String DATA_MODE_FILES = "files"; + static final String DATA_MODE_ARGS = "args"; + static final String DATA_MODE_STDIN = "stdin"; + static final String DEFAULT_DATA_MODE = DATA_MODE_FILES; + static final String FORMAT_SOLR = "solr"; + static final String DATA_MODE_WEB = "web"; + + private static final int DEFAULT_WEB_DELAY = 10; + private static final int MAX_WEB_DEPTH = 10; + public static final String DEFAULT_CONTENT_TYPE = "application/json"; + + // Input args + int recursive = 0; + int delay = 0; + String fileTypes = PostTool.DEFAULT_FILE_TYPES; + URL solrUpdateUrl; + String credentials; + OutputStream out = null; + String type; + String format; + String mode = DEFAULT_DATA_MODE; + boolean commit; + boolean optimize; + boolean dryRun; // Avoids actual network traffic to Solr + + String[] args; + + boolean auto = true; + private int currentDepth; + + static HashMap<String, String> mimeMap; + FileFilter fileFilter; + // Backlog for crawling + List<LinkedHashSet<URI>> backlog = new ArrayList<>(); + Set<URI> visited = new HashSet<>(); + + static final Set<String> DATA_MODES = new HashSet<>(); + + PostTool.PageFetcher pageFetcher = new PostTool.PageFetcher(); + + static { + DATA_MODES.add(DATA_MODE_FILES); + DATA_MODES.add(DATA_MODE_ARGS); + DATA_MODES.add(DATA_MODE_STDIN); + DATA_MODES.add(DATA_MODE_WEB); + + mimeMap = new HashMap<>(); + mimeMap.put("xml", "application/xml"); + mimeMap.put("csv", "text/csv"); + mimeMap.put("json", "application/json"); + mimeMap.put("jsonl", "application/jsonl"); + mimeMap.put("pdf", "application/pdf"); + mimeMap.put("rtf", "text/rtf"); + mimeMap.put("html", "text/html"); + mimeMap.put("htm", "text/html"); + mimeMap.put("doc", "application/msword"); + mimeMap.put("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + mimeMap.put("ppt", "application/vnd.ms-powerpoint"); + mimeMap.put( + "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"); + mimeMap.put("xls", "application/vnd.ms-excel"); + mimeMap.put("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + mimeMap.put("odt", "application/vnd.oasis.opendocument.text"); + mimeMap.put("ott", "application/vnd.oasis.opendocument.text"); + mimeMap.put("odp", "application/vnd.oasis.opendocument.presentation"); + mimeMap.put("otp", "application/vnd.oasis.opendocument.presentation"); + mimeMap.put("ods", "application/vnd.oasis.opendocument.spreadsheet"); + mimeMap.put("ots", "application/vnd.oasis.opendocument.spreadsheet"); + mimeMap.put("txt", "text/plain"); + mimeMap.put("log", "text/plain"); + } + public PostTool() { this(CLIO.getOutStream()); } @@ -43,9 +172,10 @@ public class PostTool extends ToolBase { return List.of( Option.builder("url") .argName("url") + .longOpt("solr-update-url") .hasArg() .required(false) 
- .desc("<base Solr update URL>") + .desc("Solr Update URL, the full url to the update handler, including the /update.") .build(), Option.builder("c") .longOpt("name") @@ -66,7 +196,8 @@ public class PostTool extends ToolBase { .argName("mode") .hasArg(true) .required(false) - .desc("Files crawls files, web crawls website. default: files.") + .desc( + "Files crawls files, web crawls website, args processes input args, and stdin reads a command from standard in. default: files.") .build(), Option.builder("recursive") .argName("recursive") @@ -85,13 +216,13 @@ public class PostTool extends ToolBase { .argName("content-type") .hasArg(true) .required(false) - .desc("default: application/json") + .desc("Specify a specific mimetype to use, such as application/json.") .build(), Option.builder("filetypes") .argName("<type>[,<type>,...]") .hasArg(true) .required(false) - .desc("default: " + SimplePostTool.DEFAULT_FILE_TYPES) + .desc("default: " + DEFAULT_FILE_TYPES) .build(), Option.builder("params") .argName("<key>=<value>[&<key>=<value>...]") @@ -107,6 +238,12 @@ public class PostTool extends ToolBase { .required(false) .desc( "sends application/json content as Solr commands to /update instead of /update/json/docs.") + .build(), + Option.builder() + .longOpt("dry-run") + .required(false) + .desc( + "Performs a dry run of the posting process without actually sending documents to Solr. Only works with files mode.") .build()); } @@ -114,52 +251,1027 @@ public class PostTool extends ToolBase { public void runImpl(CommandLine cli) throws Exception { SolrCLI.raiseLogLevelUnlessVerbose(cli); - URL solrUrl = null; + solrUpdateUrl = null; if (cli.hasOption("url")) { String url = cli.getOptionValue("url"); - solrUrl = new URL(url); + solrUpdateUrl = new URL(url); } else if (cli.hasOption("c")) { String url = SolrCLI.getDefaultSolrUrl() + "/solr/" + cli.getOptionValue("c") + "/update"; - solrUrl = new URL(url); + solrUpdateUrl = new URL(url); } else { throw new IllegalArgumentException( "Must specify either -url or -c parameter to post documents."); } - String mode = SimplePostTool.DEFAULT_DATA_MODE; if (cli.hasOption("mode")) { mode = cli.getOptionValue("mode"); } - boolean auto = true; - String type = null; + + if (cli.hasOption("dry-run")) { + dryRun = true; + } + if (cli.hasOption("type")) { type = cli.getOptionValue("type"); + // Turn off automatically looking up the mimetype in favour of what is passed in. + auto = false; } - String format = - cli.hasOption("format") - ? SimplePostTool.FORMAT_SOLR - : ""; // i.e not solr formatted json commands + format = cli.hasOption("format") ? FORMAT_SOLR : ""; // i.e not solr formatted json commands - String fileTypes = SimplePostTool.DEFAULT_FILE_TYPES; if (cli.hasOption("filetypes")) { fileTypes = cli.getOptionValue("filetypes"); } - int defaultDelay = (mode.equals((SimplePostTool.DATA_MODE_WEB)) ? 10 : 0); - int delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay))); - int recursive = Integer.parseInt(cli.getOptionValue("recursive", "1")); + int defaultDelay = (mode.equals((DATA_MODE_WEB)) ? 10 : 0); + delay = Integer.parseInt(cli.getOptionValue("delay", String.valueOf(defaultDelay))); + recursive = Integer.parseInt(cli.getOptionValue("recursive", "1")); - OutputStream out = cli.hasOption("out") ? CLIO.getOutStream() : null; - boolean commit = cli.hasOption("skipcommit") ? false : true; - boolean optimize = cli.hasOption("optimize"); + out = cli.hasOption("out") ? CLIO.getOutStream() : null; + commit = cli.hasOption("skipcommit") ? 
false : true; + optimize = cli.hasOption("optimize"); - String[] args = cli.getArgs(); + args = cli.getArgs(); - SimplePostTool spt = - new SimplePostTool( - mode, solrUrl, auto, type, format, recursive, delay, fileTypes, out, commit, optimize, - args); + execute(); + } + + /** + * After initialization, call execute to start the post job. This method delegates to the correct + * mode method. + */ + public void execute() throws SolrServerException, IOException { + final RTimer timer = new RTimer(); + if (PostTool.DATA_MODE_FILES.equals(mode)) { + doFilesMode(); + } else if (DATA_MODE_ARGS.equals(mode)) { + doArgsMode(args); + } else if (PostTool.DATA_MODE_WEB.equals(mode)) { + doWebMode(); + } else if (DATA_MODE_STDIN.equals(mode)) { + doStdinMode(); + } else { + return; + } + + if (commit) { + commit(); + } + if (optimize) { + optimize(); + } + displayTiming((long) timer.getTime()); + } + + private void doFilesMode() { + currentDepth = 0; + + info( + "Posting files to [base] url " + + solrUpdateUrl + + (!auto ? " using content-type " + (type == null ? DEFAULT_CONTENT_TYPE : type) : "") + + "..."); + if (auto) { + info("Entering auto mode. File endings considered are " + fileTypes); + } + if (recursive > 0) { + info("Entering recursive mode, max depth=" + recursive + ", delay=" + delay + "s"); + } + fileFilter = getFileFilterFromFileTypes(fileTypes); + int numFilesPosted = postFiles(args, 0, out, type); + if (dryRun) { + info("Dry run complete. " + numFilesPosted + " would have been indexed."); + } else { + info(numFilesPosted + " files indexed."); + } + } + + private void doArgsMode(String[] args) { + info("POSTing args to " + solrUpdateUrl + "..."); + for (String a : args) { + postData(stringToStream(a), null, out, type, solrUpdateUrl); + } + } + + private void doWebMode() { + reset(); + int numPagesPosted = 0; + try { + if (type != null) { + throw new IllegalArgumentException( + "Specifying content-type with \"-Ddata=web\" is not supported"); + } + + // Set Extracting handler as default + solrUpdateUrl = appendUrlPath(solrUpdateUrl, "/extract"); + + info("Posting web pages to Solr url " + solrUpdateUrl); + auto = true; + info( + "Entering auto mode. 
Indexing pages with content-types corresponding to file endings " + + fileTypes); + if (recursive > 0) { + if (recursive > MAX_WEB_DEPTH) { + recursive = MAX_WEB_DEPTH; + warn("Too large recursion depth for web mode, limiting to " + MAX_WEB_DEPTH + "..."); + } + if (delay < DEFAULT_WEB_DELAY) { + warn( + "Never crawl an external web site faster than every 10 seconds, your IP will probably be blocked"); + } + info("Entering recursive mode, depth=" + recursive + ", delay=" + delay + "s"); + } + numPagesPosted = postWebPages(args, 0, out); + info(numPagesPosted + " web pages indexed."); + + } catch (MalformedURLException e) { + warn("Wrong URL trying to append /extract to " + solrUpdateUrl); + } + } + + private void doStdinMode() { + info("POSTing stdin to " + solrUpdateUrl + "..."); + postData(System.in, null, out, type, solrUpdateUrl); + } + + private void reset() { + backlog = new ArrayList<>(); + visited = new HashSet<>(); + } + + /** + * Pretty prints the number of milliseconds taken to post the content to Solr + * + * @param millis the time in milliseconds + */ + private void displayTiming(long millis) { + SimpleDateFormat df = new SimpleDateFormat("H:mm:ss.SSS", Locale.getDefault()); + df.setTimeZone(TimeZone.getTimeZone("UTC")); + CLIO.out("Time spent: " + df.format(new Date(millis))); + } + + private boolean checkIsValidPath(File srcFile) { + return Files.exists(srcFile.toPath()); + } + + /** + * Post all filenames provided in args + * + * @param args array of file names + * @param startIndexInArgs offset to start + * @param out output stream to post data to + * @param type default content-type to use when posting (may be overridden in auto mode) + * @return number of files posted + */ + public int postFiles(String[] args, int startIndexInArgs, OutputStream out, String type) { + reset(); + int filesPosted = 0; + for (int j = startIndexInArgs; j < args.length; j++) { + File srcFile = new File(args[j]); + filesPosted = getFilesPosted(out, type, srcFile); + } + return filesPosted; + } + + private int getFilesPosted(final OutputStream out, final String type, final File srcFile) { + int filesPosted = 0; + boolean isValidPath = checkIsValidPath(srcFile); + if (isValidPath && srcFile.isDirectory() && srcFile.canRead()) { + filesPosted += postDirectory(srcFile, out, type); + } else if (isValidPath && srcFile.isFile() && srcFile.canRead()) { + filesPosted += postFiles(new File[] {srcFile}, out, type); + } else { + filesPosted += handleGlob(srcFile, out, type); + } + return filesPosted; + } + + /** + * Posts a whole directory + * + * @return number of files posted total + */ + private int postDirectory(File dir, OutputStream out, String type) { + if (dir.isHidden() && !dir.getName().equals(".")) { + return (0); + } + info( + "Indexing directory " + + dir.getPath() + + " (" + + dir.listFiles(fileFilter).length + + " files, depth=" + + currentDepth + + ")"); + int posted = 0; + posted += postFiles(dir.listFiles(fileFilter), out, type); + if (recursive > currentDepth) { + for (File d : dir.listFiles()) { + if (d.isDirectory()) { + currentDepth++; + posted += postDirectory(d, out, type); + currentDepth--; + } + } + } + return posted; + } + + /** + * Posts a list of file names + * + * @return number of files posted + */ + int postFiles(File[] files, OutputStream out, String type) { + int filesPosted = 0; + for (File srcFile : files) { + try { + if (!srcFile.isFile() || srcFile.isHidden()) { + continue; + } + postFile(srcFile, out, type); + Thread.sleep(delay * 1000L); + filesPosted++; + } catch 
(InterruptedException | MalformedURLException e) { + throw new RuntimeException(e); + } + } + return filesPosted; + } + + /** + * This only handles file globs not full path globbing. + * + * @param globFile file holding glob path + * @param out outputStream to write results to + * @param type default content-type to use when posting (may be overridden in auto mode) + * @return number of files posted + */ + int handleGlob(File globFile, OutputStream out, String type) { + int filesPosted = 0; + File parent = globFile.getParentFile(); + if (parent == null) { + parent = new File("."); + } + String fileGlob = globFile.getName(); + PostTool.GlobFileFilter ff = new PostTool.GlobFileFilter(fileGlob, false); + File[] fileList = parent.listFiles(ff); + if (fileList == null || fileList.length == 0) { + warn("No files or directories matching " + globFile); + } else { + filesPosted = postFiles(fileList, out, type); + } + return filesPosted; + } + + /** + * This method takes as input a list of start URL strings for crawling, converts the URL strings + * to URI strings and adds each one to a backlog and then starts crawling + * + * @param args the raw input args from main() + * @param startIndexInArgs offset for where to start + * @param out outputStream to write results to + * @return the number of web pages posted + */ + public int postWebPages(String[] args, int startIndexInArgs, OutputStream out) { + reset(); + LinkedHashSet<URI> s = new LinkedHashSet<>(); + for (int j = startIndexInArgs; j < args.length; j++) { + try { + URI uri = new URI(normalizeUrlEnding(args[j])); + s.add(uri); + } catch (URISyntaxException e) { + warn("Skipping malformed input URL: " + args[j]); + } + } + // Add URIs to level 0 of the backlog and start recursive crawling + backlog.add(s); + return webCrawl(0, out); + } + + /** + * Normalizes a URL string by removing anchor part and trailing slash + * + * @return the normalized URL string + */ + protected static String normalizeUrlEnding(String link) { + if (link.contains("#")) { + link = link.substring(0, link.indexOf('#')); + } + if (link.endsWith("?")) { + link = link.substring(0, link.length() - 1); + } + if (link.endsWith("/")) { + link = link.substring(0, link.length() - 1); + } + return link; + } + + /** + * A very simple crawler, pulling URLs to fetch from a backlog and then recurses N levels deep if + * recursive>0. Links are parsed from HTML through first getting an XHTML version using + * SolrCell with extractOnly, and followed if they are local. The crawler pauses for a default + * delay of 10 seconds between each fetch, this can be configured in the delay variable. This is + * only meant for test purposes, as it does not respect robots or anything else fancy :) + * + * @param level which level to crawl + * @param out output stream to write to + * @return number of pages crawled on this level and below + */ + protected int webCrawl(int level, OutputStream out) { + int numPages = 0; + LinkedHashSet<URI> stack = backlog.get(level); + int rawStackSize = stack.size(); + stack.removeAll(visited); + int stackSize = stack.size(); + LinkedHashSet<URI> subStack = new LinkedHashSet<>(); + info( + "Entering crawl at level " + + level + + " (" + + rawStackSize + + " links total, " + + stackSize + + " new)"); + for (URI uri : stack) { + try { + visited.add(uri); + URL url = uri.toURL(); + PostTool.PageFetcherResult result = pageFetcher.readPageFromUrl(url); + if (result.httpStatus == 200) { + url = (result.redirectUrl != null) ? 
result.redirectUrl : url; + URL postUrl = + new URL( + appendParam( + solrUpdateUrl.toString(), + "literal.id=" + + URLEncoder.encode(url.toString(), UTF_8) + + "&literal.url=" + + URLEncoder.encode(url.toString(), UTF_8))); + ByteBuffer content = result.content; + boolean success = + postData( + new ByteArrayInputStream(content.array(), content.arrayOffset(), content.limit()), + null, + out, + result.contentType, + postUrl); + if (success) { + info("POSTed web resource " + url + " (depth: " + level + ")"); + Thread.sleep(delay * 1000L); + numPages++; + // Pull links from HTML pages only + if (recursive > level && result.contentType.equals("text/html")) { + Set<URI> children = + pageFetcher.getLinksFromWebPage( + url, + new ByteArrayInputStream( + content.array(), content.arrayOffset(), content.limit()), + result.contentType, + postUrl); + subStack.addAll(children); + } + } else { + warn("An error occurred while posting " + uri); + } + } else { + warn("The URL " + uri + " returned a HTTP result status of " + result.httpStatus); + } + } catch (IOException | URISyntaxException e) { + warn("Caught exception when trying to open connection to " + uri + ": " + e.getMessage()); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + if (!subStack.isEmpty()) { + backlog.add(subStack); + numPages += webCrawl(level + 1, out); + } + return numPages; + } + + /** + * Computes the full URL based on a base url and a possibly relative link found in the href param + * of an HTML anchor. + * + * @param baseUrl the base url from where the link was found + * @param link the absolute or relative link + * @return the string version of the full URL + */ + protected String computeFullUrl(URL baseUrl, String link) { + if (link == null || link.length() == 0) { + return null; + } + if (!link.startsWith("http")) { + if (link.startsWith("/")) { + link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + link; + } else { + if (link.contains(":")) { + return null; // Skip non-relative URLs + } + String path = baseUrl.getPath(); + if (!path.endsWith("/")) { + int sep = path.lastIndexOf('/'); + String file = path.substring(sep + 1); + if (file.contains(".") || file.contains("?")) { + path = path.substring(0, sep); + } + } + link = baseUrl.getProtocol() + "://" + baseUrl.getAuthority() + path + "/" + link; + } + } + link = normalizeUrlEnding(link); + String l = link.toLowerCase(Locale.ROOT); + // Simple brute force skip images + if (l.endsWith(".jpg") || l.endsWith(".jpeg") || l.endsWith(".png") || l.endsWith(".gif")) { + return null; // Skip images + } + return link; + } + + /** + * Uses the mime-type map to reverse lookup whether the file ending for our type is supported by + * the fileTypes option + * + * @param type what content-type to lookup + * @return true if this is a supported content type + */ + protected boolean typeSupported(String type) { + for (Map.Entry<String, String> entry : mimeMap.entrySet()) { + if (entry.getValue().equals(type)) { + if (fileTypes.contains(entry.getKey())) { + return true; + } + } + } + return false; + } + + static void warn(String msg) { + CLIO.err("PostTool: WARNING: " + msg); + } + + static void info(String msg) { + CLIO.out(msg); + } + + /** Does a simple commit operation */ + public void commit() throws IOException, SolrServerException { + info("COMMITting Solr index changes to " + solrUpdateUrl + "..."); + String url = solrUpdateUrl.toString(); + url = url.substring(0, url.lastIndexOf("/update")); + try (final SolrClient client = 
SolrCLI.getSolrClient(url, credentials)) { + client.commit(); + } + } + + /** Does a simple optimize operation */ + public void optimize() throws IOException, SolrServerException { + info("Performing an OPTIMIZE to " + solrUpdateUrl + "..."); + String url = solrUpdateUrl.toString(); + url = url.substring(0, url.lastIndexOf("/update")); + try (final SolrClient client = SolrCLI.getSolrClient(url, credentials)) { + client.optimize(); + } + } + + /** + * Appends a URL query parameter to a URL + * + * @param url the original URL + * @param param the parameter(s) to append, separated by "&" + * @return the string version of the resulting URL + */ + public static String appendParam(String url, String param) { + String[] pa = param.split("&"); + for (String p : pa) { + if (p.trim().length() == 0) { + continue; + } + String[] kv = p.split("="); + if (kv.length == 2) { + url = url + (url.contains("?") ? "&" : "?") + kv[0] + "=" + kv[1]; + } else { + warn("Skipping param " + p + " which is not on form key=value"); + } + } + return url; + } + + /** Opens the file and posts its contents to the solrUrl, writes to response to output. */ + public void postFile(File file, OutputStream output, String type) throws MalformedURLException { + InputStream is = null; + + URL url = solrUpdateUrl; + String suffix = ""; + if (auto) { + if (type == null) { + type = guessType(file); + } + // TODO: Add a flag that disables /update and sends all to /update/extract, to avoid CSV, + // JSON, and XML files + // TODO: from being interpreted as Solr documents internally + if (type.equals("application/json") && !PostTool.FORMAT_SOLR.equals(format)) { + suffix = "/json/docs"; + String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString(); + url = new URL(urlStr); + } else if (type.equals("application/xml") + || type.equals("text/csv") + || type.equals("application/json")) { + // Default handler + } else { + // SolrCell + suffix = "/extract"; + String urlStr = appendUrlPath(solrUpdateUrl, suffix).toString(); + if (!urlStr.contains("resource.name")) { + urlStr = + appendParam( + urlStr, "resource.name=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8)); + } + if (!urlStr.contains("literal.id")) { + urlStr = + appendParam(urlStr, "literal.id=" + URLEncoder.encode(file.getAbsolutePath(), UTF_8)); + } + url = new URL(urlStr); + } + } else { + if (type == null) { + type = DEFAULT_CONTENT_TYPE; + } + } + if (dryRun) { + info( + "DRY RUN of POSTing file " + + file.getName() + + (auto ? " (" + type + ")" : "") + + " to [base]" + + suffix); + } else { + try { + info( + "POSTing file " + + file.getName() + + (auto ? " (" + type + ")" : "") + + " to [base]" + + suffix); + is = new FileInputStream(file); + postData(is, file.length(), output, type, url); + } catch (IOException e) { + warn("Can't open/read file: " + file); + } finally { + try { + if (is != null) { + is.close(); + } + } catch (IOException e) { + warn("IOException while closing file: " + e); + } + } + } + } + + /** + * Appends to the path of the URL + * + * @param url the URL + * @param append the path to append + * @return the final URL version + */ + protected static URL appendUrlPath(URL url, String append) throws MalformedURLException { + return new URL( + url.getProtocol() + + "://" + + url.getAuthority() + + url.getPath() + + append + + (url.getQuery() != null ? "?" + url.getQuery() : "")); + } + + /** + * Guesses the type of file, based on file name suffix Returns "application/octet-stream" if no + * corresponding mimeMap type. 
+ * + * @param file the file + * @return the content-type guessed + */ + protected static String guessType(File file) { + String name = file.getName(); + String suffix = name.substring(name.lastIndexOf('.') + 1); + String type = mimeMap.get(suffix.toLowerCase(Locale.ROOT)); + return (type != null) ? type : "application/octet-stream"; + } + + /** + * Reads data from the data stream and posts it to solr, writes to the response to output + * + * @return true if success + */ + public boolean postData( + InputStream data, Long length, OutputStream output, String type, URL url) { + if (dryRun) { + return true; + } + + boolean success = true; + if (type == null) { + type = DEFAULT_CONTENT_TYPE; + } + HttpURLConnection urlConnection = null; + try { + try { + urlConnection = (HttpURLConnection) url.openConnection(); + try { + urlConnection.setRequestMethod("POST"); + } catch (ProtocolException e) { + warn("Shouldn't happen: HttpURLConnection doesn't support POST??" + e); + } + urlConnection.setDoOutput(true); + urlConnection.setDoInput(true); + urlConnection.setUseCaches(false); + urlConnection.setAllowUserInteraction(false); + urlConnection.setRequestProperty("Content-type", type); + basicAuth(urlConnection); + if (null != length) { + urlConnection.setFixedLengthStreamingMode(length); + } else { + urlConnection.setChunkedStreamingMode(-1); // use JDK default chunkLen, 4k in Java 8. + } + urlConnection.connect(); + } catch (IOException e) { + warn("Connection error (is Solr running at " + solrUpdateUrl + " ?): " + e); + success = false; + } catch (Exception e) { + warn("POST failed with error " + e.getMessage()); + } + + try (final OutputStream out = urlConnection.getOutputStream()) { + pipe(data, out); + } catch (IOException e) { + warn("IOException while posting data: " + e); + } + + try { + success &= checkResponseCode(urlConnection); + try (final InputStream in = urlConnection.getInputStream()) { + pipe(in, output); + } + } catch (IOException e) { + warn("IOException while reading response: " + e); + success = false; + } catch (GeneralSecurityException e) { + warn( + "Looks like Solr is secured and would not let us in. Try with another user in '-u' parameter"); + } + } finally { + if (urlConnection != null) { + urlConnection.disconnect(); + } + } + return success; + } + + private void basicAuth(HttpURLConnection urlc) throws Exception { + if (urlc.getURL().getUserInfo() != null) { + String encoding = + Base64.getEncoder().encodeToString(urlc.getURL().getUserInfo().getBytes(US_ASCII)); + urlc.setRequestProperty("Authorization", "Basic " + encoding); + } else if (credentials != null) { + if (!credentials.contains(":")) { + throw new Exception("credentials '" + credentials + "' must be of format user:pass"); + } + urlc.setRequestProperty( + "Authorization", + "Basic " + Base64.getEncoder().encodeToString(credentials.getBytes(UTF_8))); + } + } + + private static boolean checkResponseCode(HttpURLConnection urlc) + throws IOException, GeneralSecurityException { + if (urlc.getResponseCode() >= 400) { + warn( + "Solr returned an error #" + + urlc.getResponseCode() + + " (" + + urlc.getResponseMessage() + + ") for url: " + + urlc.getURL()); + Charset charset = StandardCharsets.ISO_8859_1; + final String contentType = urlc.getContentType(); + // code cloned from ContentStreamBase, but post.jar should be standalone! 
+ if (contentType != null) { + int idx = contentType.toLowerCase(Locale.ROOT).indexOf("charset="); + if (idx > 0) { + charset = Charset.forName(contentType.substring(idx + "charset=".length()).trim()); + } + } + // Print the response returned by Solr + try (InputStream errStream = urlc.getErrorStream()) { + if (errStream != null) { + BufferedReader br = new BufferedReader(new InputStreamReader(errStream, charset)); + final StringBuilder response = new StringBuilder("Response: "); + int ch; + while ((ch = br.read()) != -1) { + response.append((char) ch); + } + warn(response.toString().trim()); + } + } + if (urlc.getResponseCode() == 401) { + throw new GeneralSecurityException( + "Solr requires authentication (response 401). Please try again with '-u' option"); + } + if (urlc.getResponseCode() == 403) { + throw new GeneralSecurityException( + "You are not authorized to perform this action against Solr. (response 403)"); + } + return false; + } + return true; + } + + /** + * Converts a string to an input stream + * + * @param s the string + * @return the input stream + */ + public static InputStream stringToStream(String s) { + return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); + } + + /** + * Pipes everything from the source to the dest. If dest is null, then everything is read from + * source and thrown away. + */ + private static void pipe(InputStream source, OutputStream dest) throws IOException { + byte[] buf = new byte[1024]; + int read = 0; + while ((read = source.read(buf)) >= 0) { + if (null != dest) { + dest.write(buf, 0, read); + } + } + if (null != dest) { + dest.flush(); + } + } + + public FileFilter getFileFilterFromFileTypes(String fileTypes) { + String glob; + if (fileTypes.equals("*")) { + glob = ".*"; + } else { + glob = "^.*\\.(" + fileTypes.replace(",", "|") + ")$"; + } + return new PostTool.GlobFileFilter(glob, true); + } + + // + // Utility methods for XPath handing + // + + /** Gets all nodes matching an XPath */ + public static NodeList getNodesFromXP(Node n, String xpath) throws XPathExpressionException { + XPathFactory factory = XPathFactory.newInstance(); + XPath xp = factory.newXPath(); + XPathExpression expr = xp.compile(xpath); + return (NodeList) expr.evaluate(n, XPathConstants.NODESET); + } + + /** + * Gets the string content of the matching an XPath + * + * @param n the node (or doc) + * @param xpath the xpath string + * @param concatAll if true, text from all matching nodes will be concatenated, else only the + * first returned + */ + public static String getXP(Node n, String xpath, boolean concatAll) + throws XPathExpressionException { + NodeList nodes = getNodesFromXP(n, xpath); + StringBuilder sb = new StringBuilder(); + if (nodes.getLength() > 0) { + for (int i = 0; i < nodes.getLength(); i++) { + sb.append(nodes.item(i).getNodeValue()).append(' '); + if (!concatAll) { + break; + } + } + return sb.toString().trim(); + } else return ""; + } + + /** Takes a string as input and returns a DOM */ + public static Document makeDom(byte[] in) + throws SAXException, IOException, ParserConfigurationException { + InputStream is = new ByteArrayInputStream(in); + Document dom = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(is); + return dom; + } + + /** Inner class to filter files based on glob wildcards */ + static class GlobFileFilter implements FileFilter { + private final Pattern p; + + public GlobFileFilter(String pattern, boolean isRegex) { + String _pattern = pattern; + if (!isRegex) { + _pattern = + _pattern + .replace("^", 
"\\^") + .replace("$", "\\$") + .replace(".", "\\.") + .replace("(", "\\(") + .replace(")", "\\)") + .replace("+", "\\+") + .replace("*", ".*") + .replace("?", "."); + _pattern = "^" + _pattern + "$"; + } + + try { + p = Pattern.compile(_pattern, Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { + throw new IllegalArgumentException( + "Invalid type list " + pattern + ". " + e.getDescription()); + } + } + + @Override + public boolean accept(File file) { + return p.matcher(file.getName()).find(); + } + } + + // + // Simple crawler class which can fetch a page and check for robots.txt + // + class PageFetcher { + Map<String, List<String>> robotsCache; + static final String DISALLOW = "Disallow:"; + + public PageFetcher() { + robotsCache = new HashMap<>(); + } + + public PageFetcherResult readPageFromUrl(URL u) throws URISyntaxException { + PostTool.PageFetcherResult res = new PostTool.PageFetcherResult(); + try { + if (isDisallowedByRobots(u)) { + warn("The URL " + u + " is disallowed by robots.txt and will not be crawled."); + res.httpStatus = 403; + URI uri = u.toURI(); + visited.add(uri); + return res; + } + res.httpStatus = 404; + HttpURLConnection conn = (HttpURLConnection) u.openConnection(); + conn.setRequestProperty( + "User-Agent", + "PostTool-crawler/" + SolrVersion.LATEST_STRING + " (https://solr.apache.org/)"); + conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); + conn.connect(); + res.httpStatus = conn.getResponseCode(); + if (!normalizeUrlEnding(conn.getURL().toString()) + .equals(normalizeUrlEnding(u.toString()))) { + info("The URL " + u + " caused a redirect to " + conn.getURL()); + u = conn.getURL(); + res.redirectUrl = u; + URI uri = u.toURI(); + visited.add(uri); + } + if (res.httpStatus == 200) { + // Raw content type of form "text/html; encoding=utf-8" + String rawContentType = conn.getContentType(); + String type = rawContentType.split(";")[0]; + if (typeSupported(type) || "*".equals(fileTypes)) { + String encoding = conn.getContentEncoding(); + InputStream is; + if (encoding != null && encoding.equalsIgnoreCase("gzip")) { + is = new GZIPInputStream(conn.getInputStream()); + } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { + is = new InflaterInputStream(conn.getInputStream(), new Inflater(true)); + } else { + is = conn.getInputStream(); + } + + // Read into memory, so that we later can pull links from the page without re-fetching + res.content = Utils.toByteArray(is); + is.close(); + } else { + warn("Skipping URL with unsupported type " + type); + res.httpStatus = 415; + } + } + } catch (IOException e) { + warn("IOException when reading page from url " + u + ": " + e.getMessage()); + } + return res; + } + + public boolean isDisallowedByRobots(URL url) { + String host = url.getHost(); + String strRobot = url.getProtocol() + "://" + host + "/robots.txt"; + List<String> disallows = robotsCache.get(host); + if (disallows == null) { + disallows = new ArrayList<>(); + URL urlRobot; + try { + urlRobot = new URL(strRobot); + disallows = parseRobotsTxt(urlRobot.openStream()); + } catch (MalformedURLException e) { + return true; // We cannot trust this robots URL, should not happen + } catch (IOException e) { + // There is no robots.txt, will cache an empty disallow list + } + } + + robotsCache.put(host, disallows); + + String strURL = url.getFile(); + for (String path : disallows) { + if (path.equals("/") || strURL.indexOf(path) == 0) return true; + } + return false; + } + + /** + * Very simple robots.txt parser which obeys all 
Disallow lines regardless of user agent or + * whether there are valid Allow: lines. + * + * @param is Input stream of the robots.txt file + * @return a list of disallow paths + * @throws IOException if problems reading the stream + */ + protected List<String> parseRobotsTxt(InputStream is) throws IOException { + List<String> disallows = new ArrayList<>(); + BufferedReader r = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + String l; + while ((l = r.readLine()) != null) { + String[] arr = l.split("#"); + if (arr.length == 0) continue; + l = arr[0].trim(); + if (l.startsWith(DISALLOW)) { + l = l.substring(DISALLOW.length()).trim(); + if (l.length() == 0) continue; + disallows.add(l); + } + } + is.close(); + return disallows; + } + + /** + * Finds links on a web page, using /extract?extractOnly=true + * + * @param url the URL of the web page + * @param is the input stream of the page + * @param type the content-type + * @param postUrl the URL (typically /solr/extract) in order to pull out links + * @return a set of URIs parsed from the page + */ + protected Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) { + Set<URI> linksFromPage = new HashSet<>(); + + try { + ByteArrayOutputStream os = new ByteArrayOutputStream(); + URL extractUrl = new URL(appendParam(postUrl.toString(), "extractOnly=true")); + extractUrl = new URL(appendParam(extractUrl.toString(), "wt=xml")); + boolean success = postData(is, null, os, type, extractUrl); + if (success) { + Document d = makeDom(os.toByteArray()); + String innerXml = getXP(d, "/response/str/text()[1]", false); + d = makeDom(innerXml.getBytes(StandardCharsets.UTF_8)); + NodeList links = getNodesFromXP(d, "/html/body//a/@href"); + for (int i = 0; i < links.getLength(); i++) { + String link = links.item(i).getTextContent(); + link = computeFullUrl(url, link); + if (link == null) { + continue; + } + URI newUri = new URI(link); + if (newUri.getAuthority() == null + || !newUri.getAuthority().equals(url.getAuthority())) { + linksFromPage.add(newUri); + } + } + } + } catch (MalformedURLException e) { + warn("Malformed URL " + url); + } catch (IOException e) { + warn("IOException opening URL " + url + ": " + e.getMessage()); + } catch (Exception e) { + throw new RuntimeException(e); + } + + return linksFromPage; + } + } - spt.execute(); + /** Utility class to hold the result form a page fetch */ + public static class PageFetcherResult { + int httpStatus = 200; + String contentType = "text/html"; + URL redirectUrl = null; + ByteBuffer content; } } diff --git a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java index 908990db22d..680a879372e 100644 --- a/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java +++ b/solr/core/src/java/org/apache/solr/cli/RunExampleTool.java @@ -315,22 +315,20 @@ public class RunExampleTool extends ToolBase { String updateUrl = String.format(Locale.ROOT, "%s/%s/update", solrUrl, collectionName); echo("Indexing tech product example docs from " + exampledocsDir.getAbsolutePath()); - String currentPropVal = System.getProperty("url"); - System.setProperty("url", updateUrl); - String currentTypeVal = System.getProperty("type"); - // We assume that example docs are always in XML. 
- System.setProperty("type", "application/xml"); - SimplePostTool.main(new String[] {exampledocsDir.getAbsolutePath() + "/*.xml"}); - if (currentPropVal != null) { - System.setProperty("url", currentPropVal); // reset - } else { - System.clearProperty("url"); - } - if (currentTypeVal != null) { - System.setProperty("type", currentTypeVal); // reset - } else { - System.clearProperty("type"); - } + String[] args = + new String[] { + "post", + "-url", + updateUrl, + "-type", + "application/xml", + exampledocsDir.getAbsolutePath() + "/*.xml" + }; + PostTool postTool = new PostTool(); + CommandLine postToolCli = + SolrCLI.parseCmdLine(postTool.getName(), args, postTool.getOptions()); + postTool.runTool(postToolCli); + } else { echo( "exampledocs directory not found, skipping indexing step for the techproducts example"); diff --git a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java index 88639c7cefc..e11c11884f1 100644 --- a/solr/core/src/test/org/apache/solr/cli/PostToolTest.java +++ b/solr/core/src/test/org/apache/solr/cli/PostToolTest.java @@ -20,10 +20,22 @@ package org.apache.solr.cli; import static org.apache.solr.cli.SolrCLI.findTool; import static org.apache.solr.cli.SolrCLI.parseCmdLine; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import org.apache.commons.cli.CommandLine; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.solrj.request.CollectionAdminRequest; @@ -33,6 +45,11 @@ import org.apache.solr.common.util.Utils; import org.junit.BeforeClass; import org.junit.Test; +/** + * NOTE: do *not* use real hostnames, not even "example.com", in the webcrawler tests. + * + * <p>A MockPageFetcher is used to prevent real HTTP requests from being executed. 
+ */ @SolrTestCaseJ4.SuppressSSL public class PostToolTest extends SolrCloudTestCase { @@ -58,7 +75,7 @@ public class PostToolTest extends SolrCloudTestCase { String[] args = { "post", - "-url", + "--solr-update-url", cluster.getJettySolrRunner(0).getBaseUrl() + "/" + collection + "/update", jsonDoc.getAbsolutePath() }; @@ -90,4 +107,205 @@ public class PostToolTest extends SolrCloudTestCase { CommandLine cli = parseCmdLine(tool.getName(), args, tool.getOptions()); return tool.runTool(cli); } + + @Test + public void testNormalizeUrlEnding() { + assertEquals("http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/")); + assertEquals( + "http://[ff01::114]", PostTool.normalizeUrlEnding("http://[ff01::114]/#foo?bar=baz")); + assertEquals( + "http://[ff01::114]/index.html", + PostTool.normalizeUrlEnding("http://[ff01::114]/index.html#hello")); + } + + @Test + public void testComputeFullUrl() throws IOException { + + PostTool webPostTool = new PostTool(); + + assertEquals( + "http://[ff01::114]/index.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "/index.html")); + assertEquals( + "http://[ff01::114]/index.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo/bar/"), "/index.html")); + assertEquals( + "http://[ff01::114]/fil.html", + webPostTool.computeFullUrl(new URL("http://[ff01::114]/foo.htm?baz#hello"), "fil.html")); + // TODO: How to know what is the base if URL path ends with "foo"?? + // assertEquals("http://[ff01::114]/fil.html", t_web.computeFullUrl(new + // URL("http://[ff01::114]/foo?baz#hello"), "fil.html")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "fil.jpg")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "mailto:[email protected]")); + assertNull(webPostTool.computeFullUrl(new URL("http://[ff01::114]/"), "ftp://server/file")); + } + + @Test + public void testTypeSupported() { + PostTool postTool = new PostTool(); + + assertTrue(postTool.typeSupported("application/pdf")); + assertTrue(postTool.typeSupported("application/xml")); + assertFalse(postTool.typeSupported("text/foo")); + + postTool.fileTypes = "doc,xls,ppt"; + postTool.fileFilter = postTool.getFileFilterFromFileTypes(postTool.fileTypes); + assertFalse(postTool.typeSupported("application/pdf")); + assertTrue(postTool.typeSupported("application/msword")); + } + + @Test + public void testAppendParam() { + assertEquals( + "http://[ff01::114]?foo=bar", PostTool.appendParam("http://[ff01::114]", "foo=bar")); + assertEquals( + "http://[ff01::114]/?a=b&foo=bar", + PostTool.appendParam("http://[ff01::114]/?a=b", "foo=bar")); + } + + @Test + public void testAppendUrlPath() throws MalformedURLException { + assertEquals( + new URL("http://[ff01::114]/a?foo=bar"), + PostTool.appendUrlPath(new URL("http://[ff01::114]?foo=bar"), "/a")); + } + + @Test + public void testGuessType() { + File f = new File("foo.doc"); + assertEquals("application/msword", PostTool.guessType(f)); + f = new File("foobar"); + assertEquals("application/octet-stream", PostTool.guessType(f)); + f = new File("foo.json"); + assertEquals("application/json", PostTool.guessType(f)); + } + + @Test + public void testDoFilesMode() throws MalformedURLException { + PostTool postTool = new PostTool(); + postTool.recursive = 0; + postTool.dryRun = true; + postTool.solrUpdateUrl = new URL("http://localhost:8983/solr/fake/update"); + File dir = getFile("exampledocs"); + int num = postTool.postFiles(new String[] {dir.toString()}, 0, null, null); + assertEquals(2, num); + } + 
+ @Test + public void testDoWebMode() throws IOException, URISyntaxException { + PostTool postTool = new PostTool(); + postTool.pageFetcher = new MockPageFetcher(); + postTool.dryRun = true; + postTool.solrUpdateUrl = new URL("http://user:password@localhost:5150/solr/fake/update"); + + // Uses mock pageFetcher + postTool.delay = 0; + postTool.recursive = 5; + int num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); + assertEquals(5, num); + + postTool.recursive = 1; + num = postTool.postWebPages(new String[] {"http://[ff01::114]/"}, 0, null); + assertEquals(3, num); + + // Without respecting robots.txt + postTool.pageFetcher.robotsCache.put("[ff01::114]", Collections.emptyList()); + postTool.recursive = 5; + num = postTool.postWebPages(new String[] {"http://[ff01::114]/#removeme"}, 0, null); + assertEquals(6, num); + } + + @Test + public void testRobotsExclusion() throws IOException, URISyntaxException { + PostTool postTool = new PostTool(); + postTool.pageFetcher = new MockPageFetcher(); + postTool.dryRun = true; + + assertFalse(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/"))); + assertTrue(postTool.pageFetcher.isDisallowedByRobots(new URL("http://[ff01::114]/disallowed"))); + assertEquals( + "There should be two entries parsed from robots.txt", + 2, + postTool.pageFetcher.robotsCache.get("[ff01::114]").size()); + } + + static class MockPageFetcher extends PostTool.PageFetcher { + HashMap<String, String> htmlMap = new HashMap<>(); + HashMap<String, Set<URI>> linkMap = new HashMap<>(); + + public MockPageFetcher() throws IOException, URISyntaxException { + (new PostTool()).super(); + htmlMap.put( + "http://[ff01::114]", + "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); + htmlMap.put( + "http://[ff01::114]/index.html", + "<html><body><a href=\"http://[ff01::114]/page1\">page1</a><a href=\"http://[ff01::114]/page2\">page2</a></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1", + "<html><body><a href=\"http://[ff01::114]/page1/foo\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1/foo", + "<html><body><a href=\"http://[ff01::114]/page1/foo/bar\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page1/foo/bar", + "<html><body><a href=\"http://[ff01::114]/page1\"></body></html>"); + htmlMap.put( + "http://[ff01::114]/page2", + "<html><body><a href=\"http://[ff01::114]/\"><a href=\"http://[ff01::114]/disallowed\"/></body></html>"); + htmlMap.put( + "http://[ff01::114]/disallowed", + "<html><body><a href=\"http://[ff01::114]/\"></body></html>"); + + Set<URI> s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1")); + s.add(new URI("http://[ff01::114]/page2")); + linkMap.put("http://[ff01::114]", s); + linkMap.put("http://[ff01::114]/index.html", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1/foo")); + linkMap.put("http://[ff01::114]/page1", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/page1/foo/bar")); + linkMap.put("http://[ff01::114]/page1/foo", s); + s = new HashSet<>(); + s.add(new URI("http://[ff01::114]/disallowed")); + linkMap.put("http://[ff01::114]/page2", s); + + // Simulate a robots.txt file with comments and a few disallows + StringBuilder sb = new StringBuilder(); + sb.append( + "# Comments appear after the \"#\" symbol at the start of a line, or after a directive\n"); + sb.append("User-agent: * # match all bots\n"); + sb.append("Disallow: # This is void\n"); + 
sb.append("Disallow: /disallow # Disallow this path\n"); + sb.append("Disallow: /nonexistentpath # Disallow this path\n"); + this.robotsCache.put( + "[ff01::114]", + super.parseRobotsTxt( + new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8)))); + } + + @Override + public PostTool.PageFetcherResult readPageFromUrl(URL u) { + PostTool.PageFetcherResult res = new PostTool.PageFetcherResult(); + if (isDisallowedByRobots(u)) { + res.httpStatus = 403; + return res; + } + res.httpStatus = 200; + res.contentType = "text/html"; + res.content = ByteBuffer.wrap(htmlMap.get(u.toString()).getBytes(StandardCharsets.UTF_8)); + return res; + } + + @Override + public Set<URI> getLinksFromWebPage(URL url, InputStream is, String type, URL postUrl) { + Set<URI> s = linkMap.get(PostTool.normalizeUrlEnding(url.toString())); + if (s == null) { + s = new HashSet<>(); + } + return s; + } + } } diff --git a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java index b6c63148a4e..a40903c1f0a 100644 --- a/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/SolrCloudExampleTest.java @@ -20,22 +20,15 @@ import java.io.File; import java.lang.invoke.MethodHandles; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Random; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import java.util.stream.Stream; import org.apache.commons.cli.CommandLine; import org.apache.solr.cli.CreateCollectionTool; import org.apache.solr.cli.DeleteTool; import org.apache.solr.cli.HealthcheckTool; +import org.apache.solr.cli.PostTool; import org.apache.solr.cli.SolrCLI; import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.request.StreamingUpdateRequest; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.util.ExternalPaths; @@ -115,52 +108,36 @@ public class SolrCloudExampleTest extends AbstractFullDistribZkTestBase { invalidToolExitStatus, tool.runTool(cli)); - // now index docs like bin/solr post would, but we can't use SimplePostTool because it uses - // System.exit when it encounters an error, which JUnit doesn't like ... + // now index docs ... 
log.info("Created collection, now posting example docs!"); Path exampleDocsDir = Path.of(ExternalPaths.SOURCE_HOME, "example", "exampledocs"); assertTrue(exampleDocsDir.toAbsolutePath() + " not found!", Files.isDirectory(exampleDocsDir)); - List<Path> xmlFiles; - try (Stream<Path> stream = Files.walk(exampleDocsDir, 1)) { - xmlFiles = - stream - .filter(path -> path.getFileName().toString().endsWith(".xml")) - // don't rely on File.compareTo, it's behavior varies by OS - .sorted(Comparator.comparing(path -> path.getFileName().toString())) - // be explicit about the collection type because we will shuffle it later - .collect(Collectors.toCollection(ArrayList::new)); - } + String[] argsForPost = + new String[] { + "--solr-update-url", + solrUrl + "/" + testCollectionName + "/update", + "-filetypes", + "xml", + exampleDocsDir.toAbsolutePath().toString() + }; - // force a deterministic random ordering of the files so seeds reproduce regardless of - // platform/filesystem - Collections.shuffle(xmlFiles, new Random(random().nextLong())); + PostTool postTool = new PostTool(); + CommandLine postCli = + SolrCLI.processCommandLineArgs(postTool.getName(), postTool.getOptions(), argsForPost); + postTool.runTool(postCli); - // if you add/remove example XML docs, you'll have to fix these expected values - int expectedXmlFileCount = 14; int expectedXmlDocCount = 32; - assertEquals( - "Unexpected # of example XML files in " + exampleDocsDir.toAbsolutePath(), - expectedXmlFileCount, - xmlFiles.size()); - - for (Path xml : xmlFiles) { - if (log.isInfoEnabled()) { - log.info("POSTing {}", xml.toAbsolutePath()); - } - cloudClient.request( - new StreamingUpdateRequest("/update", xml, "application/xml"), testCollectionName); - } - cloudClient.commit(testCollectionName); - int numFound = 0; // give the update a chance to take effect. for (int idx = 0; idx < 100; ++idx) { QueryResponse qr = cloudClient.query(testCollectionName, new SolrQuery("*:*")); numFound = (int) qr.getResults().getNumFound(); - if (numFound == expectedXmlDocCount) break; + if (numFound == expectedXmlDocCount) { + break; + } Thread.sleep(100); } assertEquals("*:* found unexpected number of documents", expectedXmlDocCount, numFound); diff --git a/solr/packaging/test/test_post.bats b/solr/packaging/test/test_post.bats index 34f39cfad87..1dcb561afa8 100644 --- a/solr/packaging/test/test_post.bats +++ b/solr/packaging/test/test_post.bats @@ -78,7 +78,7 @@ teardown() { solr create_collection -c monitors_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update -commit ${SOLR_TIP}/example/exampledocs/monitor.xml + run solr post -url http://localhost:${SOLR_PORT}/solr/monitors_no_type/update ${SOLR_TIP}/example/exampledocs/monitor.xml assert_output --partial '1 files indexed.' refute_output --partial 'ERROR' @@ -87,7 +87,7 @@ teardown() { solr create_collection -c books_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.json + run solr post -url http://localhost:${SOLR_PORT}/solr/books_no_type/update ${SOLR_TIP}/example/exampledocs/books.json assert_output --partial '1 files indexed.' 
refute_output --partial 'ERROR' @@ -96,7 +96,7 @@ teardown() { solr create_collection -c books_csv_no_type -d _default - run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update -commit ${SOLR_TIP}/example/exampledocs/books.csv + run solr post -url http://localhost:${SOLR_PORT}/solr/books_csv_no_type/update ${SOLR_TIP}/example/exampledocs/books.csv assert_output --partial '1 files indexed.' refute_output --partial 'ERROR' @@ -104,12 +104,22 @@ teardown() { assert_output --partial '"numFound":10' } +@test "crawling a directory as a dry-run" { + + # We filter to xml,json,and csv as we don't want to invoke the Extract handler, and are running it as a dry run + run solr post --dry-run -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/foobar/update -skipcommit ${SOLR_TIP}/example/exampledocs + + assert_output --partial 'Dry run complete. 16 would have been indexed.' + refute_output --partial '16 files indexed.' + refute_output --partial 'ERROR' +} + @test "crawling a directory" { solr create_collection -c mixed_content -d _default # We filter to xml,json,and csv as we don't want to invoke the Extract handler. - run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update -commit ${SOLR_TIP}/example/exampledocs + run solr post -filetypes xml,json,csv -url http://localhost:${SOLR_PORT}/solr/mixed_content/update ${SOLR_TIP}/example/exampledocs assert_output --partial '16 files indexed.' refute_output --partial 'ERROR' @@ -129,7 +139,7 @@ teardown() { } }' "http://localhost:${SOLR_PORT}/solr/webcrawl/config" - run solr post -mode web -url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org + run solr post -mode web --solr-update-url http://localhost:${SOLR_PORT}/webcrawl/update -recursive 1 -delay 1 https://solr.apache.org assert_output --partial 'Entering crawl at level 0' } @@ -152,7 +162,7 @@ teardown() { run solr create_collection -c test_args -d _default assert_output --partial "Created collection 'test_args'" - run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out -commit "<delete><query>*:*</query></delete>" + run solr post -url http://localhost:${SOLR_PORT}/solr/test_args/update -mode args -type application/xml -out "<delete><query>*:*</query></delete>" assert_output --partial '<int name="status">0</int>' # confirm default type
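
As a minimal sketch of how the options introduced or renamed by this commit compose on the command line (the port, collection name, and document path below are illustrative assumptions, not values taken from the commit):

# Dry run (files mode only): report what would be posted, with no network
# traffic to Solr, using the new --dry-run flag and the new long form of -url.
bin/solr post --dry-run -filetypes xml,json,csv \
  --solr-update-url http://localhost:8983/solr/techproducts/update \
  /path/to/exampledocs

# The short form -url still works; args mode posts a command directly.
# Note there is no -commit: since SOLR-17147 a commit happens by default,
# and -skipcommit opts out.
bin/solr post -url http://localhost:8983/solr/techproducts/update \
  -mode args -type application/xml "<delete><query>*:*</query></delete>"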
