Author: chetanm Date: Mon Jun 19 08:48:40 2017 New Revision: 1799157 URL: http://svn.apache.org/viewvc?rev=1799157&view=rev Log: OAK-6362 - Use NodeStoreFixtureProvider in tika command of oak-run
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java (with props) Removed: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/SegmentTarUtils.java Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java?rev=1799157&r1=1799156&r2=1799157&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java (original) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java Mon Jun 19 08:48:40 2017 @@ -55,6 +55,9 @@ public class CSVFileGenerator { br.getEncoding(), br.getPath() ); + if (count % 1000 == 0) { + log.info("Processed {} binaries so far", count); + } } printer.flush(); log.info("Generated csv output at {} with {} entries", outFile.getAbsolutePath(), count); Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1799157&r1=1799156&r2=1799157&view=diff ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (original) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Mon Jun 19 08:48:40 2017 @@ -21,42 
+21,24 @@ package org.apache.jackrabbit.oak.plugin import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; -import static java.util.Arrays.asList; -import java.io.Closeable; import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.UUID; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.jackrabbit.aws.ext.ds.S3DataStore; -import org.apache.jackrabbit.core.data.DataStore; -import org.apache.jackrabbit.core.data.DataStoreException; -import org.apache.jackrabbit.core.data.FileDataStore; -import org.apache.jackrabbit.oak.commons.PropertiesUtil; -import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore; + import org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter; -import org.apache.jackrabbit.oak.plugins.document.DocumentMK; -import org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore; -import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection; +import org.apache.jackrabbit.oak.run.cli.BlobStoreFixture; +import org.apache.jackrabbit.oak.run.cli.BlobStoreFixtureProvider; +import org.apache.jackrabbit.oak.run.cli.CommonOptions; +import org.apache.jackrabbit.oak.run.cli.NodeStoreFixture; +import org.apache.jackrabbit.oak.run.cli.NodeStoreFixtureProvider; +import org.apache.jackrabbit.oak.run.cli.Options; import org.apache.jackrabbit.oak.spi.blob.BlobStore; import org.apache.jackrabbit.oak.spi.state.NodeStore; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.common.collect.Maps; import com.google.common.io.Closer; -import com.mongodb.MongoClientURI; -import com.mongodb.MongoURI; import joptsimple.OptionParser; -import joptsimple.OptionSet; -import joptsimple.OptionSpec; public class TextExtractorMain { private static final Logger log = 
LoggerFactory.getLogger(TextExtractorMain.class); @@ -65,147 +47,65 @@ public class TextExtractorMain { } public static void main(String[] args) throws Exception { - Closer closer = Closer.create(); - String h = "tika [extract|report|generate]\n" + - "\n" + - "report : Generates a summary report related to binary data\n" + - "extract : Performs the text extraction\n" + - "generate : Generates the csv data file based on configured NodeStore/BlobStore"; - try { - OptionParser parser = new OptionParser(); - OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"), - "show help").forHelp(); - - OptionSpec<String> nodeStoreSpec = parser - .accepts("nodestore", "NodeStore detail /path/to/oak/repository | mongodb://host:port/database") - .withRequiredArg() - .ofType(String.class); - - OptionSpec<String> pathSpec = parser - .accepts("path", "Path in repository under which the binaries would be searched") - .withRequiredArg() - .ofType(String.class); - - OptionSpec<File> dataFileSpec = parser - .accepts("data-file", "Data file in csv format containing the binary metadata") - .withRequiredArg() - .ofType(File.class); - - OptionSpec<File> tikaConfigSpec = parser - .accepts("tika-config", "Tika config file path") - .withRequiredArg() - .ofType(File.class); - - OptionSpec<File> fdsDirSpec = parser - .accepts("fds-path", "Path of directory used by FileDataStore") - .withRequiredArg() - .ofType(File.class); - - OptionSpec<File> s3ConfigSpec = parser - .accepts("s3-config-path", "Path of properties file containing config for S3DataStore") - .withRequiredArg() - .ofType(File.class); - - OptionSpec<File> storeDirSpec = parser - .accepts("store-path", "Path of directory used to store extracted text content") - .withRequiredArg() - .ofType(File.class); - - OptionSpec<Integer> poolSize = parser - .accepts("pool-size", "Size of the thread pool used to perform text extraction. 
Defaults " + - "to number of cores on the system") - .withRequiredArg() - .ofType(Integer.class); - - OptionSpec<String> nonOption = parser.nonOptions(h); - - OptionSet options = parser.parse(args); - List<String> nonOptions = nonOption.values(options); - - if (options.has(help)) { - parser.printHelpOn(System.out); - System.exit(0); - } + OptionParser parser = new OptionParser(); - if (nonOptions.isEmpty()) { - parser.printHelpOn(System.err); - System.exit(1); - } - - boolean report = nonOptions.contains("report"); - boolean extract = nonOptions.contains("extract"); - boolean generate = nonOptions.contains("generate"); - File dataFile = null; - File storeDir = null; - File tikaConfigFile = null; - BlobStore blobStore = null; + Options opts = new Options(); + opts.setCommandName(TikaCommandOptions.NAME); + opts.setSummary("Provides text extraction related operations"); + opts.setConnectionString(CommonOptions.DEFAULT_CONNECTION_STRING); + opts.registerOptionsFactory(TikaCommandOptions.FACTORY); + opts.parseAndConfigure(parser, args); + + TikaCommandOptions tikaOpts = opts.getOptionBean(TikaCommandOptions.class); + + try (Closer closer = Closer.create()) { + boolean report = tikaOpts.report(); + boolean extract = tikaOpts.extract(); + boolean generate = tikaOpts.generate(); + BlobStore blobStore; + NodeStore nodeStore = null; + File dataFile = tikaOpts.getDataFile(); + File storeDir = tikaOpts.getStoreDir(); + File tikaConfigFile = tikaOpts.getTikaConfig(); BinaryResourceProvider binaryResourceProvider = null; BinaryStats stats = null; - String path = "/"; + String path = tikaOpts.getPath(); - if (options.has(tikaConfigSpec)) { - tikaConfigFile = tikaConfigSpec.value(options); + if (tikaConfigFile != null) { checkArgument(tikaConfigFile.exists(), "Tika config file %s does not exist", tikaConfigFile.getAbsolutePath()); } - if (options.has(storeDirSpec)) { - storeDir = storeDirSpec.value(options); + if (storeDir != null) { if (storeDir.exists()) { 
checkArgument(storeDir.isDirectory(), "Path [%s] specified for storing extracted " + - "text content '%s' is not a directory", storeDir.getAbsolutePath(), storeDirSpec.options()); + "text content is not a directory", storeDir.getAbsolutePath()); } } - if (options.has(fdsDirSpec)) { - File fdsDir = fdsDirSpec.value(options); - checkArgument(fdsDir.exists(), "FileDataStore %s does not exist", fdsDir.getAbsolutePath()); - FileDataStore fds = new FileDataStore(); - fds.setPath(fdsDir.getAbsolutePath()); - fds.init(null); - blobStore = new DataStoreBlobStore(fds); - } - - if (options.has(s3ConfigSpec)){ - File s3Config = s3ConfigSpec.value(options); - checkArgument(s3Config.exists() && s3Config.canRead(), "S3DataStore config cannot be read from [%s]", - s3Config.getAbsolutePath()); - Properties props = loadProperties(s3Config); - log.info("Loaded properties for S3DataStore from {}", s3Config.getAbsolutePath()); - String pathProp = "path"; - String repoPath = props.getProperty(pathProp); - checkNotNull(repoPath, "Missing required property [%s] from S3DataStore config loaded from [%s]", pathProp, s3Config); - - //Check if 'secret' key is defined. It should be non null for references - //to be generated. 
As the ref are transient we can just use any random value - //if not specified - String secretConfig = "secret"; - if (props.getProperty(secretConfig) == null){ - props.setProperty(secretConfig, UUID.randomUUID().toString()); - } - - log.info("Using {} for S3DataStore ", repoPath); - DataStore ds = createS3DataStore(props); - PropertiesUtil.populate(ds, toMap(props), false); - ds.init(pathProp); - blobStore = new DataStoreBlobStore(ds); - closer.register(asCloseable(ds)); - } + checkNotNull(dataFile, "Data file not configured with %s", tikaOpts.getDataFileSpecOpt()); - if (options.has(dataFileSpec)) { - dataFile = dataFileSpec.value(options); + if (!generate) { + //For report and extract case we do not need NodeStore access so create BlobStore directly + BlobStoreFixture blobStoreFixture = BlobStoreFixtureProvider.create(opts); + closer.register(blobStoreFixture); + blobStore = checkNotNull(blobStoreFixture).getBlobStore(); + } else { + NodeStoreFixture nodeStoreFixture = NodeStoreFixtureProvider.create(opts); + closer.register(nodeStoreFixture); + blobStore = nodeStoreFixture.getBlobStore(); + nodeStore = nodeStoreFixture.getStore(); } - checkNotNull(dataFile, "Data file not configured with %s", dataFileSpec); + checkNotNull(blobStore, "This command requires an external BlobStore configured"); if (report || extract) { checkArgument(dataFile.exists(), "Data file %s does not exist", dataFile.getAbsolutePath()); - binaryResourceProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore); - if (binaryResourceProvider instanceof Closeable) { - closer.register((Closeable) binaryResourceProvider); - } + CSVFileBinaryResourceProvider csvProvider = new CSVFileBinaryResourceProvider(dataFile, blobStore); + closer.register(csvProvider); + binaryResourceProvider = csvProvider; stats = new BinaryStats(tikaConfigFile, binaryResourceProvider); String summary = stats.getSummary(); @@ -213,11 +113,8 @@ public class TextExtractorMain { } if (generate){ - String src = 
nodeStoreSpec.value(options); - checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + - "must be specified via %s", fdsDirSpec.options()); checkNotNull(dataFile, "Data file path not provided"); - NodeStore nodeStore = bootStrapNodeStore(src, blobStore, closer); + log.info("Generated csv data to be stored in {}", dataFile.getAbsolutePath()); BinaryResourceProvider brp = new NodeStoreBinaryResourceProvider(nodeStore, blobStore); CSVFileGenerator generator = new CSVFileGenerator(dataFile); generator.generate(brp.getBinaries(path)); @@ -225,25 +122,20 @@ public class TextExtractorMain { if (extract) { checkNotNull(storeDir, "Directory to store extracted text content " + - "must be specified via %s", storeDirSpec.options()); - checkNotNull(blobStore, "BlobStore found to be null. FileDataStore directory " + - "must be specified via %s", fdsDirSpec.options()); + "must be specified via %s", tikaOpts.getStoreDirSpecOpt()); + checkNotNull(blobStore, "BlobStore found to be null."); DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, false); TextExtractor extractor = new TextExtractor(writer); - if (options.has(poolSize)) { - extractor.setThreadPoolSize(poolSize.value(options)); + if (tikaOpts.isPoolSizeDefined()) { + extractor.setThreadPoolSize(tikaOpts.getPoolSize()); } if (tikaConfigFile != null) { extractor.setTikaConfig(tikaConfigFile); } - if (options.has(pathSpec)) { - path = pathSpec.value(options); - } - closer.register(writer); closer.register(extractor); @@ -254,89 +146,6 @@ public class TextExtractorMain { extractor.close(); writer.close(); } - - } catch (Throwable e) { - throw closer.rethrow(e); - } finally { - closer.close(); - } - } - - private static Map<String, ?> toMap(Properties properties) { - Map<String, String> map = Maps.newHashMap(); - for (final String name: properties.stringPropertyNames()) { - map.put(name, properties.getProperty(name)); - } - return map; - } - - private static DataStore 
createS3DataStore(Properties props) throws IOException { - S3DataStore s3ds = new S3DataStore(); - s3ds.setProperties(props); - return s3ds; - } - - private static Properties loadProperties(File s3Config) throws IOException { - Properties props = new Properties(); - InputStream is = FileUtils.openInputStream(s3Config); - try{ - props.load(is); - } finally { - IOUtils.closeQuietly(is); - } - return props; - } - - private static NodeStore bootStrapNodeStore(String src, BlobStore blobStore, Closer closer) throws IOException { - if (src.startsWith(MongoURI.MONGODB_PREFIX)) { - MongoClientURI uri = new MongoClientURI(src); - if (uri.getDatabase() == null) { - System.err.println("Database missing in MongoDB URI: " - + uri.getURI()); - System.exit(1); - } - MongoConnection mongo = new MongoConnection(uri.getURI()); - closer.register(asCloseable(mongo)); - DocumentNodeStore store = new DocumentMK.Builder() - .setBlobStore(blobStore) - .setMongoDB(mongo.getDB()) - .setReadOnlyMode() - .getNodeStore(); - closer.register(asCloseable(store)); - return store; } - - return SegmentTarUtils.bootstrap(src, blobStore, closer); - } - - private static Closeable asCloseable(final DataStore ds) { - return new Closeable() { - @Override - public void close() throws IOException { - try { - ds.close(); - } catch (DataStoreException e) { - throw new IOException(e); - } - } - }; - } - - private static Closeable asCloseable(final DocumentNodeStore dns) { - return new Closeable() { - @Override - public void close() throws IOException { - dns.dispose(); - } - }; - } - - private static Closeable asCloseable(final MongoConnection con) { - return new Closeable() { - @Override - public void close() throws IOException { - con.close(); - } - }; } } \ No newline at end of file Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java?rev=1799157&view=auto ============================================================================== --- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java (added) +++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java Mon Jun 19 08:48:40 2017 @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.jackrabbit.oak.plugins.tika; + +import java.io.File; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.apache.jackrabbit.oak.run.cli.OptionsBean; +import org.apache.jackrabbit.oak.run.cli.OptionsBeanFactory; + +public class TikaCommandOptions implements OptionsBean { + public static final String NAME = "tika"; + + public static final OptionsBeanFactory FACTORY = TikaCommandOptions::new; + + private final OptionSpec<String> pathOpt; + private final OptionSpec<File> dataFileSpecOpt; + private final OptionSpec<File> tikaConfigSpecOpt; + private final OptionSpec<File> storeDirSpecOpt; + private final OptionSpec<Integer> poolSizeOpt; + + private final OptionSpec<Void> reportAction; + private final OptionSpec<Void> generateAction; + private final OptionSpec<Void> extractAction; + + private final Set<String> operationNames; + + private OptionSet options; + + public TikaCommandOptions(OptionParser parser) { + pathOpt = parser + .accepts("path", "Path in repository under which the binaries would be searched") + .withRequiredArg() + .ofType(String.class) + .defaultsTo("/"); + + dataFileSpecOpt = parser + .accepts("data-file", "Data file in csv format containing the binary metadata") + .withRequiredArg() + .ofType(File.class) + .defaultsTo(new File("oak-binary-stats.csv")); + + tikaConfigSpecOpt = parser + .accepts("tika-config", "Tika config file path") + .withRequiredArg() + .ofType(File.class); + + storeDirSpecOpt = parser + .accepts("store-path", "Path of directory used to store extracted text content") + .withRequiredArg() + .ofType(File.class); + + poolSizeOpt = parser + .accepts("pool-size", "Size of the thread pool used to perform text extraction. 
Defaults " + + "to number of cores on the system") + .withRequiredArg() + .ofType(Integer.class); + + reportAction = parser.accepts("report", "Generates a summary report based on the csv file"); + generateAction = parser.accepts("generate", "Generates the CSV file required for 'extract' and 'report' actions"); + extractAction = parser.accepts("extract", "Performs the text extraction based on the csv file"); + + operationNames = ImmutableSet.of("report", "generate", "extract"); + } + + @Override + public void configure(OptionSet options) { + this.options = options; + } + + @Override + public String title() { + return ""; + } + + @Override + public String description() { + return "The tika command supports following operations. All operations connect to repository in read only mode. \n" + + "Use of one of the supported actions like --report, --generate, --extract etc. "; + } + + @Override + public int order() { + return 50; + } + + @Override + public Set<String> operationNames() { + return operationNames; + } + + public String getPath() { + return pathOpt.value(options); + } + + public File getDataFile() { + return dataFileSpecOpt.value(options); + } + + public File getTikaConfig() { + return tikaConfigSpecOpt.value(options); + } + + public File getStoreDir() { + return storeDirSpecOpt.value(options); + } + + public boolean isPoolSizeDefined() { + return options.has(poolSizeOpt); + } + + public int getPoolSize() { + return poolSizeOpt.value(options); + } + + public boolean report() { + //The non option mode is for compatibility support with previous versions + return options.has(reportAction) || hasNonOption("report"); + } + + public boolean generate() { + return options.has(generateAction) || hasNonOption("generate"); + } + + public boolean extract() { + return options.has(extractAction) || hasNonOption("extract"); + } + + public OptionSpec<File> getDataFileSpecOpt() { + return dataFileSpecOpt; + } + + public OptionSpec<File> getStoreDirSpecOpt() { + return
storeDirSpecOpt; + } + + private boolean hasNonOption(String name) { + return options.nonOptionArguments().contains(name); + } +} Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java ------------------------------------------------------------------------------ svn:eol-style = native