Author: chetanm
Date: Mon Jun 19 08:48:40 2017
New Revision: 1799157

URL: http://svn.apache.org/viewvc?rev=1799157&view=rev
Log:
OAK-6362 - Use NodeStoreFixtureProvider in tika command of oak-run

Added:
    
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
   (with props)
Removed:
    
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/SegmentTarUtils.java
Modified:
    
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
    
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java

Modified: 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java?rev=1799157&r1=1799156&r2=1799157&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/CSVFileGenerator.java
 Mon Jun 19 08:48:40 2017
@@ -55,6 +55,9 @@ public class CSVFileGenerator {
                         br.getEncoding(),
                         br.getPath()
                 );
+                if (count % 1000 == 0) {
+                    log.info("Processed {} binaries so far", count);
+                }
             }
             printer.flush();
             log.info("Generated csv output at {} with {} entries", 
outFile.getAbsolutePath(), count);

Modified: 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1799157&r1=1799156&r2=1799157&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
 Mon Jun 19 08:48:40 2017
@@ -21,42 +21,24 @@ package org.apache.jackrabbit.oak.plugin
 
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Preconditions.checkNotNull;
-import static java.util.Arrays.asList;
 
-import java.io.Closeable;
 import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.UUID;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.jackrabbit.aws.ext.ds.S3DataStore;
-import org.apache.jackrabbit.core.data.DataStore;
-import org.apache.jackrabbit.core.data.DataStoreException;
-import org.apache.jackrabbit.core.data.FileDataStore;
-import org.apache.jackrabbit.oak.commons.PropertiesUtil;
-import org.apache.jackrabbit.oak.plugins.blob.datastore.DataStoreBlobStore;
+
 import org.apache.jackrabbit.oak.plugins.index.datastore.DataStoreTextWriter;
-import org.apache.jackrabbit.oak.plugins.document.DocumentMK;
-import org.apache.jackrabbit.oak.plugins.document.DocumentNodeStore;
-import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
+import org.apache.jackrabbit.oak.run.cli.BlobStoreFixture;
+import org.apache.jackrabbit.oak.run.cli.BlobStoreFixtureProvider;
+import org.apache.jackrabbit.oak.run.cli.CommonOptions;
+import org.apache.jackrabbit.oak.run.cli.NodeStoreFixture;
+import org.apache.jackrabbit.oak.run.cli.NodeStoreFixtureProvider;
+import org.apache.jackrabbit.oak.run.cli.Options;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.google.common.collect.Maps;
 import com.google.common.io.Closer;
-import com.mongodb.MongoClientURI;
-import com.mongodb.MongoURI;
 
 import joptsimple.OptionParser;
-import joptsimple.OptionSet;
-import joptsimple.OptionSpec;
 
 public class TextExtractorMain {
     private static final Logger log = 
LoggerFactory.getLogger(TextExtractorMain.class);
@@ -65,147 +47,65 @@ public class TextExtractorMain {
     }
 
     public static void main(String[] args) throws Exception {
-        Closer closer = Closer.create();
-        String h = "tika [extract|report|generate]\n" +
-                "\n" +
-                "report   : Generates a summary report related to binary 
data\n" +
-                "extract  : Performs the text extraction\n" +
-                "generate : Generates the csv data file based on configured 
NodeStore/BlobStore";
-        try {
-            OptionParser parser = new OptionParser();
-            OptionSpec<?> help = parser.acceptsAll(asList("h", "?", "help"),
-                    "show help").forHelp();
-
-            OptionSpec<String> nodeStoreSpec = parser
-                    .accepts("nodestore", "NodeStore detail 
/path/to/oak/repository | mongodb://host:port/database")
-                    .withRequiredArg()
-                    .ofType(String.class);
-
-            OptionSpec<String> pathSpec = parser
-                    .accepts("path", "Path in repository under which the 
binaries would be searched")
-                    .withRequiredArg()
-                    .ofType(String.class);
-
-            OptionSpec<File> dataFileSpec = parser
-                    .accepts("data-file", "Data file in csv format containing 
the binary metadata")
-                    .withRequiredArg()
-                    .ofType(File.class);
-
-            OptionSpec<File> tikaConfigSpec = parser
-                    .accepts("tika-config", "Tika config file path")
-                    .withRequiredArg()
-                    .ofType(File.class);
-
-            OptionSpec<File> fdsDirSpec = parser
-                    .accepts("fds-path", "Path of directory used by 
FileDataStore")
-                    .withRequiredArg()
-                    .ofType(File.class);
-
-            OptionSpec<File> s3ConfigSpec = parser
-                    .accepts("s3-config-path", "Path of properties file 
containing config for S3DataStore")
-                    .withRequiredArg()
-                    .ofType(File.class);
-
-            OptionSpec<File> storeDirSpec = parser
-                    .accepts("store-path", "Path of directory used to store 
extracted text content")
-                    .withRequiredArg()
-                    .ofType(File.class);
-
-            OptionSpec<Integer> poolSize = parser
-                    .accepts("pool-size", "Size of the thread pool used to 
perform text extraction. Defaults " +
-                            "to number of cores on the system")
-                    .withRequiredArg()
-                    .ofType(Integer.class);
-
-            OptionSpec<String> nonOption = parser.nonOptions(h);
-
-            OptionSet options = parser.parse(args);
-            List<String> nonOptions = nonOption.values(options);
-
-            if (options.has(help)) {
-                parser.printHelpOn(System.out);
-                System.exit(0);
-            }
+        OptionParser parser = new OptionParser();
 
-            if (nonOptions.isEmpty()) {
-                parser.printHelpOn(System.err);
-                System.exit(1);
-            }
-
-            boolean report = nonOptions.contains("report");
-            boolean extract = nonOptions.contains("extract");
-            boolean generate = nonOptions.contains("generate");
-            File dataFile = null;
-            File storeDir = null;
-            File tikaConfigFile = null;
-            BlobStore blobStore = null;
+        Options opts = new Options();
+        opts.setCommandName(TikaCommandOptions.NAME);
+        opts.setSummary("Provides text extraction related operations");
+        opts.setConnectionString(CommonOptions.DEFAULT_CONNECTION_STRING);
+        opts.registerOptionsFactory(TikaCommandOptions.FACTORY);
+        opts.parseAndConfigure(parser, args);
+
+        TikaCommandOptions tikaOpts = 
opts.getOptionBean(TikaCommandOptions.class);
+
+        try (Closer closer = Closer.create()) {
+            boolean report = tikaOpts.report();
+            boolean extract = tikaOpts.extract();
+            boolean generate = tikaOpts.generate();
+            BlobStore blobStore;
+            NodeStore nodeStore = null;
+            File dataFile = tikaOpts.getDataFile();
+            File storeDir = tikaOpts.getStoreDir();
+            File tikaConfigFile = tikaOpts.getTikaConfig();
             BinaryResourceProvider binaryResourceProvider = null;
             BinaryStats stats = null;
-            String path = "/";
+            String path = tikaOpts.getPath();
 
-            if (options.has(tikaConfigSpec)) {
-                tikaConfigFile = tikaConfigSpec.value(options);
+            if (tikaConfigFile != null) {
                 checkArgument(tikaConfigFile.exists(), "Tika config file %s 
does not exist",
                         tikaConfigFile.getAbsolutePath());
             }
 
-            if (options.has(storeDirSpec)) {
-                storeDir = storeDirSpec.value(options);
+            if (storeDir != null) {
                 if (storeDir.exists()) {
                     checkArgument(storeDir.isDirectory(), "Path [%s] specified 
for storing extracted " +
-                            "text content '%s' is not a directory", 
storeDir.getAbsolutePath(), storeDirSpec.options());
+                            "text content is not a directory", 
storeDir.getAbsolutePath());
                 }
             }
 
-            if (options.has(fdsDirSpec)) {
-                File fdsDir = fdsDirSpec.value(options);
-                checkArgument(fdsDir.exists(), "FileDataStore %s does not 
exist", fdsDir.getAbsolutePath());
-                FileDataStore fds = new FileDataStore();
-                fds.setPath(fdsDir.getAbsolutePath());
-                fds.init(null);
-                blobStore = new DataStoreBlobStore(fds);
-            }
-
-            if (options.has(s3ConfigSpec)){
-                File s3Config = s3ConfigSpec.value(options);
-                checkArgument(s3Config.exists() && s3Config.canRead(), 
"S3DataStore config cannot be read from [%s]",
-                        s3Config.getAbsolutePath());
-                Properties props = loadProperties(s3Config);
-                log.info("Loaded properties for S3DataStore from {}", 
s3Config.getAbsolutePath());
-                String pathProp = "path";
-                String repoPath = props.getProperty(pathProp);
-                checkNotNull(repoPath, "Missing required property [%s] from 
S3DataStore config loaded from [%s]", pathProp, s3Config);
-
-                //Check if 'secret' key is defined. It should be non null for 
references
-                //to be generated. As the ref are transient we can just use 
any random value
-                //if not specified
-                String secretConfig = "secret";
-                if (props.getProperty(secretConfig) == null){
-                    props.setProperty(secretConfig, 
UUID.randomUUID().toString());
-                }
-
-                log.info("Using {} for S3DataStore ", repoPath);
-                DataStore ds = createS3DataStore(props);
-                PropertiesUtil.populate(ds, toMap(props), false);
-                ds.init(pathProp);
-                blobStore = new DataStoreBlobStore(ds);
-                closer.register(asCloseable(ds));
-            }
+            checkNotNull(dataFile, "Data file not configured with %s", 
tikaOpts.getDataFileSpecOpt());
 
-            if (options.has(dataFileSpec)) {
-                dataFile = dataFileSpec.value(options);
+            if (!generate) {
+                //For report and extract case we do not need NodeStore access 
so create BlobStore directly
+                BlobStoreFixture blobStoreFixture = 
BlobStoreFixtureProvider.create(opts);
+                closer.register(blobStoreFixture);
+                blobStore = checkNotNull(blobStoreFixture).getBlobStore();
+            } else {
+                NodeStoreFixture nodeStoreFixture = 
NodeStoreFixtureProvider.create(opts);
+                closer.register(nodeStoreFixture);
+                blobStore = nodeStoreFixture.getBlobStore();
+                nodeStore = nodeStoreFixture.getStore();
             }
 
-            checkNotNull(dataFile, "Data file not configured with %s", 
dataFileSpec);
+            checkNotNull(blobStore, "This command requires an external 
BlobStore configured");
 
             if (report || extract) {
                 checkArgument(dataFile.exists(),
                         "Data file %s does not exist", 
dataFile.getAbsolutePath());
 
-                binaryResourceProvider = new 
CSVFileBinaryResourceProvider(dataFile, blobStore);
-                if (binaryResourceProvider instanceof Closeable) {
-                    closer.register((Closeable) binaryResourceProvider);
-                }
+                CSVFileBinaryResourceProvider csvProvider = new 
CSVFileBinaryResourceProvider(dataFile, blobStore);
+                closer.register(csvProvider);
+                binaryResourceProvider = csvProvider;
 
                 stats = new BinaryStats(tikaConfigFile, 
binaryResourceProvider);
                 String summary = stats.getSummary();
@@ -213,11 +113,8 @@ public class TextExtractorMain {
             }
 
             if (generate){
-                String src = nodeStoreSpec.value(options);
-                checkNotNull(blobStore, "BlobStore found to be null. 
FileDataStore directory " +
-                        "must be specified via %s", fdsDirSpec.options());
                 checkNotNull(dataFile, "Data file path not provided");
-                NodeStore nodeStore = bootStrapNodeStore(src, blobStore, 
closer);
+                log.info("Generated csv data to be stored in {}", 
dataFile.getAbsolutePath());
                 BinaryResourceProvider brp = new 
NodeStoreBinaryResourceProvider(nodeStore, blobStore);
                 CSVFileGenerator generator = new CSVFileGenerator(dataFile);
                 generator.generate(brp.getBinaries(path));
@@ -225,25 +122,20 @@ public class TextExtractorMain {
 
             if (extract) {
                 checkNotNull(storeDir, "Directory to store extracted text 
content " +
-                        "must be specified via %s", storeDirSpec.options());
-                checkNotNull(blobStore, "BlobStore found to be null. 
FileDataStore directory " +
-                        "must be specified via %s", fdsDirSpec.options());
+                        "must be specified via %s", 
tikaOpts.getStoreDirSpecOpt());
+                checkNotNull(blobStore, "BlobStore found to be null.");
 
                 DataStoreTextWriter writer = new DataStoreTextWriter(storeDir, 
false);
                 TextExtractor extractor = new TextExtractor(writer);
 
-                if (options.has(poolSize)) {
-                    extractor.setThreadPoolSize(poolSize.value(options));
+                if (tikaOpts.isPoolSizeDefined()) {
+                    extractor.setThreadPoolSize(tikaOpts.getPoolSize());
                 }
 
                 if (tikaConfigFile != null) {
                     extractor.setTikaConfig(tikaConfigFile);
                 }
 
-                if (options.has(pathSpec)) {
-                    path = pathSpec.value(options);
-                }
-
                 closer.register(writer);
                 closer.register(extractor);
 
@@ -254,89 +146,6 @@ public class TextExtractorMain {
                 extractor.close();
                 writer.close();
             }
-
-        } catch (Throwable e) {
-            throw closer.rethrow(e);
-        } finally {
-            closer.close();
-        }
-    }
-
-    private static Map<String, ?> toMap(Properties properties) {
-        Map<String, String> map = Maps.newHashMap();
-        for (final String name: properties.stringPropertyNames()) {
-            map.put(name, properties.getProperty(name));
-        }
-        return map;
-    }
-
-    private static DataStore createS3DataStore(Properties props) throws 
IOException {
-        S3DataStore s3ds = new S3DataStore();
-        s3ds.setProperties(props);
-        return s3ds;
-    }
-
-    private static Properties loadProperties(File s3Config) throws IOException 
{
-        Properties props = new Properties();
-        InputStream is = FileUtils.openInputStream(s3Config);
-        try{
-            props.load(is);
-        } finally {
-            IOUtils.closeQuietly(is);
-        }
-        return props;
-    }
-
-    private static NodeStore bootStrapNodeStore(String src, BlobStore 
blobStore, Closer closer) throws IOException {
-        if (src.startsWith(MongoURI.MONGODB_PREFIX)) {
-            MongoClientURI uri = new MongoClientURI(src);
-            if (uri.getDatabase() == null) {
-                System.err.println("Database missing in MongoDB URI: "
-                        + uri.getURI());
-                System.exit(1);
-            }
-            MongoConnection mongo = new MongoConnection(uri.getURI());
-            closer.register(asCloseable(mongo));
-            DocumentNodeStore store = new DocumentMK.Builder()
-                    .setBlobStore(blobStore)
-                    .setMongoDB(mongo.getDB())
-                    .setReadOnlyMode()
-                    .getNodeStore();
-            closer.register(asCloseable(store));
-            return store;
         }
-
-        return SegmentTarUtils.bootstrap(src, blobStore, closer);
-    }
-
-    private static Closeable asCloseable(final DataStore ds) {
-        return new Closeable() {
-            @Override
-            public void close() throws IOException {
-                try {
-                    ds.close();
-                } catch (DataStoreException e) {
-                    throw new IOException(e);
-                }
-            }
-        };
-    }
-
-    private static Closeable asCloseable(final DocumentNodeStore dns) {
-        return new Closeable() {
-            @Override
-            public void close() throws IOException {
-                dns.dispose();
-            }
-        };
-    }
-
-    private static Closeable asCloseable(final MongoConnection con) {
-        return new Closeable() {
-            @Override
-            public void close() throws IOException {
-                con.close();
-            }
-        };
     }
 }
\ No newline at end of file

Added: 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java?rev=1799157&view=auto
==============================================================================
--- 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
 (added)
+++ 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
 Mon Jun 19 08:48:40 2017
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import java.io.File;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+import joptsimple.OptionParser;
+import joptsimple.OptionSet;
+import joptsimple.OptionSpec;
+import org.apache.jackrabbit.oak.run.cli.OptionsBean;
+import org.apache.jackrabbit.oak.run.cli.OptionsBeanFactory;
+
+public class TikaCommandOptions implements OptionsBean {
+    public static final String NAME = "tika";
+
+    public static final OptionsBeanFactory FACTORY = TikaCommandOptions::new;
+
+    private final OptionSpec<String> pathOpt;
+    private final OptionSpec<File> dataFileSpecOpt;
+    private final OptionSpec<File> tikaConfigSpecOpt;
+    private final OptionSpec<File> storeDirSpecOpt;
+    private final OptionSpec<Integer> poolSizeOpt;
+
+    private final OptionSpec<Void> reportAction;
+    private final OptionSpec<Void> generateAction;
+    private final OptionSpec<Void> extractAction;
+
+    private final Set<String> operationNames;
+
+    private OptionSet options;
+
+    public TikaCommandOptions(OptionParser parser) {
+        pathOpt = parser
+                .accepts("path", "Path in repository under which the binaries 
would be searched")
+                .withRequiredArg()
+                .ofType(String.class)
+                .defaultsTo("/");
+
+        dataFileSpecOpt = parser
+                .accepts("data-file", "Data file in csv format containing the 
binary metadata")
+                .withRequiredArg()
+                .ofType(File.class)
+                .defaultsTo(new File("oak-binary-stats.csv"));
+
+        tikaConfigSpecOpt = parser
+                .accepts("tika-config", "Tika config file path")
+                .withRequiredArg()
+                .ofType(File.class);
+
+        storeDirSpecOpt = parser
+                .accepts("store-path", "Path of directory used to store 
extracted text content")
+                .withRequiredArg()
+                .ofType(File.class);
+
+        poolSizeOpt = parser
+                .accepts("pool-size", "Size of the thread pool used to perform 
text extraction. Defaults " +
+                        "to number of cores on the system")
+                .withRequiredArg()
+                .ofType(Integer.class);
+
+        reportAction = parser.accepts("report", "Generates a summary report 
based on the csv file");
+        generateAction = parser.accepts("generate", "Generates the CSV file 
required for 'extract' and 'report' actions");
+        extractAction = parser.accepts("extract", "Performs the text 
extraction based on the csv file");
+
+        operationNames = ImmutableSet.of("report", "generate", "extract");
+    }
+
+    @Override
+    public void configure(OptionSet options) {
+        this.options = options;
+    }
+
+    @Override
+    public String title() {
+        return "";
+    }
+
+    @Override
+    public String description() {
+        return "The tika command supports following operations. All operations 
connect to repository in read only mode. \n" +
+                "Use of one of the supported actions like --report, 
--generate, --extract etc. ";
+    }
+
+    @Override
+    public int order() {
+        return 50;
+    }
+
+    @Override
+    public Set<String> operationNames() {
+        return operationNames;
+    }
+
+    public String getPath() {
+        return pathOpt.value(options);
+    }
+
+    public File getDataFile() {
+        return dataFileSpecOpt.value(options);
+    }
+
+    public File getTikaConfig() {
+        return tikaConfigSpecOpt.value(options);
+    }
+
+    public File getStoreDir() {
+        return storeDirSpecOpt.value(options);
+    }
+
+    public boolean isPoolSizeDefined() {
+        return options.has(poolSizeOpt);
+    }
+
+    public int getPoolSize() {
+        return poolSizeOpt.value(options);
+    }
+
+    public boolean report() {
+        //The non option mode is for compatibility support with previous 
versions
+        return options.has(reportAction) || hasNonOption("report");
+    }
+
+    public boolean generate() {
+        return options.has(generateAction) || hasNonOption("generate");
+    }
+
+    public boolean extract() {
+        return options.has(extractAction) || hasNonOption("extract");
+    }
+
+    public OptionSpec<File> getDataFileSpecOpt() {
+        return dataFileSpecOpt;
+    }
+
+    public OptionSpec<File> getStoreDirSpecOpt() {
+        return storeDirSpecOpt;
+    }
+
+    private boolean hasNonOption(String name) {
+        return options.nonOptionArguments().contains(name);
+    }
+}

Propchange: 
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to