[ https://issues.apache.org/jira/browse/NUTCH-3032?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17830983#comment-17830983 ]
ASF GitHub Bot commented on NUTCH-3032: --------------------------------------- lewismc commented on code in PR #810: URL: https://github.com/apache/nutch/pull/810#discussion_r1539452666 ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. 
+ * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ dierctory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. 
*/ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. 
className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); + LOG.error("\nurl was " + String.valueOf(url)); + LOG.error("\nclassName was " + String.valueOf(className)); + if (constrArgs.length > 0) { + LOG.error("\nconstrArgs[1] was " + String.valueOf(constrArgs[1])); Review Comment: Same here ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. 
+ * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ dierctory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. 
*/ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " Review Comment: Please use parameterized logging. `LOG.info("In filter() where doc is null for url == {}", String.valueOf(url));` ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. 
+ * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ dierctory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. 
*/ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. 
className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); + LOG.error("\nurl was " + String.valueOf(url)); + LOG.error("\nclassName was " + String.valueOf(className)); + if (constrArgs.length > 0) { + LOG.error("\nconstrArgs[1] was " + String.valueOf(constrArgs[1])); + } + LOG.error("\nmethodName was " + String.valueOf(className)); + if (methodArgs.length > 0) { + LOG.error("\nmethodArgs[0] was " + String.valueOf(methodArgs[0])); + } + e.printStackTrace(); + } + + LOG.debug(className + "." + methodName + "() returned " + String.valueOf(result) Review Comment: Same here ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. 
The user supplies the name of the field + * to add with the class and method names that supply the value. + * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ dierctory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. 
*/ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. */ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. className was " + String.valueOf(className)); Review Comment: Please use parameterized logging. 
########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. + * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the 
property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ dierctory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. */ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. 
*/ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. 
Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); Review Comment: Please combine log statements and use parameterized logging. 
Thanks ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. + * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended 
to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ directory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. */ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin.
*/ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. 
Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); + LOG.error("\nurl was " + String.valueOf(url)); + LOG.error("\nclassName was " + String.valueOf(className)); + if (constrArgs.length > 0) { + LOG.error("\nconstrArgs[1] was " + String.valueOf(constrArgs[1])); + } + LOG.error("\nmethodName was " + String.valueOf(className)); + if (methodArgs.length > 0) { + LOG.error("\nmethodArgs[0] was " + String.valueOf(methodArgs[0])); Review Comment: Same here ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. + * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. 
This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ directory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className. */ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields.
+ * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. + * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. 
className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); + LOG.error("\nurl was " + String.valueOf(url)); + LOG.error("\nclassName was " + String.valueOf(className)); + if (constrArgs.length > 0) { + LOG.error("\nconstrArgs[1] was " + String.valueOf(constrArgs[1])); + } + LOG.error("\nmethodName was " + String.valueOf(className)); Review Comment: Same here ########## src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java: ########## @@ -0,0 +1,266 @@ +package org.apache.nutch.indexer.arbitrary; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.hadoop.io.Text; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.lang.Class; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +import org.apache.hadoop.conf.Configuration; + +/** + * Adds arbitrary searchable fields to a document from the class and method + * the user identifies in the config. The user supplies the name of the field + * to add with the class and method names that supply the value. 
+ * + * Example:<br><br> + * <property><br> + * <name>index.arbitrary.function.count</name><br> + * <value>1</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.fieldName.0</name><br> + * <value>advisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.className.0</name><br> + * <value>com.example.arbitrary.AdvisorCalculator</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.constructorArgs.0</name><br> + * <value>Kirk</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodName.0</name><br> + * <value>countAdvisors</value><br> + * </property><br> + * <br> + * <property><br> + * <name>index.arbitrary.methodArgs.0</name><br> + * <value>Spock,McCoy</value><br> + * </property><br> + * <br> + * To set more than one arbitrary field value, + * increment {@code index.arbitrary.function.count} and + * repeat the rest of these blocks with successive int values + * appended to the property names, e.g. fieldName.1, methodName.1, etc. + */ +public class ArbitraryIndexingFilter implements IndexingFilter { + + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** How many arbitrary field definitions to set. */ + private int arbitraryAddsCount = 0; + + /** The name of the field to insert/overwrite in the NutchDocument */ + private String fieldName; + + /** The fully-qualified class name of the custom class to use for the + * new field. This class must be in the Nutch runtime classpath, + * e.g., nutch/lib/ directory. */ + private String className; + + /** The String values to pass to the custom class constructor. The plugin + * will add the document url as the first argument in className's + * String[] args. */ + private String[] userConstrArgs; + + /** The array where the plugin copies the url & the userConstrArgs + * to create the instance of className.
*/ + private String[] constrArgs; + + /** The name of the method in the custom class to call. Its return value + * will become the value of fieldName in the NutchDocument. */ + private String methodName; + + /** The String values of the arguments to methodName. It's up to the + * developer of className to do any casts/conversions from String to + * another class in the code of className. */ + private String[] methodArgs; + + /** The result that returns from methodName. The plugin will set the value + * of fieldName to this. */ + private Object result; + + /** Optional flag to determine whether to overwrite the existing value in the + * NutchDocument fieldName if this is set to true. Default behavior is to + * add the value from calling methodName to existing values for fieldName. */ + private boolean overwrite = false; + + /** Hadoop Configuration object to pass these values into the plugin. */ + private Configuration conf; + + /** + * The {@link ArbitraryIndexingFilter} filter object uses reflection + * to instantiate the configured class and invoke the configured method. + * It requires a few configuration settings for adding arbitrary fields + * and values to the NutchDocument as searchable fields. + * See {@code index.arbitrary.function.count}, and (possibly multiple + * instances when {@code index.arbitrary.function.count} > 1) of the following + * {@code index.arbitrary.fieldName}.<em>index</em>, + * {@code index.arbitrary.className}.<em>index</em>, + * {@code index.arbitrary.constructorArgs}.<em>index</em>, + * {@code index.arbitrary.methodName}.<em>index</em>, and + * {@code index.arbitrary.methodArgs}.<em>index</em> + * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0 + * to {@code index.arbitrary.function.count} - 1. 
+ * + * @param doc + * The {@link NutchDocument} object + * @param parse + * The relevant {@link Parse} object passing through the filter + * @param url + * URL to be filtered by the user-specified class + * @param datum + * The {@link CrawlDatum} entry + * @param inlinks + * The {@link Inlinks} containing anchor text + * @return filtered NutchDocument + */ + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + Class theClass = null; + Method theMethod = null; + Constructor<?> theConstructor = null; + Object instance = null; + + // This'll be quick + if (doc == null) { + LOG.info("In filter() where doc is null for url == " + + String.valueOf(url)); + return doc; + } else if (url == null) { + LOG.info("In filter() where url is null. Nothing to do."); + return doc; + } + + int cfgCounter = 0; + while (cfgCounter < arbitraryAddsCount) { + setIndexedConf(conf,cfgCounter); + cfgCounter++; + try { + theClass = Class.forName(className); + if (methodArgs.length > 0) { + theMethod = theClass.getDeclaredMethod(methodName,String[].class); + } else { + theMethod = theClass.getMethod(methodName); + } + theConstructor = theClass.getDeclaredConstructor(String[].class); + } catch (Exception e) { + LOG.error("Exception preparing reflection tasks. 
className was " + String.valueOf(className)); + e.printStackTrace(); + } + try { + constrArgs = new String[userConstrArgs.length + 1]; + constrArgs[0] = url.toString(); + System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length); + instance = theConstructor.newInstance(new Object[]{constrArgs}); + if (methodArgs.length > 0) { + result = theMethod.invoke(instance, new Object[]{methodArgs}); + } else { + result = theMethod.invoke(instance); + } + } catch (Exception e) { + LOG.error("Exception using reflection to instantiate and invoke."); + LOG.error("\nurl was " + String.valueOf(url)); + LOG.error("\nclassName was " + String.valueOf(className)); + if (constrArgs.length > 0) { + LOG.error("\nconstrArgs[1] was " + String.valueOf(constrArgs[1])); + } + LOG.error("\nmethodName was " + String.valueOf(className)); + if (methodArgs.length > 0) { + LOG.error("\nmethodArgs[0] was " + String.valueOf(methodArgs[0])); + } + e.printStackTrace(); + } + + LOG.debug(className + "." + methodName + "() returned " + String.valueOf(result) + + " for field " + String.valueOf(fieldName)); + + // If user chose to overwrite, remove existing value + if (overwrite) { + LOG.debug("overwrite == true for fieldName == " + fieldName); Review Comment: Same here > Indexing plugin as an adapter for end user's own POJO instances > --------------------------------------------------------------- > > Key: NUTCH-3032 > URL: https://issues.apache.org/jira/browse/NUTCH-3032 > Project: Nutch > Issue Type: Improvement > Components: indexer > Reporter: Joe Gilvary > Priority: Major > Labels: indexing > Attachments: NUTCH-3032.patch > > > It could be helpful to let end users manipulate information at indexing time > with their own code without the need for writing their own indexing plugin. I > mentioned this on the dev mailing list > (https://www.mail-archive.com/dev@nutch.apache.org/msg31190.html) with some > description of my work in progress. 
> One potential use is to address some of the same concerns that NUTCH-585 > discusses regarding an alternative approach to picking and choosing which > content to index, but this approach would allow making index time decisions, > rather than setting the configuration for all content at the start of the > indexing run. > -- This message was sent by Atlassian Jira (v8.20.10#820010)