http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java new file mode 100644 index 0000000..6c1bd9e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.util.domain; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.text.SimpleDateFormat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.TimingUtil; +import org.apache.nutch.util.URLUtil; + +/** + * Extracts some very basic statistics about domains from the crawldb + */ +public class DomainStatistics extends Configured implements Tool { + + private static final Logger LOG = LoggerFactory + .getLogger(DomainStatistics.class); + + private static final Text FETCHED_TEXT = new Text("FETCHED"); + private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED"); + + public static enum MyCounter { + FETCHED, NOT_FETCHED, EMPTY_RESULT + }; + + private static final int MODE_HOST = 1; + private static final int MODE_DOMAIN = 2; + private static final int MODE_SUFFIX = 3; + private static final int MODE_TLD = 4; + + private int mode = 0; + + public int run(String[] args) throws Exception { + if (args.length < 3) { + System.err.println("Usage: DomainStatistics inputDirs outDir mode [numOfReducer]"); + + System.err.println("\tinputDirs\tComma separated list of crawldb input directories"); + 
System.err.println("\t\t\tE.g.: crawl/crawldb/"); + + System.err.println("\toutDir\t\tOutput directory where results should be dumped"); + + System.err.println("\tmode\t\tSet statistics gathering mode"); + System.err.println("\t\t\t\thost\tGather statistics by host"); + System.err.println("\t\t\t\tdomain\tGather statistics by domain"); + System.err.println("\t\t\t\tsuffix\tGather statistics by suffix"); + System.err.println("\t\t\t\ttld\tGather statistics by top level directory"); + + System.err.println("\t[numOfReducers]\tOptional number of reduce jobs to use. Defaults to 1."); + + return 1; + } + String inputDir = args[0]; + String outputDir = args[1]; + int numOfReducers = 1; + + if (args.length > 3) { + numOfReducers = Integer.parseInt(args[3]); + } + + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + long start = System.currentTimeMillis(); + LOG.info("DomainStatistics: starting at " + sdf.format(start)); + + int mode = 0; + String jobName = "DomainStatistics"; + if (args[2].equals("host")) { + jobName = "Host statistics"; + mode = MODE_HOST; + } else if (args[2].equals("domain")) { + jobName = "Domain statistics"; + mode = MODE_DOMAIN; + } else if (args[2].equals("suffix")) { + jobName = "Suffix statistics"; + mode = MODE_SUFFIX; + } else if (args[2].equals("tld")) { + jobName = "TLD statistics"; + mode = MODE_TLD; + } + + Configuration conf = getConf(); + conf.setInt("domain.statistics.mode", mode); + conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + + Job job = Job.getInstance(conf, jobName); + job.setJarByClass(DomainStatistics.class); + + String[] inputDirsSpecs = inputDir.split(","); + for (int i = 0; i < inputDirsSpecs.length; i++) { + File completeInputPath = new File(new File(inputDirsSpecs[i]), "current"); + FileInputFormat.addInputPath(job, new Path(completeInputPath.toString())); + } + + job.setInputFormatClass(SequenceFileInputFormat.class); + FileOutputFormat.setOutputPath(job, new 
Path(outputDir)); + job.setOutputFormatClass(TextOutputFormat.class); + + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(LongWritable.class); + + job.setMapperClass(DomainStatisticsMapper.class); + job.setReducerClass(DomainStatisticsReducer.class); + job.setCombinerClass(DomainStatisticsCombiner.class); + job.setNumReduceTasks(numOfReducers); + + try { + job.waitForCompletion(true); + } catch (Exception e) { + throw e; + } + + long end = System.currentTimeMillis(); + LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); + return 0; + } + + static class DomainStatisticsMapper extends + Mapper<Text, CrawlDatum, Text, LongWritable> { + int mode = 0; + + public void setup(Context context) { + mode = context.getConfiguration().getInt("domain.statistics.mode", + MODE_DOMAIN); + } + + public void map(Text urlText, CrawlDatum datum, Context context) + throws IOException, InterruptedException { + + if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED + || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { + + try { + URL url = new URL(urlText.toString()); + String out = null; + switch (mode) { + case MODE_HOST: + out = url.getHost(); + break; + case MODE_DOMAIN: + out = URLUtil.getDomainName(url); + break; + case MODE_SUFFIX: + out = URLUtil.getDomainSuffix(url).getDomain(); + break; + case MODE_TLD: + out = URLUtil.getTopLevelDomainName(url); + break; + } + if (out.trim().equals("")) { + LOG.info("url : " + url); + context.getCounter(MyCounter.EMPTY_RESULT).increment(1); + } + + context.write(new Text(out), new LongWritable(1)); + } catch (Exception ex) { + } + + context.getCounter(MyCounter.FETCHED).increment(1); + context.write(FETCHED_TEXT, new LongWritable(1)); + } else { + context.getCounter(MyCounter.NOT_FETCHED).increment(1); + context.write(NOT_FETCHED_TEXT, new LongWritable(1)); + } + } + } + 
+ static class DomainStatisticsReducer extends + Reducer<Text, LongWritable, LongWritable, Text> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { + long total = 0; + + for (LongWritable val : values) { + total += val.get(); + } + + context.write(new LongWritable(total), key); + } + } + + public static class DomainStatisticsCombiner extends + Reducer<Text, LongWritable, Text, LongWritable> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { + long total = 0; + + for (LongWritable val : values) { + total += val.get(); + } + context.write(key, new LongWritable(total)); + } + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(NutchConfiguration.create(), new DomainStatistics(), args); + } + +}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
new file mode 100644
index 0000000..d40ebe9
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * This class represents the last part of the host name, which is operated by
+ * authorities, not individuals. This information is needed to find the domain
+ * name of a host. The domain name of a host is defined to be the last part
+ * before the domain suffix, without subdomain names. As an example the domain
+ * name of <br>
+ * <code> http://lucene.apache.org/
+ * </code><br>
+ * is <code> apache.org</code> <br>
+ * This class holds three fields: the <strong>domain</strong> field represents
+ * the suffix (such as "co.uk"), <strong>boost</strong> is a float for boosting
+ * the score of urls with this suffix, and the <strong>status</strong> field
+ * represents the domain's status. For more information please see
+ * conf/domain-suffixes.xml.
+ *
+ * @author Enis Soztutar <[email protected]>
+ * @see TopLevelDomain
+ */
+public class DomainSuffix {
+
+  /**
+   * Enumeration of the status of the tld. Please see domain-suffixes.xml.
+   */
+  public enum Status {
+    INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+  };
+
+  // The suffix string itself, e.g. "co.uk".
+  private String domain;
+  // Status of the suffix.
+  private Status status;
+  // Score boost for urls with this suffix.
+  private float boost;
+
+  /** Boost used by {@link #DomainSuffix(String)}. */
+  public static final float DEFAULT_BOOST = 1.0f;
+  /** Status used by {@link #DomainSuffix(String)}. */
+  public static final Status DEFAULT_STATUS = Status.IN_USE;
+
+  /**
+   * Creates a suffix with an explicit status and boost.
+   *
+   * @param domain
+   *          the suffix string
+   * @param status
+   *          status of the suffix
+   * @param boost
+   *          score boost for urls with this suffix
+   */
+  public DomainSuffix(String domain, Status status, float boost) {
+    this.domain = domain;
+    this.status = status;
+    this.boost = boost;
+  }
+
+  /**
+   * Creates a suffix with {@link #DEFAULT_STATUS} and {@link #DEFAULT_BOOST}.
+   *
+   * @param domain
+   *          the suffix string
+   */
+  public DomainSuffix(String domain) {
+    this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
+  }
+
+  /** @return the suffix string */
+  public String getDomain() {
+    return domain;
+  }
+
+  /** @return the status of this suffix */
+  public Status getStatus() {
+    return status;
+  }
+
+  /** @return the score boost of this suffix */
+  public float getBoost() {
+    return boost;
+  }
+
+  /** @return the suffix string, same as {@link #getDomain()} */
+  @Override
+  public String toString() {
+    return domain;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
new file mode 100644
index 0000000..765457e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.util.domain; + +import java.io.InputStream; +import java.util.HashMap; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.util.StringUtils; + +/** + * Storage class for <code>DomainSuffix</code> objects Note: this class is + * singleton + * + * @author Enis Soztutar <[email protected]> + */ +public class DomainSuffixes { + private static final Logger LOG = LoggerFactory + .getLogger(DomainSuffixes.class); + + private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>(); + + private static DomainSuffixes instance; + + /** private ctor */ + private DomainSuffixes() { + String file = "domain-suffixes.xml"; + InputStream input = this.getClass().getClassLoader() + .getResourceAsStream(file); + try { + new DomainSuffixesReader().read(this, input); + } catch (Exception ex) { + LOG.warn(StringUtils.stringifyException(ex)); + } + } + + /** + * Singleton instance, lazy instantination + * + * @return returns the domain suffix instance + */ + public static DomainSuffixes getInstance() { + if (instance == null) { + instance = new DomainSuffixes(); + } + return instance; + } + + void addDomainSuffix(DomainSuffix tld) { + 
domains.put(tld.getDomain(), tld); + } + + /** return whether the extension is a registered domain entry */ + public boolean isDomainSuffix(String extension) { + return domains.containsKey(extension); + } + + /** + * Return the {@link DomainSuffix} object for the extension, if extension is a + * top level domain returned object will be an instance of + * {@link TopLevelDomain} + * + * @param extension + * of the domain + */ + public DomainSuffix get(String extension) { + return domains.get(extension); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java new file mode 100644 index 0000000..a2a60e2 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/DomainSuffixesReader.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.util.domain; + +import java.io.IOException; +import java.io.InputStream; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.apache.hadoop.util.StringUtils; +import org.apache.nutch.util.domain.DomainSuffix.Status; +import org.apache.nutch.util.domain.TopLevelDomain.Type; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +/** + * For parsing xml files containing domain suffix definitions. Parsed xml files + * should validate against <code>domain-suffixes.xsd</code> + * + * @author Enis Soztutar <[email protected]> + */ +class DomainSuffixesReader { + + private static final Logger LOG = LoggerFactory + .getLogger(DomainSuffixesReader.class); + + void read(DomainSuffixes tldEntries, InputStream input) throws IOException { + try { + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setIgnoringComments(true); + DocumentBuilder builder = factory.newDocumentBuilder(); + Document document = builder.parse(new InputSource(input)); + + Element root = document.getDocumentElement(); + + if (root != null && root.getTagName().equals("domains")) { + + Element tlds = (Element) root.getElementsByTagName("tlds").item(0); + Element suffixes = (Element) root.getElementsByTagName("suffixes") + .item(0); + + // read tlds + readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds") + .item(0)); + readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds") + .item(0)); + readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds") + .item(0)); + + readSuffixes(tldEntries, suffixes); + } else { + throw new IOException("xml file is not valid"); + } + } catch (ParserConfigurationException ex) { + 
LOG.warn(StringUtils.stringifyException(ex)); + throw new IOException(ex.getMessage()); + } catch (SAXException ex) { + LOG.warn(StringUtils.stringifyException(ex)); + throw new IOException(ex.getMessage()); + } + } + + void readITLDs(DomainSuffixes tldEntries, Element el) { + NodeList children = el.getElementsByTagName("tld"); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), + Type.INFRASTRUCTURE)); + } + } + + void readGTLDs(DomainSuffixes tldEntries, Element el) { + NodeList children = el.getElementsByTagName("tld"); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), + Type.GENERIC)); + } + } + + void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException { + NodeList children = el.getElementsByTagName("tld"); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i))); + } + } + + TopLevelDomain readGTLD(Element el, Type type) { + String domain = el.getAttribute("domain"); + Status status = readStatus(el); + float boost = readBoost(el); + return new TopLevelDomain(domain, type, status, boost); + } + + TopLevelDomain readCCTLD(Element el) throws IOException { + String domain = el.getAttribute("domain"); + Status status = readStatus(el); + float boost = readBoost(el); + String countryName = readCountryName(el); + return new TopLevelDomain(domain, status, boost, countryName); + } + + /** read optional field status */ + Status readStatus(Element el) { + NodeList list = el.getElementsByTagName("status"); + if (list == null || list.getLength() == 0) + return DomainSuffix.DEFAULT_STATUS; + return Status.valueOf(list.item(0).getFirstChild().getNodeValue()); + } + + /** read optional field boost */ + float readBoost(Element el) { + NodeList list = el.getElementsByTagName("boost"); + if (list == null || list.getLength() == 0) + return DomainSuffix.DEFAULT_BOOST; + 
return Float.parseFloat(list.item(0).getFirstChild().getNodeValue()); + } + + /** + * read field countryname + */ + String readCountryName(Element el) throws IOException { + NodeList list = el.getElementsByTagName("country"); + if (list == null || list.getLength() == 0) + throw new IOException("Country name should be given"); + return list.item(0).getNodeValue(); + } + + void readSuffixes(DomainSuffixes tldEntries, Element el) { + NodeList children = el.getElementsByTagName("suffix"); + for (int i = 0; i < children.getLength(); i++) { + tldEntries.addDomainSuffix(readSuffix((Element) children.item(i))); + } + } + + DomainSuffix readSuffix(Element el) { + String domain = el.getAttribute("domain"); + Status status = readStatus(el); + float boost = readBoost(el); + return new DomainSuffix(domain, status, boost); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java new file mode 100644 index 0000000..f442d1f --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/TopLevelDomain.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
+ * domain name; that is, the letters which follow the final dot of any domain
+ * name. For example, in the domain name <code>www.website.com</code>, the
+ * top-level domain is <code>com</code>.
+ *
+ * @author Enis Soztutar <[email protected]>
+ *
+ * @see <a href="http://www.iana.org/"> iana.org</a>
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
+ *      Top-level_domain</a>
+ */
+public class TopLevelDomain extends DomainSuffix {
+
+  /** Kind of top-level domain. */
+  public enum Type {
+    INFRASTRUCTURE, GENERIC, COUNTRY
+  };
+
+  private Type type;
+  // Only set by the country code constructor; null otherwise.
+  private String countryName = null;
+
+  /**
+   * Creates a TLD of the given type; the country name stays
+   * <code>null</code>.
+   */
+  public TopLevelDomain(String domain, Type type, Status status, float boost) {
+    super(domain, status, boost);
+    this.type = type;
+  }
+
+  /**
+   * Creates a country code TLD: the type is fixed to {@link Type#COUNTRY} and
+   * the given country name is stored.
+   */
+  public TopLevelDomain(String domain, Status status, float boost,
+      String countryName) {
+    super(domain, status, boost);
+    this.type = Type.COUNTRY;
+    this.countryName = countryName;
+  }
+
+  /** @return the type of this TLD */
+  public Type getType() {
+    return type;
+  }
+
+  /**
+   * Returns the country name if TLD is Country Code TLD
+   *
+   * @return country name or null
+   */
+  public String getCountryName() {
+    return countryName;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html
b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html new file mode 100644 index 0000000..49e0e6a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/domain/package.html @@ -0,0 +1,14 @@ +<html> +<body> +<h2>Classes for domain name analysis.</h2> + +for information please refer to following urls : +<ul> +<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li> +<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li> +<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li> +<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li> +</ul> + +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/util/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/util/package-info.java b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java new file mode 100644 index 0000000..053dbc1 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/util/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Miscellaneous utility classes. + */ +package org.apache.nutch.util; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java new file mode 100644 index 0000000..6fd2396 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.java @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.nutch.webui;
+
+import org.apache.nutch.webui.pages.DashboardPage;
+import org.apache.nutch.webui.pages.assets.NutchUiCssReference;
+import org.apache.wicket.markup.html.WebPage;
+import org.apache.wicket.protocol.http.WebApplication;
+import org.apache.wicket.spring.injection.annot.SpringComponentInjector;
+import org.springframework.beans.BeansException;
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.ApplicationContextAware;
+import org.springframework.stereotype.Component;
+
+import de.agilecoders.wicket.core.Bootstrap;
+import de.agilecoders.wicket.core.markup.html.themes.bootstrap.BootstrapCssReference;
+import de.agilecoders.wicket.core.settings.BootstrapSettings;
+import de.agilecoders.wicket.core.settings.SingleThemeProvider;
+import de.agilecoders.wicket.core.settings.Theme;
+import de.agilecoders.wicket.extensions.markup.html.bootstrap.icon.FontAwesomeCssReference;
+
+/**
+ * Wicket application of the Nutch web UI. It uses {@link DashboardPage} as
+ * home page, installs the Bootstrap integration with a single theme and
+ * registers a {@link SpringComponentInjector} for the stored application
+ * context.
+ */
+@Component
+public class NutchUiApplication extends WebApplication implements
+    ApplicationContextAware {
+  private static final String THEME_NAME = "bootstrap";
+  // Assigned in setApplicationContext() and read in init().
+  // NOTE(review): init() presumably runs after the context has been set -
+  // confirm against the container start-up order.
+  private ApplicationContext context;
+
+  /**
+   * @see org.apache.wicket.Application#getHomePage()
+   */
+  @Override
+  public Class<? extends WebPage> getHomePage() {
+    return DashboardPage.class;
+  }
+
+  /**
+   * Installs Bootstrap with fresh settings, configures the theme on those
+   * settings and adds the Spring component injector.
+   *
+   * @see org.apache.wicket.Application#init()
+   */
+  @Override
+  public void init() {
+    super.init();
+    BootstrapSettings settings = new BootstrapSettings();
+    Bootstrap.install(this, settings);
+    configureTheme(settings);
+
+    getComponentInstantiationListeners().add(
+        new SpringComponentInjector(this, context));
+  }
+
+  /**
+   * Sets a single-theme provider whose theme is built from the Bootstrap,
+   * Font Awesome and Nutch UI css references.
+   */
+  private void configureTheme(BootstrapSettings settings) {
+    Theme theme = new Theme(THEME_NAME, BootstrapCssReference.instance(),
+        FontAwesomeCssReference.instance(), NutchUiCssReference.instance());
+    settings.setThemeProvider(new SingleThemeProvider(theme));
+  }
+
+  /** Stores the application context that init() hands to the injector. */
+  @Override
+  public void setApplicationContext(ApplicationContext applicationContext)
+      throws BeansException {
+    this.context = applicationContext;
+  }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
new file mode 100644
index 0000000..4c62939
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiApplication.properties
@@ -0,0 +1,63 @@
+#############################################################################
+#Licensed to the Apache Software Foundation (ASF) under one or more
+#contributor license agreements. See the NOTICE file distributed with
+#this work for additional information regarding copyright ownership.
+#The ASF licenses this file to You under the Apache License, Version 2.0
+#(the "License"); you may not use this file except in compliance with
+#the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +############################################################################# + +navbar.menu.dashboard = Dashboard +navbar.menu.statistics = Statistics +navbar.menu.instances = Instances +navbar.menu.settings = Settings +navbar.menu.crawls = Crawls +navbar.menu.scheduling = Scheduling +navbar.menu.search = Search +navbar.menu.url = URLs upload +navbar.menu.seedLists = Seed lists + +page.header.seedList = Seed list + +navbar.userMenu.settings = Settings +navbar.userMenu.logout = Log out + +menu.settings=Settings +menu.instances=Instances + +connected=Connected +disconnected=Disconnected + +##ENUMS +ConnectionStatus.CONNECTING=Connecting +ConnectionStatus.CONNECTED=Connected +ConnectionStatus.DISCONNECTED=Disconnected + +CrawlStatus.NEW=New +CrawlStatus.ERROR=Error +CrawlStatus.CRAWLING=Crawling +CrawlStatus.FINISHED=Finished + +instances=Instances +instances.header.name=Instance name +instances.header.hostname=Hostname +instances.header.status=Status +instances.header.username=Username +instances.label.name=Instance name +instances.label.hostname=Hostname +instances.label.port=Port +instances.label.username=Username +instances.label.password=Password +instances.buttons.addInstance=Add instance + +settings=Settings +settings.header.name = Name +settings.header.value = Value \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java ---------------------------------------------------------------------- diff --git 
a/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java new file mode 100644 index 0000000..d534b8f --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/NutchUiServer.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.hadoop.util.StringUtils; +import org.apache.wicket.protocol.http.WicketFilter; +import org.apache.wicket.spring.SpringWebApplicationFactory; +import org.mortbay.jetty.Handler; +import org.mortbay.jetty.Server; +import org.mortbay.jetty.servlet.Context; +import org.mortbay.jetty.servlet.DefaultServlet; +import org.mortbay.jetty.servlet.FilterHolder; +import org.springframework.web.context.ContextLoaderListener; +import org.springframework.web.context.WebApplicationContext; +import org.springframework.web.context.request.RequestContextListener; +import org.springframework.web.context.support.AnnotationConfigWebApplicationContext; + +public class NutchUiServer { + private static final String APP_FACTORY_NAME = SpringWebApplicationFactory.class + .getName(); + private static final String CONFIG_LOCATION = "org.apache.nutch.webui"; + private static final String CMD_PORT = "port"; + private static Integer port = 8080; + + public static void main(String[] args) throws Exception { + CommandLineParser parser = new GnuParser(); + Options options = createWebAppOptions(); + CommandLine commandLine = null; + HelpFormatter formatter = new HelpFormatter(); + try { + commandLine = parser.parse(options, args); + } catch (Exception e) { + formatter.printHelp("NutchUiServer", options, true); + StringUtils.stringifyException(e); + } + + if (commandLine.hasOption("help")) { + formatter.printHelp("NutchUiServer", options, true); + return; + } + if (commandLine.hasOption(CMD_PORT)) { + port = Integer.parseInt(commandLine.getOptionValue(CMD_PORT)); + } + startServer(); + } + + private static void startServer() throws Exception, 
InterruptedException { + Server server = new Server(port); + Context context = new Context(server, "/", Context.SESSIONS); + context.addServlet(DefaultServlet.class, "/*"); + + context.addEventListener(new ContextLoaderListener(getContext())); + context.addEventListener(new RequestContextListener()); + + WicketFilter filter = new WicketFilter(); + filter.setFilterPath("/"); + FilterHolder holder = new FilterHolder(filter); + holder.setInitParameter("applicationFactoryClassName", APP_FACTORY_NAME); + context.addFilter(holder, "/*", Handler.DEFAULT); + + server.setHandler(context); + server.start(); + server.join(); + } + + private static WebApplicationContext getContext() { + AnnotationConfigWebApplicationContext context = new AnnotationConfigWebApplicationContext(); + context.setConfigLocation(CONFIG_LOCATION); + return context; + } + + private static Options createWebAppOptions() { + Options options = new Options(); + Option helpOpt = new Option("h", "help", false, "show this help message"); + OptionBuilder.withDescription("Port to run the WebApplication on."); + OptionBuilder.hasOptionalArg(); + OptionBuilder.withArgName("port number"); + options.addOption(OptionBuilder.create(CMD_PORT)); + options.addOption(helpOpt); + return options; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java new file mode 100644 index 0000000..3f8887d --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClient.java @@ -0,0 +1,49 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.client; + +import java.util.Map; + +import org.apache.nutch.webui.client.model.ConnectionStatus; +import org.apache.nutch.webui.client.model.JobConfig; +import org.apache.nutch.webui.client.model.JobInfo; +import org.apache.nutch.webui.client.model.NutchStatus; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.model.SeedList; + +public interface NutchClient { + + public NutchInstance getNutchInstance(); + + public NutchStatus getNutchStatus(); + + public ConnectionStatus getConnectionStatus(); + + public String executeJob(JobConfig jobConfig); + + public JobInfo getJobInfo(String jobId); + + public Map<String, String> getNutchConfig(String config); + + /** + * Create seed list and return seed directory location + * + * @param seedList + * @return + */ + public String createSeed(SeedList seedList); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java new file mode 100644 index 
0000000..32da00e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/NutchClientFactory.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.client; + +import java.util.concurrent.ExecutionException; + +import org.apache.nutch.webui.client.impl.NutchClientImpl; +import org.apache.nutch.webui.model.NutchInstance; +import org.springframework.stereotype.Component; + +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; + +@Component +public class NutchClientFactory { + private LoadingCache<NutchInstance, NutchClient> cache; + + public NutchClientFactory() { + cache = CacheBuilder.newBuilder().build(new NutchClientCacheLoader()); + } + + public NutchClient getClient(NutchInstance instance) { + try { + return cache.get(instance); + } catch (ExecutionException e) { + throw new IllegalStateException(e); + } + } + + private static class NutchClientCacheLoader extends + CacheLoader<NutchInstance, NutchClient> { + @Override + public NutchClient load(NutchInstance key) throws Exception { + return new NutchClientImpl(key); + } + } +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java new file mode 100644 index 0000000..2482c06 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.impl; + +import java.util.List; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.nutch.webui.client.model.Crawl; +import org.apache.nutch.webui.client.model.JobInfo; +import org.apache.nutch.webui.client.model.JobInfo.State; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.Lists; + +/** + * This class implements crawl cycle as in crawl script + * + * @author feodor + * + */ +public class CrawlingCycle { + private Logger log = LoggerFactory.getLogger(CrawlingCycle.class); + + private CrawlingCycleListener listener; + private RemoteCommandExecutor executor; + private Crawl crawl; + + private List<RemoteCommand> remoteCommands; + private List<RemoteCommand> executedCommands = Lists.newArrayList(); + + public CrawlingCycle(CrawlingCycleListener listener, + RemoteCommandExecutor executor, Crawl crawl, List<RemoteCommand> commands) { + this.listener = listener; + this.executor = executor; + this.crawl = crawl; + this.remoteCommands = commands; + } + + public synchronized void executeCrawlCycle() { + listener.crawlingStarted(crawl); + + for (RemoteCommand command : remoteCommands) { + JobInfo jobInfo = executor.executeRemoteJob(command); + command.setJobInfo(jobInfo); + + log.info("Executed remote command data: {}", command); + + if (jobInfo.getState() == State.FAILED) { + listener.onCrawlError(crawl, jobInfo.getMsg()); + return; + } + + executedCommands.add(command); + listener.commandExecuted(crawl, command, calculateProgress()); + } + listener.crawlingFinished(crawl); + } + + private int calculateProgress() { + if (CollectionUtils.isEmpty(remoteCommands)) { + return 0; + } + return (int) ((float) executedCommands.size() + / (float) remoteCommands.size() * 100); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java 
---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java new file mode 100644 index 0000000..c2abde5 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.impl; + +import org.apache.nutch.webui.client.model.Crawl; + +public interface CrawlingCycleListener { + + void crawlingStarted(Crawl crawl); + + void onCrawlError(Crawl crawl, String msg); + + void commandExecuted(Crawl crawl, RemoteCommand command, int progress); + + void crawlingFinished(Crawl crawl); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java new file mode 100644 index 0000000..1a577f9 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.impl; + +import static javax.ws.rs.core.MediaType.APPLICATION_JSON; + +import java.util.Map; + +import org.apache.nutch.webui.client.NutchClient; +import org.apache.nutch.webui.client.model.ConnectionStatus; +import org.apache.nutch.webui.client.model.JobConfig; +import org.apache.nutch.webui.client.model.JobInfo; +import org.apache.nutch.webui.client.model.NutchStatus; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.model.SeedList; + +import com.sun.jersey.api.client.Client; +import com.sun.jersey.api.client.WebResource; +import com.sun.jersey.api.client.config.ClientConfig; +import com.sun.jersey.api.client.config.DefaultClientConfig; +import com.sun.jersey.api.json.JSONConfiguration; + +public class NutchClientImpl implements NutchClient { + private Client client; + private WebResource nutchResource; + private NutchInstance instance; + + public NutchClientImpl(NutchInstance instance) { + this.instance = instance; + createClient(); + } + + public void createClient() { + ClientConfig clientConfig = new DefaultClientConfig(); + clientConfig.getFeatures() + .put(JSONConfiguration.FEATURE_POJO_MAPPING, true); + this.client = Client.create(clientConfig); + this.nutchResource = client.resource(instance.getUrl()); + } + + @Override + public NutchStatus getNutchStatus() { + return nutchResource.path("/admin").type(APPLICATION_JSON) + .get(NutchStatus.class); + } + + @Override + public ConnectionStatus getConnectionStatus() { + + getNutchStatus(); + return ConnectionStatus.CONNECTED; + // TODO implement disconnected status + } + + @Override + public String executeJob(JobConfig jobConfig) { + JobInfo jobInfo = nutchResource.path("/job/create").type(APPLICATION_JSON) + .post(JobInfo.class, jobConfig); + return jobInfo.getId(); + } + + @Override + public JobInfo getJobInfo(String jobId) { + return nutchResource.path("/job/" + jobId).type(APPLICATION_JSON) + .get(JobInfo.class); + } + + @Override + 
public NutchInstance getNutchInstance() { + return instance; + } + + @SuppressWarnings("unchecked") + @Override + public Map<String, String> getNutchConfig(String config) { + return nutchResource.path("/config/" + config).type(APPLICATION_JSON) + .get(Map.class); + } + + @Override + public String createSeed(SeedList seedList) { + return nutchResource.path("/seed/create").type(APPLICATION_JSON) + .post(String.class, seedList); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java new file mode 100644 index 0000000..ea19a8a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommand.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.impl; + +import java.io.Serializable; +import java.text.MessageFormat; + +import org.apache.commons.lang3.StringUtils; +import org.apache.nutch.webui.client.model.JobConfig; +import org.apache.nutch.webui.client.model.JobInfo; +import org.joda.time.Duration; + +public class RemoteCommand implements Serializable { + private JobConfig jobConfig; + private JobInfo jobInfo = new JobInfo(); + private Duration timeout; + + /** + * Use {@link RemoteCommandBuilder} instead + */ + @SuppressWarnings("unused") + private RemoteCommand() { + } + + public RemoteCommand(JobConfig jobConfig) { + this.jobConfig = jobConfig; + } + + public JobConfig getJobConfig() { + return jobConfig; + } + + public void setJobConfig(JobConfig jobConfig) { + this.jobConfig = jobConfig; + } + + public JobInfo getJobInfo() { + return jobInfo; + } + + public void setJobInfo(JobInfo jobInfo) { + this.jobInfo = jobInfo; + } + + public Duration getTimeout() { + return timeout; + } + + public void setTimeout(Duration timeout) { + this.timeout = timeout; + } + + @Override + public String toString() { + String statusInfo = StringUtils.EMPTY; + if (jobInfo != null) { + statusInfo = MessageFormat.format("{0}", jobInfo.getState()); + } + return MessageFormat.format("{0} status: {1}", jobConfig.getType(), + statusInfo); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java new file mode 100644 index 0000000..d6b1767 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.client.impl; + +import org.apache.nutch.webui.client.model.JobConfig; +import org.apache.nutch.webui.client.model.JobInfo.JobType; +import org.joda.time.Duration; + +public class RemoteCommandBuilder { + private JobConfig jobConfig = new JobConfig(); + private Duration timeout = Duration.standardSeconds(10); + + private RemoteCommandBuilder() { + } + + public static RemoteCommandBuilder instance(JobType jobType) { + return new RemoteCommandBuilder().withJobType(jobType); + } + + public RemoteCommandBuilder withJobType(JobType jobType) { + jobConfig.setType(jobType); + return this; + } + + public RemoteCommandBuilder withConfigId(String configId) { + jobConfig.setConfId(configId); + return this; + } + + public RemoteCommandBuilder withCrawlId(String crawlId) { + jobConfig.setCrawlId(crawlId); + return this; + } + + public RemoteCommandBuilder withArgument(String key, String value) { + jobConfig.setArgument(key, value); + return this; + } + + public RemoteCommandBuilder withTimeout(Duration timeout) { + this.timeout = timeout; + return this; + } + + public RemoteCommand build() { + RemoteCommand remoteCommand = new RemoteCommand(jobConfig); + remoteCommand.setTimeout(timeout); + return remoteCommand; + } +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java new file mode 100644 index 0000000..e1eefc2 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java @@ -0,0 +1,110 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.impl; + +import static com.google.common.base.Preconditions.checkState; + +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.nutch.webui.client.NutchClient; +import org.apache.nutch.webui.client.model.JobInfo; +import org.apache.nutch.webui.client.model.JobInfo.State; +import org.joda.time.DateTimeConstants; +import org.joda.time.Duration; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class executes remote job and waits for success/failure result + * + * @author feodor + * + */ +public class RemoteCommandExecutor { + private Logger log = LoggerFactory.getLogger(RemoteCommandExecutor.class); + + private static final int DEFAULT_TIMEOUT_SEC = 60; + private Duration requestDelay = new Duration(500); + + private NutchClient client; + private ExecutorService executor; + + public RemoteCommandExecutor(NutchClient client) { + this.client = client; + this.executor = Executors.newSingleThreadExecutor(); + } + + public JobInfo executeRemoteJob(RemoteCommand command) { + try { + String jobId = client.executeJob(command.getJobConfig()); + Future<JobInfo> chekerFuture = executor + .submit(new JobStateChecker(jobId)); + return chekerFuture.get(getTimeout(command), TimeUnit.MILLISECONDS); + } catch (Exception e) { + log.error("Remote command failed", e); + JobInfo jobInfo = new JobInfo(); + jobInfo.setState(State.FAILED); + jobInfo.setMsg(ExceptionUtils.getStackTrace(e)); + return jobInfo; + } + } + + private long getTimeout(RemoteCommand command) { + if (command.getTimeout() == null) { + return DEFAULT_TIMEOUT_SEC * DateTimeConstants.MILLIS_PER_SECOND; + } + return command.getTimeout().getMillis(); + } + + public void setRequestDelay(Duration requestDelay) { + this.requestDelay = 
requestDelay; + } + + public class JobStateChecker implements Callable<JobInfo> { + + private String jobId; + + public JobStateChecker(String jobId) { + this.jobId = jobId; + } + + @Override + public JobInfo call() throws Exception { + while (!Thread.interrupted()) { + JobInfo jobInfo = client.getJobInfo(jobId); + checkState(jobInfo != null, "Cannot get job info!"); + + State state = jobInfo.getState(); + checkState(state != null, "Unknown job state!"); + + if (state == State.RUNNING || state == State.ANY || state == State.IDLE) { + Thread.sleep(requestDelay.getMillis()); + continue; + } + + return jobInfo; + } + return null; + } + + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java new file mode 100644 index 0000000..cef56a5 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java @@ -0,0 +1,97 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.client.impl; + +import java.util.List; +import java.util.UUID; + +import org.apache.nutch.webui.client.model.Crawl; +import org.apache.nutch.webui.client.model.JobInfo.JobType; +import org.joda.time.Duration; +import org.springframework.beans.factory.config.BeanDefinition; +import org.springframework.context.annotation.Scope; +import org.springframework.stereotype.Component; + +import com.google.common.collect.Lists; + +@Component +@Scope(BeanDefinition.SCOPE_PROTOTYPE) +public class RemoteCommandsBatchFactory { + + private List<RemoteCommand> remoteCommands; + private Crawl crawl; + + private String batchId; + + public List<RemoteCommand> createCommands(Crawl crawl) { + this.crawl = crawl; + this.remoteCommands = Lists.newArrayList(); + + remoteCommands.add(inject()); + for (int i = 0; i < crawl.getNumberOfRounds(); i++) { + remoteCommands.addAll(createBatchCommands()); + } + return remoteCommands; + } + + private List<RemoteCommand> createBatchCommands() { + this.batchId = UUID.randomUUID().toString(); + List<RemoteCommand> batchCommands = Lists.newArrayList(); + + batchCommands.add(createGenerateCommand()); + batchCommands.add(createFetchCommand()); + batchCommands.add(createParseCommand()); + batchCommands.add(createUpdateDbCommand()); + batchCommands.add(createIndexCommand()); + + return batchCommands; + } + + private RemoteCommand inject() { + RemoteCommandBuilder builder = RemoteCommandBuilder + .instance(JobType.INJECT).withCrawlId(crawl.getCrawlId()) + .withArgument("url_dir", crawl.getSeedDirectory()); + return builder.build(); + } + + private RemoteCommand createGenerateCommand() { + return createBuilder(JobType.GENERATE).build(); + } + + private RemoteCommand createFetchCommand() { + return createBuilder(JobType.FETCH).withTimeout( + Duration.standardSeconds(50)).build(); + } + + private RemoteCommand 
createParseCommand() { + return createBuilder(JobType.PARSE).build(); + } + + private RemoteCommand createIndexCommand() { + return createBuilder(JobType.INDEX).build(); + } + + private RemoteCommand createUpdateDbCommand() { + return createBuilder(JobType.UPDATEDB).build(); + } + + private RemoteCommandBuilder createBuilder(JobType jobType) { + return RemoteCommandBuilder.instance(jobType) + .withCrawlId(crawl.getCrawlId()).withArgument("batch", batchId); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java new file mode 100644 index 0000000..d834612 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/ConnectionStatus.java @@ -0,0 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.model; + +public enum ConnectionStatus { + CONNECTING, CONNECTED, DISCONNECTED; +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java new file mode 100644 index 0000000..6057f7f --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/Crawl.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.webui.client.model; + +import java.io.Serializable; + +import javax.persistence.Column; +import javax.persistence.Entity; +import javax.persistence.GeneratedValue; +import javax.persistence.Id; + +import org.apache.nutch.webui.model.SeedList; + +import com.j256.ormlite.field.DatabaseField; + +@Entity +public class Crawl implements Serializable { + public enum CrawlStatus { + NEW, CRAWLING, FINISHED, ERROR + } + + @Id + @GeneratedValue + private Long id; + + @Column + private String crawlId; + + @Column + private String crawlName; + + @Column + private CrawlStatus status = CrawlStatus.NEW; + + @Column + private Integer numberOfRounds = 1; + + @Column + @DatabaseField(foreign = true, foreignAutoRefresh = true) + private SeedList seedList; + + @Column + private String seedDirectory; + + @Column + private int progress; + + public Integer getNumberOfRounds() { + return numberOfRounds; + } + + public void setNumberOfRounds(Integer numberOfRounds) { + this.numberOfRounds = numberOfRounds; + } + + public String getCrawlId() { + return crawlId; + } + + public void setCrawlId(String crawlId) { + this.crawlId = crawlId; + } + + public CrawlStatus getStatus() { + return status; + } + + public void setStatus(CrawlStatus status) { + this.status = status; + } + + public String getCrawlName() { + return crawlName; + } + + public void setCrawlName(String crawlName) { + this.crawlName = crawlName; + } + + public SeedList getSeedList() { + return seedList; + } + + public void setSeedList(SeedList seedList) { + this.seedList = seedList; + } + + public Long getId() { + return id; + } + + public void setId(Long id) { + this.id = id; + } + + public String getSeedDirectory() { + return seedDirectory; + } + + public void setSeedDirectory(String seedDirectory) { + this.seedDirectory = seedDirectory; + } + + public int getProgress() { + return progress; + } + + public void setProgress(int progress) { + this.progress = progress; + } + +} 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java new file mode 100644 index 0000000..80df279 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobConfig.java @@ -0,0 +1,77 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.nutch.webui.client.model; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Map; + +import org.apache.nutch.webui.client.model.JobInfo.JobType; + +import com.google.common.collect.Maps; + +public class JobConfig implements Serializable { + private String crawlId; + private JobType type; + private String confId = "default"; + private String jobClassName; + private Map<String, Object> args = Maps.newHashMap(); + + public void setArgument(String key, String value) { + args.put(key, value); + } + + public String getCrawlId() { + return crawlId; + } + + public void setCrawlId(String crawlId) { + this.crawlId = crawlId; + } + + public JobType getType() { + return type; + } + + public void setType(JobType type) { + this.type = type; + } + + public String getConfId() { + return confId; + } + + public void setConfId(String confId) { + this.confId = confId; + } + + public Map<String, Object> getArgs() { + return Collections.unmodifiableMap(args); + } + + public void setArgs(Map<String, Object> args) { + this.args = args; + } + + public String getJobClassName() { + return jobClassName; + } + + public void setJobClassName(String jobClass) { + this.jobClassName = jobClass; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java new file mode 100644 index 0000000..312118a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/JobInfo.java @@ -0,0 +1,104 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.nutch.webui.client.model; + +import java.io.Serializable; +import java.util.Map; + +public class JobInfo implements Serializable { + public static enum JobType { + INJECT, GENERATE, FETCH, PARSE, UPDATEDB, INDEX, READDB, CLASS + }; + + public static enum State { + IDLE, RUNNING, FINISHED, FAILED, KILLED, STOPPING, KILLING, ANY + }; + + private String id; + private String type; + private String confId; + private Map<String, Object> args; + private Map<String, Object> result; + private State state; + private String msg; + private String crawlId; + + public String getMsg() { + return msg; + } + + public void setMsg(String msg) { + this.msg = msg; + } + + public State getState() { + return state; + } + + public void setState(State state) { + this.state = state; + } + + public Map<String, Object> getResult() { + return result; + } + + public void setResult(Map<String, Object> result) { + this.result = result; + } + + public Map<String, Object> getArgs() { + return args; + } + + public void setArgs(Map<String, Object> args) { + this.args = args; + } + + public String getConfId() { + return confId; + } + + public void setConfId(String confId) { + this.confId 
= confId; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getCrawlId() { + return crawlId; + } + + public void setCrawlId(String crawlId) { + this.crawlId = crawlId; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java new file mode 100644 index 0000000..0c5c425 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/client/model/NutchStatus.java @@ -0,0 +1,62 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.nutch.webui.client.model; + +import java.io.Serializable; +import java.util.Collection; +import java.util.Date; +import java.util.Set; + +public class NutchStatus implements Serializable { + + private Date startDate; + private Set<String> configuration; + private Collection<JobInfo> jobs; + private Collection<JobInfo> runningJobs; + + public Date getStartDate() { + return startDate; + } + + public void setStartDate(Date startDate) { + this.startDate = startDate; + } + + public Set<String> getConfiguration() { + return configuration; + } + + public void setConfiguration(Set<String> configuration) { + this.configuration = configuration; + } + + public Collection<JobInfo> getJobs() { + return jobs; + } + + public void setJobs(Collection<JobInfo> jobs) { + this.jobs = jobs; + } + + public Collection<JobInfo> getRunningJobs() { + return runningJobs; + } + + public void setRunningJobs(Collection<JobInfo> runningJobs) { + this.runningJobs = runningJobs; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java new file mode 100644 index 0000000..09c2d6a --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomDaoFactory.java @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.config; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import com.j256.ormlite.dao.Dao; +import com.j256.ormlite.spring.DaoFactory; +import com.j256.ormlite.support.ConnectionSource; + +public class CustomDaoFactory { + private ConnectionSource connectionSource; + private List<Dao<?, ?>> registredDaos = Collections + .synchronizedList(new ArrayList<Dao<?, ?>>()); + + public CustomDaoFactory(ConnectionSource connectionSource) { + this.connectionSource = connectionSource; + } + + public <T, ID> Dao<T, ID> createDao(Class<T> clazz) { + try { + Dao<T, ID> dao = DaoFactory.createDao(connectionSource, clazz); + register(dao); + return dao; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private <T, ID> void register(Dao<T, ID> dao) { + synchronized (registredDaos) { + registredDaos.add(dao); + } + } + + public List<Dao<?, ?>> getCreatedDaos() { + synchronized (registredDaos) { + return Collections.unmodifiableList(registredDaos); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java new file mode 100644 index 0000000..9b31d73 --- /dev/null +++ 
b/nutch-core/src/main/java/org/apache/nutch/webui/config/CustomTableCreator.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.config; + +import java.sql.SQLException; +import java.util.List; + +import com.j256.ormlite.dao.BaseDaoImpl; +import com.j256.ormlite.dao.Dao; +import com.j256.ormlite.support.ConnectionSource; +import com.j256.ormlite.table.DatabaseTableConfig; +import com.j256.ormlite.table.TableUtils; + +public class CustomTableCreator { + + private ConnectionSource connectionSource; + private List<Dao<?, ?>> configuredDaos; + + public CustomTableCreator(ConnectionSource connectionSource, + List<Dao<?, ?>> configuredDaos) { + this.connectionSource = connectionSource; + this.configuredDaos = configuredDaos; + initialize(); + } + + private void initialize() { + if (configuredDaos == null) { + throw new IllegalStateException("configuredDaos was not set in " + + getClass().getSimpleName()); + } + + for (Dao<?, ?> dao : configuredDaos) { + createTableForDao(dao); + } + } + + private void createTableForDao(Dao<?, ?> dao) { + DatabaseTableConfig<?> tableConfig = getTableConfig(dao); + createTableIfNotExists(tableConfig); + } + + private DatabaseTableConfig<?> 
getTableConfig(Dao<?, ?> dao) { + Class<?> clazz = dao.getDataClass(); + DatabaseTableConfig<?> tableConfig = null; + if (dao instanceof BaseDaoImpl) { + tableConfig = ((BaseDaoImpl<?, ?>) dao).getTableConfig(); + } + if (tableConfig == null) { + return getConfigFromClass(clazz); + } + return tableConfig; + } + + private DatabaseTableConfig<?> getConfigFromClass(Class<?> clazz) { + try { + return DatabaseTableConfig.fromClass(connectionSource, clazz); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private void createTableIfNotExists(DatabaseTableConfig<?> tableConfig) { + try { + TableUtils.createTableIfNotExists(connectionSource, tableConfig); + } catch (SQLException e) { + throw new RuntimeException(e); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java new file mode 100644 index 0000000..8b76440 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.config; + +import java.util.List; + +import org.apache.nutch.webui.model.NutchInstance; + +public class NutchGuiConfiguration { + private List<NutchInstance> instances; + + public List<NutchInstance> getInstances() { + return instances; + } + + public void setInstances(List<NutchInstance> instances) { + this.instances = instances; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java new file mode 100644 index 0000000..1687cee --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/webui/config/SpringConfiguration.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.webui.config; + +import java.sql.SQLException; +import java.util.concurrent.Executor; + +import org.apache.nutch.webui.client.model.Crawl; +import org.apache.nutch.webui.model.NutchInstance; +import org.apache.nutch.webui.model.SeedList; +import org.apache.nutch.webui.model.SeedUrl; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.scheduling.annotation.AsyncConfigurer; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; + +import com.j256.ormlite.dao.Dao; +import com.j256.ormlite.db.H2DatabaseType; +import com.j256.ormlite.jdbc.JdbcConnectionSource; + +@Configuration +@EnableAsync +public class SpringConfiguration implements AsyncConfigurer { + + @Override + public Executor getAsyncExecutor() { + // TODO move magic numbers to properties file + ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); + executor.setCorePoolSize(7); + executor.setMaxPoolSize(42); + executor.setQueueCapacity(11); + executor.setThreadNamePrefix("SpringExecutor-"); + executor.initialize(); + return executor; + } + + @Bean + public JdbcConnectionSource getConnectionSource() throws SQLException { + JdbcConnectionSource source = new JdbcConnectionSource( + "jdbc:h2:~/.nutch/config", new H2DatabaseType()); + source.initialize(); + return source; + } + + @Bean + public CustomDaoFactory getDaoFactory() throws SQLException { + return new 
CustomDaoFactory(getConnectionSource()); + } + + @Bean + public Dao<NutchInstance, Long> createNutchDao() throws SQLException { + return getDaoFactory().createDao(NutchInstance.class); + } + + @Bean + public Dao<SeedList, Long> createSeedListDao() throws SQLException { + return getDaoFactory().createDao(SeedList.class); + } + + @Bean + public Dao<SeedUrl, Long> createSeedUrlDao() throws SQLException { + return getDaoFactory().createDao(SeedUrl.class); + } + + @Bean + public Dao<Crawl, Long> createCrawlDao() throws SQLException { + return getDaoFactory().createDao(Crawl.class); + } + + @Bean + public CustomTableCreator createTableCreator() throws SQLException { + return new CustomTableCreator(getConnectionSource(), getDaoFactory() + .getCreatedDaos()); + } + +}
