/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.indexer;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Reads, fetches and parses a single URL, runs the configured indexing
 * filters on it and displays the resulting fields. Only the first 100
 * characters of each field value are shown unless {@code -dumpText} is given.
 *
 * Can also run as a simple TCP service ({@code -listen <port>}): each line
 * read from a client connection is treated as a URL to check and the result
 * is written back to the client as UTF-8 text.
 *
 * Tested with e.g.
 * ./nutch org.apache.nutch.indexer.IndexingFiltersChecker
 * http://www.lemonde.fr
 *
 * @author Julien Nioche
 **/
public class IndexingFiltersChecker extends Configured implements Tool {

  protected URLNormalizers normalizers = null;
  protected boolean dumpText = false;
  protected boolean followRedirects = false;
  protected boolean keepClientCnxOpen = false;
  // used to simulate the metadata propagated from injection
  protected HashMap<String, String> metadata = new HashMap<String, String>();
  protected int tcpPort = -1;

  public static final Logger LOG = LoggerFactory
      .getLogger(IndexingFiltersChecker.class);

  public IndexingFiltersChecker() {
  }

  /**
   * Parses the command line and either checks a single URL (default) or
   * starts listening on a TCP port ({@code -listen}).
   */
  public int run(String[] args) throws Exception {
    String url = null;
    String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen <port>] [-keepClientCnxOpen]";

    if (args.length == 0) {
      System.err.println(usage);
      return -1;
    }

    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-normalize")) {
        normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
      } else if (args[i].equals("-listen")) {
        tcpPort = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-followRedirects")) {
        followRedirects = true;
      } else if (args[i].equals("-keepClientCnxOpen")) {
        keepClientCnxOpen = true;
      } else if (args[i].equals("-dumpText")) {
        dumpText = true;
      } else if (args[i].equals("-md")) {
        // metadata entry, either "key=value" or a bare "key" (null value)
        String k = null, v = null;
        String nextOne = args[++i];
        int firstEquals = nextOne.indexOf("=");
        if (firstEquals != -1) {
          k = nextOne.substring(0, firstEquals);
          v = nextOne.substring(firstEquals + 1);
        } else {
          k = nextOne;
        }
        metadata.put(k, v);
      } else if (i != args.length - 1) {
        System.err.println(usage);
        System.exit(-1);
      } else {
        // the last remaining argument is the URL to check
        url = args[i];
      }
    }

    // In listening mode?
    if (tcpPort == -1) {
      // No, just fetch and display. A URL is mandatory in this mode.
      if (url == null) {
        System.err.println(usage);
        return -1;
      }
      StringBuilder output = new StringBuilder();
      int ret = fetch(url, output);
      System.out.println(output);
      return ret;
    }

    // Listen on socket and start a worker per incoming connection
    listen();
    return 0;
  }

  /**
   * Binds a server socket on {@link #tcpPort} and serves forever, spawning
   * one {@link Worker} thread per accepted connection. Never returns
   * normally; exits the JVM if the port cannot be bound or accept fails.
   */
  protected void listen() throws Exception {
    ServerSocket server = null;

    try {
      server = new ServerSocket();
      server.bind(new InetSocketAddress(tcpPort));
      LOG.info(server.toString());
    } catch (Exception e) {
      LOG.error("Could not listen on port " + tcpPort);
      System.exit(-1);
    }

    while (true) {
      try {
        Worker worker = new Worker(server.accept());
        Thread thread = new Thread(worker);
        thread.start();
      } catch (Exception e) {
        LOG.error("Accept failed: " + tcpPort);
        System.exit(-1);
      }
    }
  }

  /**
   * Serves one client connection: reads URLs line by line and writes back
   * the result of {@link #fetch(String, StringBuilder)} as UTF-8.
   */
  private class Worker implements Runnable {
    private final Socket client;

    Worker(Socket client) {
      this.client = client;
      LOG.info(client.toString());
    }

    public void run() {
      try {
        // Read with an explicit charset so input decoding matches the UTF-8
        // used for the response. The reader is created once per connection:
        // creating one per request could silently discard buffered bytes.
        BufferedReader in = new BufferedReader(new InputStreamReader(
            client.getInputStream(), Charset.forName("UTF-8")));
        if (keepClientCnxOpen) {
          // keep connection open until the client disconnects
          while (readWrite(in)) {
          }
        } else {
          readWrite(in);
        }
      } catch (Exception e) {
        // A failure on one connection must not take down the whole server
        // (the previous implementation called System.exit here).
        LOG.error("in or out failed: " + e);
      } finally {
        try { // always release the socket, in both modes
          client.close();
        } catch (Exception e) {
          LOG.error(e.toString());
        }
      }
    }

    /**
     * Handles one request line.
     *
     * @return true if the connection is still usable, false on client
     *         disconnect or I/O error
     */
    protected boolean readWrite(BufferedReader in) {
      try {
        String line = in.readLine();
        if (line == null) {
          // end of stream: the client closed the connection
          return false;
        }
        StringBuilder output = new StringBuilder();
        fetch(line, output);

        client.getOutputStream().write(
            output.toString().getBytes(Charset.forName("UTF-8")));
        client.getOutputStream().flush();
        return true;
      } catch (Exception e) {
        LOG.error("Read/Write failed: " + e);
        return false;
      }
    }
  }

  /**
   * Fetches, parses and runs the indexing filters on a single URL, appending
   * a human-readable report to {@code output}.
   *
   * @param url the URL to check (normalized first if -normalize was given)
   * @param output receives the field dump or an error message
   * @return 0 on success or benign failure (message in output), -1 on
   *         missing content type or missing parse
   */
  protected int fetch(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
      url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }

    LOG.info("fetching: " + url);

    CrawlDatum datum = new CrawlDatum();

    // simulate the metadata a real crawl would have propagated from injection
    for (Map.Entry<String, String> entry : metadata.entrySet()) {
      String value = entry.getValue();
      if (value == null)
        value = "";
      datum.getMetaData().put(new Text(entry.getKey()), new Text(value));
    }

    IndexingFilters indexers = new IndexingFilters(getConf());

    int maxRedirects = 3;

    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    Text turl = new Text(url);

    // Following redirects and not reached maxRedirects?
    while (!protocolOutput.getStatus().isSuccess() && followRedirects
        && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
      // the redirect target is the first status argument
      String[] stuff = protocolOutput.getStatus().getArgs();
      url = stuff[0];

      if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
      }

      turl.set(url);

      // try again
      protocolOutput = getProtocolOutput(url, datum);
      maxRedirects--;
    }

    if (!protocolOutput.getStatus().isSuccess()) {
      output.append("Fetch failed with protocol status: "
          + protocolOutput.getStatus() + "\n");
      return 0;
    }

    Content content = protocolOutput.getContent();

    if (content == null) {
      output.append("No content for " + url + "\n");
      return 0;
    }

    String contentType = content.getContentType();

    if (contentType == null) {
      return -1;
    }

    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE),
        new Text(contentType));

    if (ParseSegment.isTruncated(content)) {
      LOG.warn("Content is truncated, parse may fail!");
    }

    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
      scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
      LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }

    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);

    ParseResult parseResult = new ParseUtil(getConf()).parse(content);

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);

    // inlinks are not simulated by this checker
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
      LOG.error("Failed to get parse from parse result");
      LOG.error("Available parses in parse result (by URL key):");
      for (Map.Entry<Text, Parse> entry : parseResult) {
        LOG.error("  " + entry.getKey());
      }
      LOG.error("Parse result does not contain a parse for URL to be checked:");
      LOG.error("  " + urlText);
      return -1;
    }

    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
        content, parse);
    parse.getData().getContentMeta()
        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);

    // call the scoring filters
    try {
      scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
      LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }

    try {
      doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
      e.printStackTrace();
    }

    if (doc == null) {
      output.append("Document discarded by indexing filter\n");
      return 0;
    }

    for (String fname : doc.getFieldNames()) {
      List<Object> values = doc.getField(fname).getValues();
      if (values != null) {
        for (Object value : values) {
          String str = value.toString();
          // truncate to 100 chars unless -dumpText was given
          int minText = dumpText ? str.length() : Math.min(100, str.length());
          output.append(fname + " :\t" + str.substring(0, minText) + "\n");
        }
      }
    }

    output.append("\n"); // For readability if keepClientCnxOpen

    // doc is known to be non-null here (early return above)
    if (getConf().getBoolean("doIndex", false)) {
      IndexWriters writers = new IndexWriters(getConf());
      writers.open(new JobConf(getConf()), "IndexingFilterChecker");
      writers.write(doc);
      writers.close();
    }

    return 0;
  }

  /**
   * Runs the protocol plugin matching {@code url} and returns its output.
   */
  protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum)
      throws Exception {
    ProtocolFactory factory = new ProtocolFactory(getConf());
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput protocolOutput = protocol.getProtocolOutput(turl, datum);
    return protocolOutput;
  }

  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new IndexingFiltersChecker(), args);
    System.exit(res);
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer;

import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Random;

import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.segment.SegmentChecker;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Generic indexer which relies on the plugins implementing IndexWriter
 **/
public class IndexingJob extends NutchTool implements Tool {

  public static Logger LOG = LoggerFactory.getLogger(IndexingJob.class);

  public IndexingJob() {
    super(null);
  }

  public IndexingJob(Configuration conf) {
    super(conf);
  }

  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit) throws IOException {
    index(crawlDb, linkDb, segments, noCommit, false, null);
  }

  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit, boolean deleteGone) throws IOException {
    index(crawlDb, linkDb, segments, noCommit, deleteGone, null);
  }

  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit, boolean deleteGone, String params) throws IOException {
    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, false, false);
  }

  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit, boolean deleteGone, String params, boolean filter,
      boolean normalize) throws IOException {
    // FIX: forward the caller's filter/normalize flags; they were previously
    // replaced by hard-coded false values, silently disabling both options.
    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
        normalize, false);
  }

  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit, boolean deleteGone, String params,
      boolean filter, boolean normalize, boolean addBinaryContent)
      throws IOException {
    // FIX: same forwarding bug as above — all four flags were dropped.
    index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
        normalize, addBinaryContent, false);
  }

  /**
   * Runs the indexing MapReduce job over the given segments and, unless
   * {@code noCommit}, commits through every configured IndexWriter plugin.
   *
   * @param crawlDb crawl database path
   * @param linkDb link database path, may be null
   * @param segments segments to index
   * @param noCommit skip the final commit on the index back-ends
   * @param deleteGone delete documents for gone/redirected pages
   * @param params extra parameters passed to the indexers, may be null
   * @param filter apply URL filters
   * @param normalize apply URL normalizers
   * @param addBinaryContent include the raw binary content in the documents
   * @param base64 encode the binary content as Base64
   */
  public void index(Path crawlDb, Path linkDb, List<Path> segments,
      boolean noCommit, boolean deleteGone, String params,
      boolean filter, boolean normalize, boolean addBinaryContent,
      boolean base64) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Indexer: starting at {}", sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("Indexer");

    LOG.info("Indexer: deleting gone documents: {}", deleteGone);
    LOG.info("Indexer: URL filtering: {}", filter);
    LOG.info("Indexer: URL normalizing: {}", normalize);
    if (addBinaryContent) {
      if (base64) {
        LOG.info("Indexer: adding binary content as Base64");
      } else {
        LOG.info("Indexer: adding binary content");
      }
    }
    IndexWriters writers = new IndexWriters(getConf());
    LOG.info(writers.describe());

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job,
        addBinaryContent);

    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    job.setBoolean(IndexerMapReduce.INDEXER_BINARY_AS_BASE64, base64);

    if (params != null) {
      job.set(IndexerMapReduce.INDEXER_PARAMS, params);
    }

    job.setReduceSpeculativeExecution(false);

    // throwaway output dir; the real output goes to the index back-ends
    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-"
        + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
      RunningJob indexJob = JobClient.runJob(job);
      // do the commits once and for all the reducers in one go
      if (!noCommit) {
        writers.open(job, "commit");
        writers.commit();
      }
      LOG.info("Indexer: number of documents indexed, deleted, or skipped:");
      for (Counter counter : indexJob.getCounters().getGroup("IndexerStatus")) {
        LOG.info("Indexer: {} {}",
            String.format(Locale.ROOT, "%6d", counter.getValue()),
            counter.getName());
      }
      long end = System.currentTimeMillis();
      LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: "
          + TimingUtil.elapsedTime(start, end));
    } finally {
      // always remove the temporary output dir, even on job failure
      FileSystem.get(job).delete(tmp, true);
    }
  }

  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err
          .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
      IndexWriters writers = new IndexWriters(getConf());
      System.err.println(writers.describe());
      return -1;
    }

    final Path crawlDb = new Path(args[0]);
    Path linkDb = null;

    final List<Path> segments = new ArrayList<Path>();
    String params = null;

    boolean noCommit = false;
    boolean deleteGone = false;
    boolean filter = false;
    boolean normalize = false;
    boolean addBinaryContent = false;
    boolean base64 = false;

    for (int i = 1; i < args.length; i++) {
      FileSystem fs = null;
      Path dir = null;
      if (args[i].equals("-linkdb")) {
        linkDb = new Path(args[++i]);
      } else if (args[i].equals("-dir")) {
        // index every indexable segment below the given directory
        dir = new Path(args[++i]);
        fs = dir.getFileSystem(getConf());
        FileStatus[] fstats = fs.listStatus(dir,
            HadoopFSUtil.getPassDirectoriesFilter(fs));
        Path[] files = HadoopFSUtil.getPaths(fstats);
        for (Path p : files) {
          if (SegmentChecker.isIndexable(p, fs)) {
            segments.add(p);
          }
        }
      } else if (args[i].equals("-noCommit")) {
        noCommit = true;
      } else if (args[i].equals("-deleteGone")) {
        deleteGone = true;
      } else if (args[i].equals("-filter")) {
        filter = true;
      } else if (args[i].equals("-normalize")) {
        normalize = true;
      } else if (args[i].equals("-addBinaryContent")) {
        addBinaryContent = true;
      } else if (args[i].equals("-base64")) {
        base64 = true;
      } else if (args[i].equals("-params")) {
        params = args[++i];
      } else {
        // plain argument: a single segment path
        dir = new Path(args[i]);
        fs = dir.getFileSystem(getConf());
        if (SegmentChecker.isIndexable(dir, fs)) {
          segments.add(dir);
        }
      }
    }

    try {
      index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter,
          normalize, addBinaryContent, base64);
      return 0;
    } catch (final Exception e) {
      LOG.error("Indexer: {}", StringUtils.stringifyException(e));
      return -1;
    }
  }

  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new IndexingJob(), args);
    System.exit(res);
  }

  // Used for REST API
  @Override
  public Map<String, Object> run(Map<String, Object> args, String crawlId)
      throws Exception {
    boolean noCommit = false;
    boolean deleteGone = false;
    boolean filter = false;
    boolean normalize = false;
    boolean isSegment = false;
    String params = null;
    Configuration conf = getConf();

    Path crawlDb;
    if (args.containsKey(Nutch.ARG_CRAWLDB)) {
      Object crawldbPath = args.get(Nutch.ARG_CRAWLDB);
      if (crawldbPath instanceof Path) {
        crawlDb = (Path) crawldbPath;
      } else {
        crawlDb = new Path(crawldbPath.toString());
      }
    } else {
      crawlDb = new Path(crawlId + "/crawldb");
    }

    Path linkdb = null;
    List<Path> segments = new ArrayList<Path>();

    // NOTE(review): the original code had a duplicated containsKey check
    // here whose else-branch (defaulting to crawlId + "/linkdb") was
    // unreachable; the observable behavior — linkdb stays null when the
    // argument is absent — is preserved.
    if (args.containsKey(Nutch.ARG_LINKDB)) {
      Object path = args.get(Nutch.ARG_LINKDB);
      if (path instanceof Path) {
        linkdb = (Path) path;
      } else {
        linkdb = new Path(path.toString());
      }
    }

    if (args.containsKey(Nutch.ARG_SEGMENTDIR)) {
      isSegment = true;
      Path segmentsDir;
      Object segDir = args.get(Nutch.ARG_SEGMENTDIR);
      if (segDir instanceof Path) {
        segmentsDir = (Path) segDir;
      } else {
        segmentsDir = new Path(segDir.toString());
      }
      FileSystem fs = segmentsDir.getFileSystem(getConf());
      FileStatus[] fstats = fs.listStatus(segmentsDir,
          HadoopFSUtil.getPassDirectoriesFilter(fs));
      Path[] files = HadoopFSUtil.getPaths(fstats);
      for (Path p : files) {
        if (SegmentChecker.isIndexable(p, fs)) {
          segments.add(p);
        }
      }
    }

    if (args.containsKey(Nutch.ARG_SEGMENT)) {
      isSegment = true;
      Object seg = args.get(Nutch.ARG_SEGMENT);
      ArrayList<String> segmentList = new ArrayList<String>();
      if (seg instanceof ArrayList) {
        segmentList = (ArrayList<String>) seg;
      }
      for (String segment : segmentList) {
        segments.add(new Path(segment));
      }
    }

    if (!isSegment) {
      // no segment given: pick the most recently modified one
      String segmentDir = crawlId + "/segments";
      File segmentsDir = new File(segmentDir);
      File[] segmentsList = segmentsDir.listFiles();
      if (segmentsList == null || segmentsList.length == 0) {
        // listFiles() returns null for a missing/non-directory path;
        // previously this caused an unhelpful NullPointerException
        throw new IOException("No segments found in " + segmentDir);
      }
      Arrays.sort(segmentsList, new Comparator<File>() {
        @Override
        public int compare(File f1, File f2) {
          // newest first; the original comparator returned 0 for "older"
          // and thus violated the Comparator contract
          return Long.compare(f2.lastModified(), f1.lastModified());
        }
      });
      Path segment = new Path(segmentsList[0].getPath());
      segments.add(segment);
    }

    if (args.containsKey("noCommit")) {
      noCommit = true;
    }
    if (args.containsKey("deleteGone")) {
      deleteGone = true;
    }
    if (args.containsKey("normalize")) {
      normalize = true;
    }
    if (args.containsKey("filter")) {
      filter = true;
    }
    if (args.containsKey("params")) {
      params = (String) args.get("params");
    }
    setConf(conf);
    index(crawlDb, linkdb, segments, noCommit, deleteGone, params, filter,
        normalize);
    Map<String, Object> results = new HashMap<String, Object>();
    results.put(Nutch.VAL_RESULT, 0);
    return results;
  }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.indexer;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.nutch.metadata.Metadata;

/**
 * A {@link NutchDocument} is the unit of indexing: a named set of
 * multi-valued {@link NutchField}s plus document-level metadata and a
 * weight. Serialized with a leading version byte for forward-compatibility
 * checks.
 */
public class NutchDocument implements Writable,
    Iterable<Entry<String, NutchField>> {

  /** Serialization format version; checked on read. */
  public static final byte VERSION = 2;

  // field name -> multi-valued field
  private Map<String, NutchField> fields;

  // document-level metadata, serialized after the fields
  private Metadata documentMeta;

  // document boost/weight, defaults to 1.0
  private float weight;

  public NutchDocument() {
    fields = new HashMap<String, NutchField>();
    documentMeta = new Metadata();
    weight = 1.0f;
  }

  /**
   * Adds a value under the given field name, appending to the field if it
   * already exists.
   */
  public void add(String name, Object value) {
    NutchField existing = fields.get(name);
    if (existing != null) {
      existing.add(value);
    } else {
      fields.put(name, new NutchField(value));
    }
  }

  /**
   * Returns the first value of the named field, or null if the field is
   * absent or empty.
   */
  public Object getFieldValue(String name) {
    NutchField field = fields.get(name);
    if (field == null || field.getValues().size() == 0) {
      return null;
    }
    return field.getValues().get(0);
  }

  /** Returns the named field, or null if absent. */
  public NutchField getField(String name) {
    return fields.get(name);
  }

  /** Removes and returns the named field, or null if absent. */
  public NutchField removeField(String name) {
    return fields.remove(name);
  }

  /** Returns a view of all field names currently in the document. */
  public Collection<String> getFieldNames() {
    return fields.keySet();
  }

  /** Iterate over all fields. */
  public Iterator<Entry<String, NutchField>> iterator() {
    return fields.entrySet().iterator();
  }

  public float getWeight() {
    return weight;
  }

  public void setWeight(float weight) {
    this.weight = weight;
  }

  public Metadata getDocumentMeta() {
    return documentMeta;
  }

  /**
   * Deserializes the document, replacing all current state.
   *
   * @throws VersionMismatchException if the stream was written by a
   *         different format version
   */
  public void readFields(DataInput in) throws IOException {
    fields.clear();
    byte streamVersion = in.readByte();
    if (streamVersion != VERSION) {
      throw new VersionMismatchException(VERSION, streamVersion);
    }
    int fieldCount = WritableUtils.readVInt(in);
    for (int idx = 0; idx < fieldCount; idx++) {
      String fieldName = Text.readString(in);
      NutchField restored = new NutchField();
      restored.readFields(in);
      fields.put(fieldName, restored);
    }
    weight = in.readFloat();
    documentMeta.readFields(in);
  }

  /** Serializes the document: version byte, fields, weight, metadata. */
  public void write(DataOutput out) throws IOException {
    out.writeByte(VERSION);
    WritableUtils.writeVInt(out, fields.size());
    for (Entry<String, NutchField> fieldEntry : fields.entrySet()) {
      Text.writeString(out, fieldEntry.getKey());
      fieldEntry.getValue().write(out);
    }
    out.writeFloat(weight);
    documentMeta.write(out);
  }

  /** Human-readable dump of all fields, one per line. */
  public String toString() {
    StringBuilder buf = new StringBuilder();
    buf.append("doc {\n");
    for (Entry<String, NutchField> fieldEntry : fields.entrySet()) {
      buf.append("\t");
      buf.append(fieldEntry.getKey());
      buf.append(":\t");
      buf.append(fieldEntry.getValue());
      buf.append("\n");
    }
    buf.append("}\n");
    return buf.toString();
  }
}
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.indexer;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * This class represents a multi-valued field with a weight. Values are
 * arbitrary objects; only String, Boolean, Integer, Long, Float and Date
 * values can be serialized.
 */
public class NutchField implements Writable, Cloneable {
  private float weight;
  private List<Object> values = new ArrayList<Object>();

  public NutchField() {
  }

  public NutchField(Object value) {
    this(value, 1.0f);
  }

  /**
   * Creates a field with the given weight. A Collection value is flattened
   * into the field's values; any other object becomes a single value.
   */
  public NutchField(Object value, float weight) {
    this.weight = weight;
    if (value instanceof Collection) {
      values.addAll((Collection<?>) value);
    } else {
      values.add(value);
    }
  }

  public void add(Object value) {
    values.add(value);
  }

  public float getWeight() {
    return weight;
  }

  public void setWeight(float weight) {
    this.weight = weight;
  }

  public List<Object> getValues() {
    return values;
  }

  /** Resets the field to its initial state: weight 1.0, no values. */
  public void reset() {
    weight = 1.0f;
    values.clear();
  }

  /**
   * Returns a copy of this field with its own value list (the values
   * themselves are not copied).
   */
  @Override
  public Object clone() throws CloneNotSupportedException {
    // FIX: the class previously did not implement Cloneable, so
    // super.clone() always threw CloneNotSupportedException; it also shared
    // the same values list between the copy and the original.
    NutchField result = (NutchField) super.clone();
    result.weight = weight;
    result.values = new ArrayList<Object>(values);
    return result;
  }

  /**
   * Deserializes the field: weight, value count, then (type-name, payload)
   * pairs.
   *
   * @throws IOException if the stream contains an unsupported type name —
   *         previously such entries were skipped silently, leaving the
   *         stream misaligned
   */
  @Override
  public void readFields(DataInput in) throws IOException {
    weight = in.readFloat();
    int count = in.readInt();
    values = new ArrayList<Object>();
    for (int i = 0; i < count; i++) {
      String type = Text.readString(in);

      if (type.equals("java.lang.String")) {
        values.add(Text.readString(in));
      } else if (type.equals("java.lang.Boolean")) {
        values.add(in.readBoolean());
      } else if (type.equals("java.lang.Integer")) {
        values.add(in.readInt());
      } else if (type.equals("java.lang.Float")) {
        values.add(in.readFloat());
      } else if (type.equals("java.lang.Long")) {
        values.add(in.readLong());
      } else if (type.equals("java.util.Date")) {
        values.add(new Date(in.readLong()));
      } else {
        throw new IOException("Unsupported value type in NutchField: " + type);
      }
    }
  }

  /**
   * Serializes the field symmetrically to {@link #readFields(DataInput)}.
   *
   * @throws IOException if a value has an unsupported type — previously the
   *         type name was written with no payload, corrupting the stream
   */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeFloat(weight);
    out.writeInt(values.size());
    for (Object value : values) {

      Text.writeString(out, value.getClass().getName());

      if (value instanceof Boolean) {
        out.writeBoolean((Boolean) value);
      } else if (value instanceof Integer) {
        out.writeInt((Integer) value);
      } else if (value instanceof Long) {
        out.writeLong((Long) value);
      } else if (value instanceof Float) {
        out.writeFloat((Float) value);
      } else if (value instanceof String) {
        Text.writeString(out, (String) value);
      } else if (value instanceof Date) {
        Date date = (Date) value;
        out.writeLong(date.getTime());
      } else {
        throw new IOException("Cannot serialize NutchField value of type "
            + value.getClass().getName());
      }
    }
  }

  public String toString() {
    return values.toString();
  }

}
+ */ +package org.apache.nutch.indexer; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; + +import org.apache.nutch.indexer.NutchDocument; + +/** + * A {@link NutchIndexAction} is the new unit of indexing holding the document + * and action information. + */ +public class NutchIndexAction implements Writable { + + public static final byte ADD = 0; + public static final byte DELETE = 1; + public static final byte UPDATE = 2; + + public NutchDocument doc = null; + public byte action = ADD; + + protected NutchIndexAction() { + } + + public NutchIndexAction(NutchDocument doc, byte action) { + this.doc = doc; + this.action = action; + } + + public void readFields(DataInput in) throws IOException { + action = in.readByte(); + doc = new NutchDocument(); + doc.readFields(in); + } + + public void write(DataOutput out) throws IOException { + out.write(action); + doc.write(out); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/indexer/package.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/indexer/package.html b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html new file mode 100644 index 0000000..825eaae --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/indexer/package.html @@ -0,0 +1,10 @@ +<html> +<body> +Index content, configure and run indexing and cleaning jobs to +add, update, and delete documents from an index. Two tasks are +delegated to plugins: +<ul> +<li>indexing filters fill index fields of each documents</li> +<li>index writer plugins send documents to index back-ends (Solr, etc.). 
+</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java new file mode 100644 index 0000000..f9c425b --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/CreativeCommons.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +/** + * A collection of Creative Commons properties names. 
+ * + * @see <a href="http://www.creativecommons.org/">creativecommons.org</a> + * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public interface CreativeCommons { + + public final static String LICENSE_URL = "License-Url"; + + public final static String LICENSE_LOCATION = "License-Location"; + + public final static String WORK_TYPE = "Work-Type"; + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java new file mode 100644 index 0000000..9724d80 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/DublinCore.java @@ -0,0 +1,161 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +/** + * A collection of Dublin Core metadata names. 
+ * + * @see <a href="http://dublincore.org">dublincore.org</a> + * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public interface DublinCore { + + /** + * Typically, Format may include the media-type or dimensions of the resource. + * Format may be used to determine the software, hardware or other equipment + * needed to display or operate the resource. Examples of dimensions include + * size and duration. Recommended best practice is to select a value from a + * controlled vocabulary (for example, the list of Internet Media Types [MIME] + * defining computer media formats). + */ + public static final String FORMAT = "format"; + + /** + * Recommended best practice is to identify the resource by means of a string + * or number conforming to a formal identification system. Example formal + * identification systems include the Uniform Resource Identifier (URI) + * (including the Uniform Resource Locator (URL)), the Digital Object + * Identifier (DOI) and the International Standard Book Number (ISBN). + */ + public static final String IDENTIFIER = "identifier"; + + /** + * Date on which the resource was changed. + */ + public static final String MODIFIED = "modified"; + + /** + * An entity responsible for making contributions to the content of the + * resource. Examples of a Contributor include a person, an organisation, or a + * service. Typically, the name of a Contributor should be used to indicate + * the entity. + */ + public static final String CONTRIBUTOR = "contributor"; + + /** + * The extent or scope of the content of the resource. Coverage will typically + * include spatial location (a place name or geographic coordinates), temporal + * period (a period label, date, or date range) or jurisdiction (such as a + * named administrative entity). 
Recommended best practice is to select a + * value from a controlled vocabulary (for example, the Thesaurus of + * Geographic Names [TGN]) and that, where appropriate, named places or time + * periods be used in preference to numeric identifiers such as sets of + * coordinates or date ranges. + */ + public static final String COVERAGE = "coverage"; + + /** + * An entity primarily responsible for making the content of the resource. + * Examples of a Creator include a person, an organisation, or a service. + * Typically, the name of a Creator should be used to indicate the entity. + */ + public static final String CREATOR = "creator"; + + /** + * A date associated with an event in the life cycle of the resource. + * Typically, Date will be associated with the creation or availability of the + * resource. Recommended best practice for encoding the date value is defined + * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format. + */ + public static final String DATE = "date"; + + /** + * An account of the content of the resource. Description may include but is + * not limited to: an abstract, table of contents, reference to a graphical + * representation of content or a free-text account of the content. + */ + public static final String DESCRIPTION = "description"; + + /** + * A language of the intellectual content of the resource. Recommended best + * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639 + * [ISO639], defines two- and three-letter primary language tags with optional + * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian, + * and "en-GB" for English used in the United Kingdom. + */ + public static final String LANGUAGE = "language"; + + /** + * An entity responsible for making the resource available. Examples of a + * Publisher include a person, an organisation, or a service. Typically, the + * name of a Publisher should be used to indicate the entity. 
+ */ + public static final String PUBLISHER = "publisher"; + + /** + * A reference to a related resource. Recommended best practice is to + * reference the resource by means of a string or number conforming to a + * formal identification system. + */ + public static final String RELATION = "relation"; + + /** + * Information about rights held in and over the resource. Typically, a Rights + * element will contain a rights management statement for the resource, or + * reference a service providing such information. Rights information often + * encompasses Intellectual Property Rights (IPR), Copyright, and various + * Property Rights. If the Rights element is absent, no assumptions can be + * made about the status of these and other rights with respect to the + * resource. + */ + public static final String RIGHTS = "rights"; + + /** + * A reference to a resource from which the present resource is derived. The + * present resource may be derived from the Source resource in whole or in + * part. Recommended best practice is to reference the resource by means of a + * string or number conforming to a formal identification system. + */ + public static final String SOURCE = "source"; + + /** + * The topic of the content of the resource. Typically, a Subject will be + * expressed as keywords, key phrases or classification codes that describe a + * topic of the resource. Recommended best practice is to select a value from + * a controlled vocabulary or formal classification scheme. + */ + public static final String SUBJECT = "subject"; + + /** + * A name given to the resource. Typically, a Title will be a name by which + * the resource is formally known. + */ + public static final String TITLE = "title"; + + /** + * The nature or genre of the content of the resource. Type includes terms + * describing general categories, functions, genres, or aggregation levels for + * content. 
Recommended best practice is to select a value from a controlled + * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe + * the physical or digital manifestation of the resource, use the Format + * element. + */ + public static final String TYPE = "type"; + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java new file mode 100644 index 0000000..2697da6 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Feed.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.metadata; + +/** + * A collection of Feed property names extracted by the ROME library. 
+ * + * + * @author mattmann + * @author dogacan + */ +public interface Feed { + + public static final String FEED_AUTHOR = "author"; + + public static final String FEED_TAGS = "tag"; + + public static final String FEED_PUBLISHED = "published"; + + public static final String FEED_UPDATED = "updated"; + + public static final String FEED = "feed"; +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java new file mode 100644 index 0000000..78b8797 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/HttpHeaders.java @@ -0,0 +1,51 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import org.apache.hadoop.io.Text; + +/** + * A collection of HTTP header names. 
+ * + * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol + * -- HTTP/1.1 (RFC 2616)</a> + */ +public interface HttpHeaders { + + public final static String TRANSFER_ENCODING = "Transfer-Encoding"; + + public final static String CONTENT_ENCODING = "Content-Encoding"; + + public final static String CONTENT_LANGUAGE = "Content-Language"; + + public final static String CONTENT_LENGTH = "Content-Length"; + + public final static String CONTENT_LOCATION = "Content-Location"; + + public static final String CONTENT_DISPOSITION = "Content-Disposition"; + + public final static String CONTENT_MD5 = "Content-MD5"; + + public final static String CONTENT_TYPE = "Content-Type"; + + public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE); + + public final static String LAST_MODIFIED = "Last-Modified"; + + public final static String LOCATION = "Location"; + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java new file mode 100644 index 0000000..a43fa9d --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/MetaWrapper.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.metadata; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Writable; +import org.apache.nutch.crawl.NutchWritable; + +/** + * This is a simple decorator that adds metadata to any Writable-s that can be + * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be + * temporarily enriched during processing, but this temporary metadata doesn't + * need to be permanently stored after the job is done. + * + * @author Andrzej Bialecki + */ +public class MetaWrapper extends NutchWritable { + private Metadata metadata; + + public MetaWrapper() { + super(); + metadata = new Metadata(); + } + + public MetaWrapper(Writable instance, Configuration conf) { + super(instance); + metadata = new Metadata(); + setConf(conf); + } + + public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) { + super(instance); + if (metadata == null) + metadata = new Metadata(); + this.metadata = metadata; + setConf(conf); + } + + /** + * Get all metadata. + */ + public Metadata getMetadata() { + return metadata; + } + + /** + * Add metadata. See {@link Metadata#add(String, String)} for more + * information. + * + * @param name + * metadata name + * @param value + * metadata value + */ + public void addMeta(String name, String value) { + metadata.add(name, value); + } + + /** + * Set metadata. See {@link Metadata#set(String, String)} for more + * information. 
+ * + * @param name + * @param value + */ + public void setMeta(String name, String value) { + metadata.set(name, value); + } + + /** + * Get metadata. See {@link Metadata#get(String)} for more information. + * + * @param name + * @return metadata value + */ + public String getMeta(String name) { + return metadata.get(name); + } + + /** + * Get multiple metadata. See {@link Metadata#getValues(String)} for more + * information. + * + * @param name + * @return multiple values + */ + public String[] getMetaValues(String name) { + return metadata.getValues(name); + } + + public void readFields(DataInput in) throws IOException { + super.readFields(in); + metadata = new Metadata(); + metadata.readFields(in); + } + + public void write(DataOutput out) throws IOException { + super.write(out); + metadata.write(out); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java new file mode 100644 index 0000000..8a57ee3 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Metadata.java @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Enumeration; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +/** + * A multi-valued metadata container. + */ +public class Metadata implements Writable, CreativeCommons, DublinCore, + HttpHeaders, Nutch, Feed { + + /** + * A map of all metadata attributes. + */ + private Map<String, String[]> metadata = null; + + /** + * Constructs a new, empty metadata. + */ + public Metadata() { + metadata = new HashMap<String, String[]>(); + } + + /** + * Returns true if named value is multivalued. + * + * @param name + * name of metadata + * @return true is named value is multivalued, false if single value or null + */ + public boolean isMultiValued(final String name) { + return metadata.get(name) != null && metadata.get(name).length > 1; + } + + /** + * Returns an array of the names contained in the metadata. + * + * @return Metadata names + */ + public String[] names() { + return metadata.keySet().toArray(new String[metadata.keySet().size()]); + } + + /** + * Get the value associated to a metadata name. If many values are assiociated + * to the specified name, then the first one is returned. + * + * @param name + * of the metadata. + * @return the value associated to the specified metadata name. 
+ */ + public String get(final String name) { + String[] values = metadata.get(name); + if (values == null) { + return null; + } else { + return values[0]; + } + } + + /** + * Get the values associated to a metadata name. + * + * @param name + * of the metadata. + * @return the values associated to a metadata name. + */ + public String[] getValues(final String name) { + return _getValues(name); + } + + private String[] _getValues(final String name) { + String[] values = metadata.get(name); + if (values == null) { + values = new String[0]; + } + return values; + } + + /** + * Add a metadata name/value mapping. Add the specified value to the list of + * values associated to the specified metadata name. + * + * @param name + * the metadata name. + * @param value + * the metadata value. + */ + public void add(final String name, final String value) { + String[] values = metadata.get(name); + if (values == null) { + set(name, value); + } else { + String[] newValues = new String[values.length + 1]; + System.arraycopy(values, 0, newValues, 0, values.length); + newValues[newValues.length - 1] = value; + metadata.put(name, newValues); + } + } + + /** + * Add all name/value mappings (merge two metadata mappings). If a name + * already exists in current metadata the values are added to existing values. + * + * @param metadata + * other Metadata to be merged + */ + public void addAll(Metadata metadata) { + for (String name : metadata.names()) { + String[] addValues = metadata.getValues(name); + if (addValues == null) + continue; + String[] oldValues = this.metadata.get(name); + if (oldValues == null) { + this.metadata.put(name, addValues); + } else { + String[] newValues = new String[oldValues.length + addValues.length]; + System.arraycopy(oldValues, 0, newValues, 0, oldValues.length); + System.arraycopy(addValues, 0, newValues, oldValues.length, + addValues.length); + this.metadata.put(name, newValues); + } + } + } + + /** + * Copy All key-value pairs from properties. 
+ * + * @param properties + * properties to copy from + */ + public void setAll(Properties properties) { + Enumeration<?> names = properties.propertyNames(); + while (names.hasMoreElements()) { + String name = (String) names.nextElement(); + metadata.put(name, new String[] { properties.getProperty(name) }); + } + } + + /** + * Set metadata name/value. Associate the specified value to the specified + * metadata name. If some previous values were associated to this name, they + * are removed. + * + * @param name + * the metadata name. + * @param value + * the metadata value. + */ + public void set(String name, String value) { + metadata.put(name, new String[] { value }); + } + + /** + * Remove a metadata and all its associated values. + * + * @param name + * metadata name to remove + */ + public void remove(String name) { + metadata.remove(name); + } + + /** + * Returns the number of metadata names in this metadata. + * + * @return number of metadata names + */ + public int size() { + return metadata.size(); + } + + /** Remove all mappings from metadata. 
*/ + public void clear() { + metadata.clear(); + } + + public boolean equals(Object o) { + + if (o == null) { + return false; + } + + Metadata other = null; + try { + other = (Metadata) o; + } catch (ClassCastException cce) { + return false; + } + + if (other.size() != size()) { + return false; + } + + String[] names = names(); + for (int i = 0; i < names.length; i++) { + String[] otherValues = other._getValues(names[i]); + String[] thisValues = _getValues(names[i]); + if (otherValues.length != thisValues.length) { + return false; + } + for (int j = 0; j < otherValues.length; j++) { + if (!otherValues[j].equals(thisValues[j])) { + return false; + } + } + } + return true; + } + + public String toString() { + StringBuffer buf = new StringBuffer(); + String[] names = names(); + for (int i = 0; i < names.length; i++) { + String[] values = _getValues(names[i]); + for (int j = 0; j < values.length; j++) { + buf.append(names[i]).append("=").append(values[j]).append(" "); + } + } + return buf.toString(); + } + + public final void write(DataOutput out) throws IOException { + out.writeInt(size()); + String[] values = null; + String[] names = names(); + for (int i = 0; i < names.length; i++) { + Text.writeString(out, names[i]); + values = _getValues(names[i]); + int cnt = 0; + for (int j = 0; j < values.length; j++) { + if (values[j] != null) + cnt++; + } + out.writeInt(cnt); + for (int j = 0; j < values.length; j++) { + if (values[j] != null) { + Text.writeString(out, values[j]); + } + } + } + } + + public final void readFields(DataInput in) throws IOException { + int keySize = in.readInt(); + String key; + for (int i = 0; i < keySize; i++) { + key = Text.readString(in); + int valueSize = in.readInt(); + for (int j = 0; j < valueSize; j++) { + add(key, Text.readString(in)); + } + } + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java 
---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java new file mode 100644 index 0000000..de80399 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/Nutch.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import org.apache.hadoop.io.Text; + +/** + * A collection of Nutch internal metadata constants. 
+ * + * @author Chris Mattmann + * @author Jérôme Charron + */ +public interface Nutch { + + public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding"; + + public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion"; + + public static final String SIGNATURE_KEY = "nutch.content.digest"; + + public static final String SEGMENT_NAME_KEY = "nutch.segment.name"; + + public static final String SCORE_KEY = "nutch.crawl.score"; + + public static final String GENERATE_TIME_KEY = "_ngt_"; + + public static final Text WRITABLE_GENERATE_TIME_KEY = new Text( + GENERATE_TIME_KEY); + + public static final Text PROTOCOL_STATUS_CODE_KEY = new Text("nutch.protocol.code"); + + public static final String PROTO_STATUS_KEY = "_pst_"; + + public static final Text WRITABLE_PROTO_STATUS_KEY = new Text( + PROTO_STATUS_KEY); + + public static final String FETCH_TIME_KEY = "_ftk_"; + + public static final String FETCH_STATUS_KEY = "_fst_"; + + /** + * Sites may request that search engines don't provide access to cached + * documents. + */ + public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden"; + + /** Show both original forbidden content and summaries (default). */ + public static final String CACHING_FORBIDDEN_NONE = "none"; + + /** Don't show either original forbidden content or summaries. */ + public static final String CACHING_FORBIDDEN_ALL = "all"; + + /** Don't show original forbidden content, but show summaries. */ + public static final String CACHING_FORBIDDEN_CONTENT = "content"; + + public static final String REPR_URL_KEY = "_repr_"; + + public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY); + + /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */ + public static final String FIXED_INTERVAL_KEY = "fixedInterval"; + + public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text( + FIXED_INTERVAL_KEY); + + + /** For progress of job. 
Used by the Nutch REST service */ + public static final String STAT_PROGRESS = "progress"; + /**Used by Nutch REST service */ + public static final String CRAWL_ID_KEY = "storage.crawl.id"; + /** Argument key to specify location of the seed url dir for the REST endpoints **/ + public static final String ARG_SEEDDIR = "url_dir"; + /** Argument key to specify the location of crawldb for the REST endpoints **/ + public static final String ARG_CRAWLDB = "crawldb"; + /** Argument key to specify the location of linkdb for the REST endpoints **/ + public static final String ARG_LINKDB = "linkdb"; + /** Name of the key used in the Result Map sent back by the REST endpoint **/ + public static final String VAL_RESULT = "result"; + /** Argument key to specify the location of a directory of segments for the REST endpoints. + * Similar to the -dir command in the bin/nutch script **/ + public static final String ARG_SEGMENTDIR = "segment_dir"; + /** Argument key to specify the location of individual segment for the REST endpoints **/ + public static final String ARG_SEGMENT = "segment"; +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java new file mode 100644 index 0000000..164ca1d --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/SpellCheckedMetadata.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.metadata; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; + +/** + * A decorator to Metadata that adds spellchecking capabilities to property + * names. Currently used spelling vocabulary contains just the httpheaders from + * {@link HttpHeaders} class. + * + */ +public class SpellCheckedMetadata extends Metadata { + + /** + * Treshold divider. + * + * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code> + */ + private static final int TRESHOLD_DIVIDER = 3; + + /** + * Normalized name to name mapping. + */ + private final static Map<String, String> NAMES_IDX = new HashMap<String, String>(); + + /** + * Array holding map keys. + */ + private static String[] normalized = null; + + static { + + // Uses following array to fill the metanames index and the + // metanames list. + Class<?>[] spellthese = { HttpHeaders.class }; + + for (Class<?> spellCheckedNames : spellthese) { + for (Field field : spellCheckedNames.getFields()) { + int mods = field.getModifiers(); + if (Modifier.isFinal(mods) && Modifier.isPublic(mods) + && Modifier.isStatic(mods) && field.getType().equals(String.class)) { + try { + String val = (String) field.get(null); + NAMES_IDX.put(normalize(val), val); + } catch (Exception e) { + // Simply ignore... 
+ } + } + } + } + normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]); + } + + /** + * Normalizes String. + * + * @param str + * the string to normalize + * @return normalized String + */ + private static String normalize(final String str) { + char c; + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < str.length(); i++) { + c = str.charAt(i); + if (Character.isLetter(c)) { + buf.append(Character.toLowerCase(c)); + } + } + return buf.toString(); + } + + /** + * Get the normalized name of metadata attribute name. This method tries to + * find a well-known metadata name (one of the metadata names defined in this + * class) that matches the specified name. The matching is error tolerent. For + * instance, + * <ul> + * <li>content-type gives Content-Type</li> + * <li>CoNtEntType gives Content-Type</li> + * <li>ConTnTtYpe gives Content-Type</li> + * </ul> + * If no matching with a well-known metadata name is found, then the original + * name is returned. + * + * @param name + * Name to normalize + * @return normalized name + */ + public static String getNormalizedName(final String name) { + String searched = normalize(name); + String value = NAMES_IDX.get(searched); + + if ((value == null) && (normalized != null)) { + int threshold = searched.length() / TRESHOLD_DIVIDER; + for (int i = 0; i < normalized.length && value == null; i++) { + if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) { + value = NAMES_IDX.get(normalized[i]); + } + } + } + return (value != null) ? 
value : name; + } + + @Override + public void remove(final String name) { + super.remove(getNormalizedName(name)); + } + + @Override + public void add(final String name, final String value) { + super.add(getNormalizedName(name), value); + } + + @Override + public String[] getValues(final String name) { + return super.getValues(getNormalizedName(name)); + } + + @Override + public String get(final String name) { + return super.get(getNormalizedName(name)); + } + + @Override + public void set(final String name, final String value) { + super.set(getNormalizedName(name), value); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/metadata/package.html ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/metadata/package.html b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html new file mode 100644 index 0000000..53281bb --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/metadata/package.html @@ -0,0 +1,6 @@ +<html> +<body> +A Multi-valued Metadata container, and set +of constant fields for Nutch Metadata. +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java new file mode 100644 index 0000000..8de5800 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilter.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +//Hadoop +import org.apache.hadoop.conf.Configurable; +// Nutch +import org.apache.nutch.plugin.Pluggable; + +/** + * Interface used to allow exemptions to external domain resources by overriding <code>db.ignore.external.links</code>. + * This is useful when the crawl is focused to a domain but resources like images are hosted on CDN. + */ + +public interface URLExemptionFilter extends Pluggable, Configurable{ + + /** The name of the extension point. 
*/ + public final static String X_POINT_ID = URLExemptionFilter.class.getName(); + + /** + * Checks if toUrl is exempted when the ignore external is enabled + * @param fromUrl : the source url which generated the outlink + * @param toUrl : the destination url which needs to be checked for exemption + * @return true when toUrl is exempted from dbIgnore + */ + public boolean filter(String fromUrl, String toUrl); + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java new file mode 100644 index 0000000..d362f2e --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLExemptionFilters.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Creates and caches {@link URLExemptionFilter} implementing plugins. */ +public class URLExemptionFilters { + + private static final Logger LOG = LoggerFactory.getLogger(URLExemptionFilters.class); + + private URLExemptionFilter[] filters; + + public URLExemptionFilters(Configuration conf) { + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint(URLExemptionFilter.X_POINT_ID).getExtensions(); + filters = new URLExemptionFilter[extensions.length]; + for (int i = 0; i < extensions.length; i++) { + try { + filters[i] = (URLExemptionFilter) extensions[i].getExtensionInstance(); + } catch (PluginRuntimeException e) { + throw new IllegalStateException(e); + } + } + LOG.info("Found {} extensions at point:'{}'", filters.length, + URLExemptionFilter.X_POINT_ID); + } + + + /** Run all defined filters. Assume logical AND. 
*/ + public boolean isExempted(String fromUrl, String toUrl) { + if (filters.length < 1) { + //at least one filter should be on + return false; + } + //validate from, to and filters + boolean exempted = fromUrl != null && toUrl != null; + //An URL is exempted when all the filters accept it to pass through + for (int i = 0; i < this.filters.length && exempted; i++) { + exempted = this.filters[i].filter(fromUrl, toUrl); + } + return exempted; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java new file mode 100644 index 0000000..01efbcd --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilter.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net; + +// Hadoop imports +import org.apache.hadoop.conf.Configurable; + +// Nutch imports +import org.apache.nutch.plugin.Pluggable; + +/** + * Interface used to limit which URLs enter Nutch. 
Used by the injector and the + * db updater. + */ + +public interface URLFilter extends Pluggable, Configurable { + /** The name of the extension point. */ + public final static String X_POINT_ID = URLFilter.class.getName(); + + /* + * Interface for a filter that transforms a URL: it can pass the original URL + * through or "delete" the URL by returning null + */ + public String filter(String urlString); +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java new file mode 100644 index 0000000..89a3d00 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterChecker.java @@ -0,0 +1,134 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; + +/** + * Checks one given filter or all filters. + * + * @author John Xing + */ +public class URLFilterChecker { + + private Configuration conf; + + public URLFilterChecker(Configuration conf) { + this.conf = conf; + } + + private void checkOne(String filterName) throws Exception { + URLFilter filter = null; + + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + URLFilter.X_POINT_ID); + + if (point == null) + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); + + Extension[] extensions = point.getExtensions(); + + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + filter = (URLFilter) extension.getExtensionInstance(); + if (filter.getClass().getName().equals(filterName)) { + break; + } else { + filter = null; + } + } + + if (filter == null) + throw new RuntimeException("Filter " + filterName + " not found."); + + // jerome : should we keep this behavior? 
+ // if (LogFormatter.hasLoggedSevere()) + // throw new RuntimeException("Severe error encountered."); + + System.out.println("Checking URLFilter " + filterName); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { + System.out.print("+"); + System.out.println(out); + } else { + System.out.print("-"); + System.out.println(line); + } + } + } + + private void checkAll() throws Exception { + System.out.println("Checking combination of all URLFilters available"); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + while ((line = in.readLine()) != null) { + URLFilters filters = new URLFilters(this.conf); + String out = filters.filter(line); + if (out != null) { + System.out.print("+"); + System.out.println(out); + } else { + System.out.print("-"); + System.out.println(line); + } + } + } + + public static void main(String[] args) throws Exception { + + String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n" + + "Tool takes a list of URLs, one per line, passed via STDIN.\n"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + String filterName = null; + if (args[0].equals("-filterName")) { + if (args.length != 2) { + System.err.println(usage); + System.exit(-1); + } + filterName = args[1]; + } + + URLFilterChecker checker = new URLFilterChecker(NutchConfiguration.create()); + if (filterName != null) { + checker.checkOne(filterName); + } else { + checker.checkAll(); + } + + System.exit(0); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java ---------------------------------------------------------------------- diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java 
b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java new file mode 100644 index 0000000..b367b56 --- /dev/null +++ b/nutch-core/src/main/java/org/apache/nutch/net/URLFilterException.java @@ -0,0 +1,39 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Exception signalling an error raised by a {@link URLFilter} implementation
 * while filtering a URL.
 */
@SuppressWarnings("serial")
public class URLFilterException extends Exception {

  /** Constructs an exception with neither detail message nor cause. */
  public URLFilterException() {
    super();
  }

  /**
   * Constructs an exception with a detail message.
   *
   * @param message
   *          the detail message
   */
  public URLFilterException(String message) {
    super(message);
  }

  /**
   * Constructs an exception wrapping an underlying cause.
   *
   * @param cause
   *          the underlying cause
   */
  public URLFilterException(Throwable cause) {
    super(cause);
  }

  /**
   * Constructs an exception with both a detail message and a cause.
   *
   * @param message
   *          the detail message
   * @param cause
   *          the underlying cause
   */
  public URLFilterException(String message, Throwable cause) {
    super(message, cause);
  }
}
*/ +public class URLFilters { + + public static final String URLFILTER_ORDER = "urlfilter.order"; + private URLFilter[] filters; + + public URLFilters(Configuration conf) { + this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins( + URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER); + } + + /** Run all defined filters. Assume logical AND. */ + public String filter(String urlString) throws URLFilterException { + for (int i = 0; i < this.filters.length; i++) { + if (urlString == null) + return null; + urlString = this.filters[i].filter(urlString); + + } + return urlString; + } +}
