Added:
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java?rev=1680929&view=auto
==============================================================================
---
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
(added)
+++
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
Thu May 21 17:14:24 2015
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.html;
+
+import java.util.Scanner;
+import java.nio.ByteBuffer;
+import java.io.ByteArrayInputStream;
+
+import java.text.ParseException;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashSet;
+
+import org.apache.avro.util.Utf8;
+import org.apache.commons.lang.StringUtils;
+import org.apache.nutch.util.StringUtil;
+
+import org.apache.commons.lang.time.DateUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.TableUtil;
+import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+import org.apache.oro.text.regex.Perl5Pattern;
+import org.apache.solr.common.util.DateUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Adds the raw HTML of a page to the document so that it can be indexed
+ * through a matching field declared in schema.xml.
+ *
+ * @author Mohamed Meabed <[email protected]>
+ */
+
+public class HtmlIndexingFilter implements IndexingFilter {
+ public static final Logger LOG =
LoggerFactory.getLogger(HtmlIndexingFilter.class);
+ private Configuration conf;
+
+ /**
+ * Get the MimeTypes resolver instance.
+ */
+ private MimeUtil MIME;
+
+ private static Collection<WebPage.Field> FIELDS = new
HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.CONTENT);
+ }
+
+ @Override
+ public NutchDocument filter(NutchDocument doc, String url, WebPage page)
throws IndexingException {
+ addRawContent(doc, page, url);
+ return doc;
+ }
+
+
+ private NutchDocument addRawContent(NutchDocument doc, WebPage page,
String url) {
+ ByteBuffer raw = page.getContent();
+ if (raw != null) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Html indexing for: " + url.toString());
+ }
+ ByteArrayInputStream arrayInputStream = new
ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
raw.remaining());
+ Scanner scanner = new Scanner(arrayInputStream);
+ scanner.useDelimiter("\\Z");//To read all scanner content in one
String
+ String data = "";
+ if (scanner.hasNext()) {
+ data = scanner.next();
+ }
+ doc.add("rawcontent", StringUtil.cleanField(data));
+ }
+ return doc;
+ }
+
+
+ public void addIndexBackendOptions(Configuration conf) {
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ MIME = new MimeUtil(conf);
+ }
+
+ public Configuration getConf() {
+ return this.conf;
+ }
+
+ @Override
+ public Collection<Field> getFields() {
+ return FIELDS;
+ }
+
+}
Propchange:
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
------------------------------------------------------------------------------
svn:executable = *
Added:
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html?rev=1680929&view=auto
==============================================================================
---
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
(added)
+++
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
Thu May 21 17:14:24 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>An HTML indexing plugin.</p><p></p>
+</body>
+</html>
Propchange:
nutch/branches/2.x/docker/cassandra/nutch/plugin/nutch2-index-html/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
------------------------------------------------------------------------------
svn:executable = *
Added: nutch/branches/2.x/docker/cassandra/nutch/testUrls/seed.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/docker/cassandra/nutch/testUrls/seed.txt?rev=1680929&view=auto
==============================================================================
--- nutch/branches/2.x/docker/cassandra/nutch/testUrls/seed.txt (added)
+++ nutch/branches/2.x/docker/cassandra/nutch/testUrls/seed.txt Thu May 21
17:14:24 2015
@@ -0,0 +1 @@
+http://www.google.com
Propchange: nutch/branches/2.x/docker/cassandra/nutch/testUrls/seed.txt
------------------------------------------------------------------------------
svn:executable = *