Author: tobr
Date: Thu Mar 28 11:18:10 2013
New Revision: 1462042
URL: http://svn.apache.org/r1462042
Log:
added module for extracting data given by selectors
Added:
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml (with
props)
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java
- copied, changed from r1461977,
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java
- copied, changed from r1461977,
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
Removed:
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
Added: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml?rev=1462042&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml Thu Mar 28
11:18:10 2013
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd"
+ xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <modelVersion>4.0.0</modelVersion>
+ <packaging>jar</packaging>
+
+ <parent>
+ <groupId>org.apache.droids</groupId>
+ <artifactId>droids</artifactId>
+ <version>0.3.0-incubating-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>droids-nekohtml</artifactId>
+ <name>APACHE DROIDS NEKOHTML PARSER</name>
+
+ <properties>
+ <nekohtml.version>1.9.18</nekohtml.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.droids</groupId>
+ <artifactId>droids-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>net.sourceforge.nekohtml</groupId>
+ <artifactId>nekohtml</artifactId>
+ <version>${nekohtml.version}</version>
+ </dependency>
+ <!-- FOR TESTING -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>ch.qos.logback</groupId>
+ <artifactId>logback-classic</artifactId>
+ <version>${logback.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange: incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/pom.xml
------------------------------------------------------------------------------
svn:mime-type = text/xml
Copied:
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java
(from r1461977,
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java)
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java&r1=1461977&r2=1462042&rev=1462042&view=diff
==============================================================================
---
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/main/java/org/apache/droids/solr/AdvancedSolrHandler.java
(original)
+++
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/main/java/org/apache/droids/nekohtml/NekoHtmlParser.java
Thu Mar 28 11:18:10 2013
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.droids.solr;
+package org.apache.droids.nekohtml;
import java.io.IOException;
import java.util.Arrays;
@@ -28,10 +28,8 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.Parser;
import org.apache.droids.core.Task;
-import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.common.SolrInputDocument;
import org.cyberneko.html.parsers.SAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -42,11 +40,11 @@ import org.xml.sax.SAXNotRecognizedExcep
import org.xml.sax.SAXNotSupportedException;
/**
- * A Droids Handler which allows to specify selectors to store
- * documents' parts in a Solr index.
+ * A Droids Parser which allows specifying selectors to extract
+ * parts of documents.
* <p/>
- * A selector is an Entry made of a key which matches the solr fiel and
- * of a value which correspond to a path selector.
+ * A selector is an Entry made of a key and
+ * a value which corresponds to a path selector.
* <p/>
* Path selectors are always absolute and supports indexes.
* <p/>
@@ -55,18 +53,9 @@ import org.xml.sax.SAXNotSupportedExcept
* - /html[0]/div[0]/p[0]
* - /html[0]/div[1]/p[2]
*/
-public class AdvancedSolrHandler extends SolrHandler {
- /**
- * The selectors allow to save specific parts of the document in the index.
- * The HashMap's key matches the Solr field.
- * The HashMap's value is an absolute path corresponding to an element.
- */
+public class NekoHtmlParser<T extends Task> implements Parser<T> {
private Map<String, String> selectors;
-
- /**
- * A content handler
- */
- private SolrContentHandler contentHandler = new
SolrContentHandler(selectors);
+ private Map<String, Pattern> patterns;
/**
* An HTML parser
@@ -74,62 +63,23 @@ public class AdvancedSolrHandler extends
private SAXParser parser;
- public AdvancedSolrHandler(String solrUrl) {
- super(solrUrl);
- }
-
- public AdvancedSolrHandler(SolrServer solrServer) {
- super(solrServer);
+ public NekoHtmlParser() {
+ this(new HashMap<String, String>());
}
-
- /**
- * @return the current path selectors
- */
- public Map<String, String> getSelectors() {
- return selectors;
+ public NekoHtmlParser(HashMap<String, String> selectors) {
+ this.patterns = new HashMap<String, Pattern>();
+ setSelectors(selectors);
+ if (parser == null) initParser();
}
- /**
- * @param selectors an hash map containing path selectors
- */
- public void setSelectors(HashMap<String, String> selectors) {
- contentHandler.initPatterns(selectors);
- this.selectors = selectors;
- }
- /*
- * @see org.apache.droids.api.Handler#handle(java.net.URI,
org.apache.droids.api.DroidsContentEntity)
- */
@Override
- public void handle(Task task) throws DroidsException, IOException {
- SolrInputDocument doc = createSolrInputDocument(task);
- try {
- getSolrServer().add(doc);
- } catch (SolrServerException e) {
- throw new DroidsException(e);
- }
- }
-
- /**
- * Generates a SolrInputDocument from an URI and a DroidsContentEntity
- * which correspond to the document which need to be saved in the index
- *
- * @param task the task
- * @return
- */
- private SolrInputDocument createSolrInputDocument(Task task) {
- SolrInputDocument doc = new SolrInputDocument();
-
- doc.setField("id", task.getURI().getPath());
- doc.setField("name", task.getURI().toASCIIString());
- doc.setField("contentType", task.getContentEntity().getContentType());
- doc.setField("content", task.getParserData().getText());
-
- if (parser == null) initParser();
+ public void parse(T task) throws DroidsException {
+ NekoContentHandler contentHandler;
- if (!selectors.isEmpty()) {
- contentHandler.initDocument(doc);
+ if (!patterns.isEmpty()) {
+ contentHandler = new NekoContentHandler(task, patterns);
try {
parser.setContentHandler(contentHandler);
parser.parse(new
InputSource(task.getContentEntity().getContent()));
@@ -140,15 +90,35 @@ public class AdvancedSolrHandler extends
}
}
- return doc;
}
/**
+ * Get the selectors.
+ *
+ * @return the map of selectors
+ */
+ public Map<String, String> getSelectors() {
+ return selectors;
+ }
+
+ /**
+ * The selectors allow extracting specific parts of the document into the parser data.
+ * The key of the map is used to identify the rule.
+ * The value contains the selection path rule e.g. /html[0]/div[0]
+ *
+ * @param selectors Map of selectors
+ */
+ public void setSelectors(HashMap<String, String> selectors) {
+ this.selectors = selectors;
+ initPatterns();
+ }
+
+ /**
+ *
* Initialize a Cyber Necko parser configured to return lower case
element's names
*
- * @return
*/
- private SAXParser initParser() {
+ private void initParser() {
parser = new SAXParser();
try {
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
@@ -160,26 +130,59 @@ public class AdvancedSolrHandler extends
} catch (SAXNotSupportedException ex) {
throw new IllegalStateException(ex);
}
- return parser;
}
- @Override
- public void finish() {
+ /**
+ * Initialize patterns.
+ * Transform the selector map to regex rules.
+ */
+ private void initPatterns() {
+ if (selectors != null) {
- }
+ // clear the current patterns
+ patterns.clear();
- @Override
- public void cleanUp(String query) {
+ // pattern for the element and its index
+ final Pattern p =
Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
+
+ // for each selector
+ Set<String> keys = selectors.keySet();
+ for (String key : keys) {
+ // creating a pattern
+ String regex = "^";
+ String selector = selectors.get(key);
+ String[] elements = selector.split("/");
+ // which match all the elements and their respective indices
+ for (String element : elements) {
+ Matcher m = p.matcher(element);
+ if (m.find()) {
+ String elementName = m.group(1);
+ String elementIndex = m.group(3);
+ regex += "/" + elementName;
+ if (elementIndex == null) {
+ regex += "\\[[0-9]*\\]";
+ } else {
+ regex += "\\[" + elementIndex + "\\]";
+ }
+ }
+ }
+ regex += "$";
+ // storing the new Pattern
+ Pattern pattern = Pattern.compile(regex);
+ patterns.put(key, pattern);
+ }
+ }
}
+
/**
- * A class that implements a SAX ContentHandler and uses patterns to
record documents
- * elements in a SolrInputDocuement.
+ * A class that implements a SAX ContentHandler and uses patterns to
+ * extract elements.
*/
- private class SolrContentHandler implements ContentHandler {
+ private class NekoContentHandler implements ContentHandler {
- private SolrInputDocument doc;
+ private T task;
/**
* the patterns which match element's path
@@ -200,66 +203,9 @@ public class AdvancedSolrHandler extends
private int lastLevel = 0;
- /**
- * Constructor
- *
- * @param selectors an Map which contains selectors
- */
- public SolrContentHandler(Map<String, String> selectors) {
- initPatterns(selectors);
- }
-
- /**
- * Initialize patterns
- *
- * @param selectors
- */
- public void initPatterns(Map<String, String> selectors) {
- if (selectors != null) {
-
- // clear the current patterns
- patterns.clear();
-
- // pattern for the element and its index
- final Pattern p =
Pattern.compile("^([a-zA-Z:-_\\.]+)(\\[([0-9]*)\\]){0,1}$");
-
- // for each selector
- Set<String> keys = selectors.keySet();
- for (String key : keys) {
- // creating a pattern
- String regex = "^";
- String selector = selectors.get(key);
- String[] elements = selector.split("/");
- // which match all the elements and their respective
indices
- for (String element : elements) {
- Matcher m = p.matcher(element);
- if (m.find()) {
- String elementName = m.group(1);
- String elementIndex = m.group(3);
- regex += "/" + elementName;
- if (elementIndex == null) {
- regex += "\\[[0-9]*\\]";
- } else {
- regex += "\\[" + elementIndex + "\\]";
- }
- }
- }
- regex += "$";
-
- // storing the new Pattern
- Pattern pattern = Pattern.compile(regex);
- patterns.put(key, pattern);
- }
- }
- }
-
- /**
- * Initialization of the document used for indexation
- *
- * @param doc a solr document
- */
- public void initDocument(SolrInputDocument doc) {
- this.doc = doc;
+ public NekoContentHandler(T task, Map<String, Pattern> patterns) {
+ this.task = task;
+ this.patterns = patterns;
}
/*
@@ -335,7 +281,7 @@ public class AdvancedSolrHandler extends
if (matcher.find()) {
// add the matching content to the solr document.
String value = valueRecorders.remove(patternName);
- doc.addField(patternName, value);
+ task.getParserData().add(patternName, value);
}
}
}
Copied:
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java
(from r1461977,
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java)
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java?p2=incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java&p1=incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java&r1=1461977&r2=1462042&rev=1462042&view=diff
==============================================================================
---
incubator/droids/branches/0.2.x-cleanup/droids-solr/src/test/java/org/apache/droids/solr/AdvancedSolrHandleTest.java
(original)
+++
incubator/droids/branches/0.2.x-cleanup/droids-nekohtml/src/test/java/org/apache/droids/nekohtml/NekoHtmlParserTest.java
Thu Mar 28 11:18:10 2013
@@ -14,35 +14,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.droids.solr;
+package org.apache.droids.nekohtml;
import java.io.ByteArrayInputStream;
-import java.io.IOException;
+import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
-import org.apache.droids.core.ContentEntity;
+import org.apache.droids.core.BasicTask;
import org.apache.droids.core.DroidsException;
import org.apache.droids.core.Task;
-import org.apache.solr.client.solrj.SolrQuery;
-import org.apache.solr.client.solrj.SolrServer;
-import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
-import org.apache.solr.client.solrj.response.QueryResponse;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.core.CoreContainer;
-import org.junit.After;
-import org.junit.Before;
import org.junit.Test;
import static org.junit.Assert.*;
-public class AdvancedSolrHandleTest {
-
- SolrServer solr;
-
- String simpleHtmlPage = "" +
+public class NekoHtmlParserTest {
+ private Task task;
+ private static final String TEST_CONTENT = "" +
"<html>" +
"<body>" +
"<div>" +
@@ -58,65 +47,42 @@ public class AdvancedSolrHandleTest {
"</body>" +
"</html>";
- protected String getSolrHome() {
- return "example";
- }
-
- @Before
- public void setUp() throws Exception {
- CoreContainer.Initializer initializer = new
CoreContainer.Initializer();
- CoreContainer coreContainer = initializer.initialize();
- solr = new EmbeddedSolrServer(coreContainer, "");
- }
-
- @After
- public void tearDown() throws Exception {
- // remove everything....
- solr.deleteByQuery("*:*");
- solr.commit();
+ public NekoHtmlParserTest() {
+ try {
+ task = new BasicTask(new URI("http://localhost/"));
+
+ task.getContentEntity().setContent(new
ByteArrayInputStream(TEST_CONTENT.getBytes("UTF-8")));
+ } catch (URISyntaxException e) {
+ e.printStackTrace();
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ }
}
- public void performSelection(String html, String field, String selector,
String expectedValue) throws IOException, DroidsException, URISyntaxException,
SolrServerException {
- AdvancedSolrHandler handler = new AdvancedSolrHandler(solr);
-
+ public void performSelection(String field, String selector, String
expectedValue) throws URISyntaxException, DroidsException {
HashMap<String, String> selectors = new HashMap<String, String>();
selectors.put(field, selector);
- handler.setSelectors(selectors);
-
- Task task = new SolrTask(new URI("http://localhost/"));
-
- ContentEntity contentEntity = task.getContentEntity();
- contentEntity.setContentType("text/html");
- contentEntity.setCharset("UTF-8");
- contentEntity.setContent(new
ByteArrayInputStream(html.getBytes("UTF-8")));
-
- handler.handle(task);
- solr.commit();
- SolrQuery query = new SolrQuery();
- query.setQuery("*:*");
- query.setFields(field);
- QueryResponse response = solr.query(query);
+ NekoHtmlParser<Task> parser = new NekoHtmlParser<Task>(selectors);
- SolrDocument doc = response.getResults().iterator().next();
- String value = (String) doc.getFieldValue(field);
+ parser.parse(task);
- assertEquals(expectedValue, value);
+ assertEquals(expectedValue, task.getParserData().get(field));
}
@Test
public void testSelectorA() throws Exception {
- performSelection(simpleHtmlPage, "selector",
"/html[0]/body[0]/div[0]/p[0]", "p0");
+ performSelection("selector", "/html[0]/body[0]/div[0]/p[0]", "p0");
}
@Test
public void testSelectorB() throws Exception {
- performSelection(simpleHtmlPage, "selector",
"/html[0]/body[0]/div[1]/p[1]", "p4");
+ performSelection("selector", "/html[0]/body[0]/div[1]/p[1]", "p4");
}
@Test
public void testSelectorC() throws Exception {
- performSelection(simpleHtmlPage, "selector",
"/html[0]/body[0]/div[1]", "p3p4p5");
+ performSelection("selector", "/html[0]/body[0]/div[1]", "p3p4p5");
}
}
\ No newline at end of file