Author: tobr
Date: Tue Feb 12 16:29:24 2013
New Revision: 1445240
URL: http://svn.apache.org/r1445240
Log:
added new ParserData API to the task
Added:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java
(with props)
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/Task.java
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/FileNameParser.java
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/SimpleLinkParser.java
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/Task.java
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/Task.java?rev=1445240&r1=1445239&r2=1445240&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/Task.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/core/Task.java Tue Feb 12 16:29:24 2013
@@ -38,13 +38,20 @@ public interface Task extends Serializab
public URI getURI();
/**
- * The data of the task.
+ * The raw data of the task.
*
* @return a Map of data values
*/
public ContentEntity getContentEntity();
/**
+ * The data extracted by the {@link Parser}
+ *
+ * @return the extracted data
+ */
+ public ParserData getParserData();
+
+ /**
* @return The depth of the task
*/
public int getDepth();
@@ -75,4 +82,5 @@ public interface Task extends Serializab
* @return Task
*/
public <T extends Task> T createTask(URI uri);
+
}
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/FileNameParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/FileNameParser.java?rev=1445240&r1=1445239&r2=1445240&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/FileNameParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/FileNameParser.java Tue Feb 12 16:29:24 2013
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.droids.parse;
import org.apache.droids.core.DroidsException;
@@ -7,7 +23,7 @@ import org.apache.droids.core.Task;
import java.io.IOException;
/**
- * Simple Parser implmentation extracting the path component from
+ * Simple Parser implementation extracting the path component from
* the URI of the task.
* For file based walkers, this is file name of the file.
*
@@ -21,6 +37,6 @@ public class FileNameParser implements P
@Override
public void parse(Task task) throws DroidsException, IOException {
String path = task.getURI().getPath();
- task.getContentEntity().put(FILENAME, path.substring(path.lastIndexOf('/') + 1));
+ task.getParserData().set(FILENAME, path.substring(path.lastIndexOf('/') + 1));
}
}
Added:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java?rev=1445240&view=auto
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java (added)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java Tue Feb 12 16:29:24 2013
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.droids.parse;
+
+import org.apache.droids.core.ParserData;
+
+/**
+ *
+ *
+ *
+ */
+public class LinkedParserData extends ParserData {
+ public static final String ANCHOR_TEXT = "anchortext";
+ public static final String ANCHOR_TITLE = "anchortitle";
+
+ public LinkedParserData() {
+ super();
+ }
+
+ public void setAnchorText(String anchorText) {
+ this.set(ANCHOR_TEXT, anchorText);
+ }
+
+ public String getAnchorText() {
+ return this.get(ANCHOR_TEXT);
+ }
+
+ public void setAnchorTitle(String anchorTitle) {
+ this.set(ANCHOR_TITLE, anchorTitle);
+ }
+
+ public String getAnchorTitle() {
+ return this.get(ANCHOR_TITLE);
+ }
+
+}
Propchange:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java
------------------------------------------------------------------------------
svn:keywords = Author Date Id Revision
Propchange:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/LinkedParserData.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/SimpleLinkParser.java
URL:
http://svn.apache.org/viewvc/incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/SimpleLinkParser.java?rev=1445240&r1=1445239&r2=1445240&view=diff
==============================================================================
--- incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/SimpleLinkParser.java (original)
+++ incubator/droids/branches/0.2.x-cleanup/droids-core/src/main/java/org/apache/droids/parse/SimpleLinkParser.java Tue Feb 12 16:29:24 2013
@@ -1,8 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.droids.parse;
import org.apache.droids.core.DroidsException;
+import org.apache.droids.core.LinkedTask;
import org.apache.droids.core.Parser;
-import org.apache.droids.core.Task;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.slf4j.spi.LocationAwareLogger;
import java.io.IOException;
import java.io.InputStream;
@@ -18,21 +37,25 @@ import java.util.regex.Pattern;
*
* @version 1.0
*/
-public class SimpleLinkParser<T extends Task> implements Parser<T> {
+public class SimpleLinkParser implements Parser<LinkedTask> {
+ Logger logger = LoggerFactory.getLogger(SimpleLinkParser.class);
@Override
- public void parse(T task) throws DroidsException, IOException {
+ public void parse(LinkedTask task) throws DroidsException, IOException {
+ logger.info("parse " + task.getURI());
InputStream inStream = task.getContentEntity().getContent();
if (inStream != null) {
Scanner s = new Scanner(inStream).useDelimiter("\\A");
String content = s.hasNext() ? s.next() : "";
Pattern linkPattern =
Pattern.compile("<a[^>]+href=[\"']?([^\"'>]+)[\"']?[^>]*>(.+?)</a>",
Pattern.CASE_INSENSITIVE|Pattern.DOTALL);
Matcher pageMatcher = linkPattern.matcher(content);
- Set<Task> links = new HashSet<Task>();
+ Set<LinkedTask> links = new HashSet<LinkedTask>();
while(pageMatcher.find()){
- links.add(task.createTask(task.getURI().resolve(pageMatcher.group(1))));
+ LinkedTask newTask = task.createTask(task.getURI().resolve(pageMatcher.group(1)));
+ links.add(newTask);
}
- task.getContentEntity().setLinks(links);
+ task.setTo(links);
}
+
}
}