[
https://issues.apache.org/jira/browse/NUTCH-2389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16106551#comment-16106551
]
Hudson commented on NUTCH-2389:
-------------------------------
FAILURE: Integrated in Jenkins build Nutch-nutchgora #1588 (See
[https://builds.apache.org/job/Nutch-nutchgora/1588/])
NUTCH-2389 jsoup-extractor with parse filter, indexing filter and unit
(ikaidul:
[https://github.com/apache/nutch/commit/f41735cb3c96650f6a51f1c5eb87566572bf1679])
* (add) src/plugin/jsoup-extractor/plugin.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (add)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/package-info.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (edit) src/plugin/build.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/package-info.java
* (edit) conf/nutch-default.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/package-info.java
* (add) conf/jsoup-extractor-sample.xml
* (edit) build.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/package-info.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/SimpleStringNormalizer.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/JsoupIndexingFilter.java
* (add)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (add) conf/jsoup-extractor.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/Normalizable.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocument.java
* (add) src/plugin/jsoup-extractor/build.xml
NUTCH-2389 jsoup-extractor/ivy.xml committed (ikaidul:
[https://github.com/apache/nutch/commit/fe6997f30e4bcffe962da4d09ae73f379c026a76])
* (add) src/plugin/jsoup-extractor/ivy.xml
NUTCH-2389 Unit test implemented but not passed (ikaidul:
[https://github.com/apache/nutch/commit/17bd8f6e87f4fa4fd35c5aecfa09d8ef3bea6fd7])
* (delete) conf/jsoup-extractor-sample.xml
* (edit) conf/jsoup-extractor.xml
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (edit) src/plugin/jsoup-extractor/plugin.xml
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 package name changed (kaidulislam90:
[https://github.com/apache/nutch/commit/52e785d6f8ebf6f57150b255df380510f6ebcf6b])
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/JsoupIndexingFilter.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/package-info.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/package-info.java
* (edit) src/plugin/jsoup-extractor/plugin.xml
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/Normalizable.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocument.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocument.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/SimpleStringNormalizer.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/package-info.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/package-info.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/SimpleStringNormalizer.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/package-info.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/Normalizable.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/package-info.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/JsoupIndexingFilter.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (delete)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/package-info.java
* (add)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/package-info.java
NUTCH-2389 JsoupDocumentReader parsing bug fixed (kaidulislam90:
[https://github.com/apache/nutch/commit/82ff292200bf0b00ffa07f7b89de4b483c1e1d14])
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 Unit test passed, xml parsing issue fixed (kaidulislam90:
[https://github.com/apache/nutch/commit/39ef77771b11fa778687964fa8c45e63387e1c87])
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) conf/jsoup-extractor.xml
NUTCH-2389 IndexingFilter Utf8 conversion issue solved (kaidulislam90:
[https://github.com/apache/nutch/commit/66cbd7f9ff22a03a67133cf659679159384ceae9])
* (add)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
* (add) conf/jsoup-extractor-example.xml
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/JsoupIndexingFilter.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) conf/jsoup-extractor.xml
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
* (delete)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 Class name corrected in jsoup-extractor.xml (kaidulislam90:
[https://github.com/apache/nutch/commit/60abbc4c531991db2790973d28e92d0ce5d533cd])
* (edit) conf/jsoup-extractor.xml
NUTCH-2389 Unnecessary header import removed (kaidulislam90:
[https://github.com/apache/nutch/commit/6e11deef505af371efddfe2a5b76a87fa9dec773])
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
NUTCH-2389 Missing license information added (kaidulislam90:
[https://github.com/apache/nutch/commit/1ede445097c4cb9ae20f4336057d6e722f40727e])
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/SimpleStringNormalizer.java
* (edit)
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/package-info.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
NUTCH-2389 Diamond sign declaration removed and make Java 1.6 compatible
(kaidulislam90:
[https://github.com/apache/nutch/commit/b2130b47d10157b22d7a462dc80e94f7ac0cd2c5])
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit)
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocument.java
> Precise data parsing using Jsoup CSS selectors
> ----------------------------------------------
>
> Key: NUTCH-2389
> URL: https://issues.apache.org/jira/browse/NUTCH-2389
> Project: Nutch
> Issue Type: New Feature
> Components: parser
> Affects Versions: 2.3
> Reporter: Kaidul Islam
> Assignee: Kaidul Islam
> Fix For: 2.4
>
> Original Estimate: 0.05h
> Remaining Estimate: 0.05h
>
> As far as I know, Nutch 1.x and 2.x currently have no features to
> extract/parse exact contents for specific websites. I've developed a plugin
> {{parse-jsoup}} using Jsoup for my current project to extract precise content
> for site-specific crawling using a detailed XML configuration (field name,
> CSS-selector, attribute, extraction rules, data-type, default-value, etc.).
> Please let me know if this feature seems relevant and is not currently present
> in Nutch. I also plan to port it to Nutch 1.x.
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)