[ 
https://issues.apache.org/jira/browse/NUTCH-2389?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16106551#comment-16106551
 ] 

Hudson commented on NUTCH-2389:
-------------------------------

FAILURE: Integrated in Jenkins build Nutch-nutchgora #1588 (See 
[https://builds.apache.org/job/Nutch-nutchgora/1588/])
NUTCH-2389 jsoup-extractor with parse filter, indexing filter and unit 
(ikaidul: 
[https://github.com/apache/nutch/commit/f41735cb3c96650f6a51f1c5eb87566572bf1679])
* (add) src/plugin/jsoup-extractor/plugin.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (add) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/package-info.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (edit) src/plugin/build.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/package-info.java
* (edit) conf/nutch-default.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/package-info.java
* (add) conf/jsoup-extractor-sample.xml
* (edit) build.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/package-info.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/SimpleStringNormalizer.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/JsoupIndexingFilter.java
* (add) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (add) conf/jsoup-extractor.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/Normalizable.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocument.java
* (add) src/plugin/jsoup-extractor/build.xml
NUTCH-2389 jsoup-extractor/ivy.xml committed (ikaidul: 
[https://github.com/apache/nutch/commit/fe6997f30e4bcffe962da4d09ae73f379c026a76])
* (add) src/plugin/jsoup-extractor/ivy.xml
NUTCH-2389 Unit test implemented but not passed (ikaidul: 
[https://github.com/apache/nutch/commit/17bd8f6e87f4fa4fd35c5aecfa09d8ef3bea6fd7])
* (delete) conf/jsoup-extractor-sample.xml
* (edit) conf/jsoup-extractor.xml
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (edit) src/plugin/jsoup-extractor/plugin.xml
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 package name changed (kaidulislam90: 
[https://github.com/apache/nutch/commit/52e785d6f8ebf6f57150b255df380510f6ebcf6b])
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/JsoupIndexingFilter.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/package-info.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/package-info.java
* (edit) src/plugin/jsoup-extractor/plugin.xml
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/Normalizable.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocument.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/parse/JsoupHtmlParser.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupExtractorConstants.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocument.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/SimpleStringNormalizer.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/indexer/package-info.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/package-info.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/SimpleStringNormalizer.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/package-info.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/normalizer/Normalizable.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/package-info.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/JsoupIndexingFilter.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (delete) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/jsoup/extractor/core/JsoupDocumentReader.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/package-info.java
* (add) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/package-info.java
NUTCH-2389 JsoupDocumentReader parsing bug fixed (kaidulislam90: 
[https://github.com/apache/nutch/commit/82ff292200bf0b00ffa07f7b89de4b483c1e1d14])
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 Unit test passed, xml parsing issue fixed (kaidulislam90: 
[https://github.com/apache/nutch/commit/39ef77771b11fa778687964fa8c45e63387e1c87])
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) conf/jsoup-extractor.xml
NUTCH-2389 IndexingFilter Utf8 conversion issue solved (kaidulislam90: 
[https://github.com/apache/nutch/commit/66cbd7f9ff22a03a67133cf659679159384ceae9])
* (add) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupHtmlParser.java
* (add) conf/jsoup-extractor-example.xml
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/indexer/jsoup/extractor/JsoupIndexingFilter.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) conf/jsoup-extractor.xml
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
* (delete) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/TestJsoupParser.java
NUTCH-2389 Class name corrected in jsoup-extractor.xml (kaidulislam90: 
[https://github.com/apache/nutch/commit/60abbc4c531991db2790973d28e92d0ce5d533cd])
* (edit) conf/jsoup-extractor.xml
NUTCH-2389 Unnecessary header import removed (kaidulislam90: 
[https://github.com/apache/nutch/commit/6e11deef505af371efddfe2a5b76a87fa9dec773])
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/parse/jsoup/extractor/JsoupHtmlParser.java
NUTCH-2389 Missing license information added (kaidulislam90: 
[https://github.com/apache/nutch/commit/1ede445097c4cb9ae20f4336057d6e722f40727e])
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/normalizer/SimpleStringNormalizer.java
* (edit) 
src/plugin/jsoup-extractor/src/test/org/apache/nutch/parse/jsoup/extractor/ViewCountNormalizer.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/package-info.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupExtractorConstants.java
NUTCH-2389 Diamond operator declarations removed to make code Java 1.6 compatible 
(kaidulislam90: 
[https://github.com/apache/nutch/commit/b2130b47d10157b22d7a462dc80e94f7ac0cd2c5])
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocumentReader.java
* (edit) 
src/plugin/jsoup-extractor/src/java/org/apache/nutch/core/jsoup/extractor/JsoupDocument.java


> Precise data parsing using Jsoup CSS selectors
> ----------------------------------------------
>
>                 Key: NUTCH-2389
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2389
>             Project: Nutch
>          Issue Type: New Feature
>          Components: parser
>    Affects Versions: 2.3
>            Reporter: Kaidul Islam
>            Assignee: Kaidul Islam
>             Fix For: 2.4
>
>   Original Estimate: 0.05h
>  Remaining Estimate: 0.05h
>
> As far as I know, Nutch 1.x and 2.x currently have no feature to 
> extract/parse exact content from specific websites. I've developed a plugin, 
> {{parse-jsoup}}, using Jsoup for my current project to extract precise content 
> for site-specific crawling using a detailed XML configuration (field name, 
> CSS selector, attribute, extraction rules, data type, default value, etc.).
> Please let me know if this feature seems relevant and is not currently present 
> in Nutch. I also plan to port it to Nutch 1.x.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to