http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/core/src/test/java/org/apache/any23/extractor/html/RDFMergerTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/RDFMergerTest.java b/core/src/test/java/org/apache/any23/extractor/html/RDFMergerTest.java index 3f48ea2..bce8bb1 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/RDFMergerTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/RDFMergerTest.java @@ -58,485 +58,445 @@ import java.util.Map; */ public class RDFMergerTest extends AbstractExtractorTestCase { - private static final DCTerms vDCTERMS = DCTerms.getInstance(); - private static final FOAF vFOAF = FOAF.getInstance(); - private static final Review vREVIEW = Review.getInstance(); - private static final VCard vVCARD = VCard.getInstance(); - - @Override - protected ExtractorFactory<?> getExtractorFactory() { - return null; - } - - @Test - public void testNoMicroformats() throws RepositoryException, ExtractionException, IOException { - extract("/html/html-without-uf.html"); - assertModelEmpty(); - } - - @Test - public void test01XFNFoaf() throws RepositoryException { - assertExtract("/html/mixed/01-xfn-foaf.html", false); - assertModelNotEmpty(); - assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1); - Resource vcard = findExactlyOneBlankSubject(RDF.TYPE, vVCARD.VCard); - RepositoryResult<Statement> statements = getStatements(null, vFOAF.topic, vcard); - - try { - while(statements.hasNext()) { - Statement statement = statements.next(); - Resource person = statement.getSubject(); - Resource blank = findExactlyOneBlankSubject(OWL.SAMEAS, person); - assertContains(blank, RDF.TYPE, vFOAF.Person); - - } - - } finally { - statements.close(); - } - } - - @Test - public void testAbbrTitleEverything() throws ExtractionException, IOException, RepositoryException { - extractHCardAndRelated("/microformats/hcard/23-abbr-title-everything.html"); - - assertContains(vVCARD.fn, "John Doe"); - assertContains(vVCARD.nickname, "JJ"); - - assertContains(vVCARD.given_name, "Jonathan"); - assertContains(vVCARD.additional_name, "John"); - assertContains(vVCARD.family_name, "Doe-Smith"); - assertContains(vVCARD.honorific_suffix, "Medical Doctor"); - assertContains(vVCARD.title, "President"); - assertContains(vVCARD.role, "Chief"); - assertContains(vVCARD.tz, "-0700"); - assertContains(vVCARD.bday, "2006-04-04"); - assertContains(vVCARD.tel, RDFUtils.uri("tel:415.555.1234")); - assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz"); - assertContains(vVCARD.class_, "public"); - assertContains(vVCARD.note, "this is a note"); - assertContains(vVCARD.organization_name, "Intellicorp"); - assertContains(vVCARD.organization_unit, "Intelligence"); - assertContains(RDF.TYPE, vVCARD.Location); - assertContains(vVCARD.geo, (Resource) null); - assertContains(vVCARD.latitude, "37.77"); - assertContains(vVCARD.longitude, "-122.41"); - assertContains(vVCARD.post_office_box, "Box 1234"); - assertContains(vVCARD.extended_address, "Suite 100"); - assertContains(vVCARD.street_address, "123 Fake Street"); - assertContains(vVCARD.locality, "San Francisco"); - assertContains(vVCARD.region, "California"); - assertContains(vVCARD.postal_code, "12345-6789"); - assertContains(vVCARD.country_name, "United States of America"); - assertContains(vVCARD.addressType, "work"); - } - - @Test - public void testAdr() throws ExtractionException, IOException, RepositoryException { - extractHRevAndRelated("/microformats/hcard/22-adr.html"); - - assertStatementsSize(RDF.TYPE, vVCARD.Address, 4); - - Map<String,String[]> addresses = new HashMap<String,String[]>(4); - addresses.put( - "1233 Main St.", - new String[]{ - "United States of America", - "Beverly Hills", - "90210", - "California"}); - addresses.put( - "1232 Main St.", - new String[]{ - "United States of America", - "Beverly Hills", - "90210", - "California"}); - addresses.put( - "1234 Main St.", - new String[]{ - "United States of America", - "Beverly Hills", - "90210", - "California" - }); - addresses.put( - "1231 Main St.", - new String[]{ - "United States of America", - "Beverly Hills", - "90210", - "California"}); - addresses.put( - "Suite 100", - new String[]{ - "United States of America", - "Beverly Hills", - "90210", - "California" - }); - - RepositoryResult<Statement> statements = getStatements(null, RDF.TYPE, vVCARD.Address); - - try { - while (statements.hasNext()) { - Resource adr = statements.next().getSubject(); - RepositoryResult<Statement> innerStatements = getStatements(adr, vVCARD.street_address, null); - try { - while (innerStatements.hasNext()) { - Value innerValue = innerStatements.next().getObject(); - assertContains(adr, vVCARD.country_name, addresses.get(innerValue.stringValue())[0]); - assertContains(adr, vVCARD.locality, addresses.get(innerValue.stringValue())[1]); - assertContains(adr, vVCARD.postal_code, addresses.get(innerValue.stringValue())[2]); - assertContains(adr, vVCARD.region, addresses.get(innerValue.stringValue())[3]); - } - - } finally { - innerStatements.close(); - } - } - - } finally { - statements.close(); - } - - assertContains(vVCARD.post_office_box, "PO Box 1234"); - assertContains(vVCARD.addressType, "home"); - } - - @Test - public void testGeoAbbr() throws ExtractionException, IOException, RepositoryException { - extractHCardAndRelated("/microformats/hcard/25-geo-abbr.html"); - assertModelNotEmpty(); - assertContains(vVCARD.fn, "Paradise"); - assertContains(RDF.TYPE, vVCARD.Organization); - assertContains(vVCARD.organization_name, "Paradise"); - assertContains(RDF.TYPE, vVCARD.Location); - assertContains(vVCARD.geo, (Resource) null); - assertContains(vVCARD.latitude, "30.267991"); - assertContains(vVCARD.longitude, "-97.739568"); - } - - @Test - public void testAncestors() throws ExtractionException, IOException, RepositoryException { - extractHCardAndRelated("/microformats/hcard/26-ancestors.html"); - assertModelNotEmpty(); - - assertContains(vVCARD.fn, "John Doe"); - assertNotContains(null, vVCARD.fn, - "Mister Jonathan John Doe-Smith Medical Doctor"); - assertContains(vVCARD.nickname, "JJ"); - assertContains(RDF.TYPE, vVCARD.Address); - assertContains(vVCARD.tz, "-0700"); - assertContains(vVCARD.title, "President"); - assertContains(vVCARD.role, "Chief"); - assertContains(vVCARD.organization_name, "Intellicorp"); - assertContains(vVCARD.organization_unit, "Intelligence"); - - assertContains(vVCARD.tel, RDFUtils.uri("tel:415.555.1234")); - assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz"); - assertContains(vVCARD.note, "this is a note"); - assertContains(vVCARD.class_, "public"); - - assertContains(RDF.TYPE, vVCARD.Location); - assertContains(vVCARD.geo, (Resource) null); - assertContains(null, vVCARD.latitude, "37.77"); - assertContains(null, vVCARD.longitude, "-122.41"); - - assertContains(RDF.TYPE, vVCARD.Name); - assertContains(vVCARD.additional_name, "John"); - assertContains(vVCARD.given_name, "Jonathan"); - assertContains(vVCARD.family_name, "Doe-Smith"); - assertContains(vVCARD.honorific_prefix, "Mister"); - assertContains(vVCARD.honorific_suffix, "Medical Doctor"); - - assertContains(vVCARD.post_office_box, "Box 1234"); - assertContains(vVCARD.extended_address, "Suite 100"); - assertContains(vVCARD.street_address, "123 Fake Street"); - assertContains(vVCARD.locality, "San Francisco"); - assertContains(vVCARD.region, "California"); - assertContains(vVCARD.postal_code, "12345-6789"); - assertContains(vVCARD.country_name, "United States of America"); - assertContains(vVCARD.addressType, "work"); - } - - @Test - public void testSingleton() throws ExtractionException, IOException, RepositoryException { - extractHCardAndRelated("/microformats/hcard/37-singleton.html"); - assertModelNotEmpty(); - assertStatementsSize(vVCARD.fn, (Value) null, 1); - assertContains(vVCARD.fn, "john doe 1"); - assertStatementsSize(RDF.TYPE, vVCARD.Name, 1); - assertStatementsSize(vVCARD.given_name, (Value) null, 1); - assertContains(vVCARD.given_name, "john"); - assertStatementsSize(vVCARD.family_name, (Value) null, 1); - assertContains(vVCARD.family_name, "doe"); - assertStatementsSize(vVCARD.sort_string, (Value) null, 1); - assertContains(vVCARD.sort_string, "d"); - assertStatementsSize(vVCARD.bday, (Value) null, 1); - assertContains(vVCARD.bday, "20060707"); - assertStatementsSize(vVCARD.rev, (Value) null, 1); - assertContains(vVCARD.rev, "20060707"); - assertStatementsSize(vVCARD.class_, (Value) null, 1); - assertContains(vVCARD.class_, "public"); - assertStatementsSize(vVCARD.tz, (Value) null, 1); - assertContains(vVCARD.tz, "+0600"); - // 2 uf, one of them outside the card - assertStatementsSize(RDF.TYPE, vVCARD.Location, 2); - // one is actually used - assertStatementsSize(vVCARD.geo, (Value) null, 2); - assertContains(vVCARD.latitude, "123.45"); - assertContains(vVCARD.longitude, "67.89"); - assertStatementsSize(vVCARD.uid, (Value) null, 1); - assertContains(vVCARD.uid, "unique-id-1"); - } - - @Test - public void test01Basic() throws ExtractionException, IOException, RepositoryException { - extractHRevAndRelated("/microformats/hreview/01-spec.html"); - assertModelNotEmpty(); - - assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); - // reviewer, item - assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2); - // there is one address in the item vcard - assertStatementsSize(RDF.TYPE, vVCARD.Address, 1); - - RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review); - - try { - while(reviews.hasNext()) { - Resource review = reviews.next().getSubject(); - assertContains(review, vREVIEW.rating, "5"); - assertContains(review, vREVIEW.title, "Crepes on Cole is awesome"); - assertContains(review, vDCTERMS.date, "20050418T2300-0700"); - assertContains( - vREVIEW.text, - "Crepes on Cole is one of the best little \n" - + " creperies in San Francisco.\n " - + "Excellent food and service. Plenty of tables in a variety of sizes\n" - + " for parties large and small. " - + "Window seating makes for excellent\n " - + "people watching to/from the N-Judah which stops right outside.\n" - + " I've had many fun social gatherings here, as well as gotten\n" - + " plenty of work done thanks to neighborhood WiFi."); - - assertContains(null, vREVIEW.hasReview, review); - } - } - finally { - reviews.close(); - } - - // generic checks that vcards are correct, improve - assertContains(vVCARD.fn, "Crepes on Cole"); - assertContains(vVCARD.fn, "Tantek"); - assertContains(vVCARD.locality, "San Francisco"); - assertContains(vVCARD.organization_name, "Crepes on Cole"); - - } - - @Test - public void test02RatedTags() throws ExtractionException, IOException, RepositoryException { - extractHRevAndRelated("/microformats/hreview/02-spec-2.html"); - - assertStatementsSize(vREVIEW.reviewer, (Value) null, 1); - assertStatementsSize(vREVIEW.hasReview, (Value) null, 1); - assertModelNotEmpty(); - assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); - // reviewer, item - assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2); - // there is one address in the item vcard - assertStatementsSize(RDF.TYPE, vVCARD.Address, 1); - - RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review); - - try { - while (reviews.hasNext()) { - Resource review = reviews.next().getSubject(); - assertContains(review, vREVIEW.rating, "18"); - assertContains(review, vREVIEW.title, "Cafe Borrone"); - assertContains(review, vDCTERMS.date, "20050428T2130-0700"); - assertContains(null, vREVIEW.hasReview, review); - assertContains(vREVIEW.type, "business"); - } - - } finally { - reviews.close(); - } - - // generic checks that vcards are correct, improve - assertContains(vVCARD.fn, "Cafe Borrone"); - assertContains(vVCARD.fn, "anonymous"); - assertContains(vVCARD.organization_name, "Cafe Borrone"); - - } - - @Test - public void test03NoHcardForItem() throws ExtractionException, IOException, RepositoryException { - extractHRevAndRelated("/microformats/hreview/03-spec-3.html"); - - assertModelNotEmpty(); - assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); - assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1); - - RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review); - - try { - while (reviews.hasNext()) { - Resource review = reviews.next().getSubject(); - assertContains(review, vREVIEW.rating, "5"); - assertNotContains(vREVIEW.title, null); - assertContains(review, vDCTERMS.date, "200502"); - - assertContains( - vREVIEW.text, - "\"The people thought they were just being rewarded for " + - "treating others\n as they like to be treated, for " + - "obeying stop signs and curing diseases,\n for mailing " + - "letters with the address of the sender... Don't wake me,\n " + - " I plan on sleeping in...\"\n \n \"Nothing Better\"" + - " is a great track on this album, too..."); - - RepositoryResult<Statement> whatHasAReview = getStatements(null, vREVIEW.hasReview, review); - - try { - while(whatHasAReview.hasNext()) { - Resource subject = whatHasAReview.next().getSubject(); - assertContains(subject, vVCARD.fn, "The Postal Service: Give Up"); - assertContains( - subject, - vVCARD.url, - RDFUtils.uri("http://www.amazon.com/exec/obidos/ASIN/B000089CJI/") - ); - assertContains( - subject, - vVCARD.photo, - RDFUtils.uri("http://images.amazon.com/images/P/B000089CJI.01._SCTHUMBZZZ_.jpg") - ); - } - - } finally { - whatHasAReview.close(); - } - - } - - } finally { - reviews.close(); - } - - assertContains(vVCARD.fn, "Adam Rifkin"); - assertContains(vVCARD.url, RDFUtils.uri("http://ifindkarma.com/blog/")); - } - - @Override - protected void extract(String filename) throws ExtractionException, IOException { - - InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename)); - - Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM(); - HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor(); - ExtractionContext hcExtractionContext = new ExtractionContext( - hCardExtractor.getDescription().getExtractorName(), - baseURI - ); - hCardExtractor.run( - ExtractionParameters.newDefault(), - hcExtractionContext, - document, - new ExtractionResultImpl( - hcExtractionContext, - hCardExtractor, - new RepositoryWriter(getConnection()) - ) - ); - XFNExtractor xfnExtractor = new XFNExtractorFactory().createExtractor(); - ExtractionContext xfnExtractionContext = new ExtractionContext( - xfnExtractor.getDescription().getExtractorName(), - baseURI - ); - xfnExtractor.run( - ExtractionParameters.newDefault(), - xfnExtractionContext, - document, - new ExtractionResultImpl( - xfnExtractionContext, - hCardExtractor, - new RepositoryWriter(getConnection()) - ) - ); - } - - private void extractHCardAndRelated(String filename) throws IOException, ExtractionException { - - InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename)); - - Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM(); - HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor(); - ExtractionContext hCardExtractionContext = new ExtractionContext( - hCardExtractor.getDescription().getExtractorName(), baseURI - ); - hCardExtractor.run( - ExtractionParameters.newDefault(), - hCardExtractionContext, - document, - new ExtractionResultImpl( - hCardExtractionContext, - hCardExtractor, new RepositoryWriter(getConnection()) - ) - ); - - GeoExtractor geoExtractor = new GeoExtractorFactory().createExtractor(); - ExtractionContext geoExtractionContext = new ExtractionContext( - geoExtractor.getDescription().getExtractorName(), baseURI - ); - geoExtractor.run( - ExtractionParameters.newDefault(), - geoExtractionContext, - document, - new ExtractionResultImpl( - geoExtractionContext, - geoExtractor, - new RepositoryWriter(getConnection()) - ) - ); - - AdrExtractor adrExtractor = new AdrExtractorFactory().createExtractor(); - ExtractionContext adrExtractionContext = new ExtractionContext( - adrExtractor.getDescription().getExtractorName(), baseURI - ); - adrExtractor.run( - ExtractionParameters.newDefault(), - adrExtractionContext, - document, - new ExtractionResultImpl( - adrExtractionContext, - adrExtractor, - new RepositoryWriter(getConnection()) - ) - ); - - } - - private void extractHRevAndRelated(String filename) throws ExtractionException, IOException { - extractHCardAndRelated(filename); - InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename)); - Document document = new TagSoupParser(input, baseURI.stringValue()).getDOM(); - HReviewExtractor hReviewExtractor = new HReviewExtractorFactory().createExtractor(); - ExtractionContext hreviewExtractionContext = new ExtractionContext( - hReviewExtractor.getDescription().getExtractorName(), baseURI - ); - hReviewExtractor.run( - ExtractionParameters.newDefault(), - hreviewExtractionContext, - document, - new ExtractionResultImpl( - hreviewExtractionContext, - hReviewExtractor, - new RepositoryWriter(getConnection()) - ) - ); - } + private static final DCTerms vDCTERMS = DCTerms.getInstance(); + private static final FOAF vFOAF = FOAF.getInstance(); + private static final Review vREVIEW = Review.getInstance(); + private static final VCard vVCARD = VCard.getInstance(); + + @Override + protected ExtractorFactory<?> getExtractorFactory() { + return null; + } + + @Test + public void testNoMicroformats() throws Exception, ExtractionException, + IOException { + extract("/html/html-without-uf.html"); + assertModelEmpty(); + } + + @Test + public void test01XFNFoaf() throws Exception { + assertExtract("/html/mixed/01-xfn-foaf.html", false); + assertModelNotEmpty(); + assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1); + Resource vcard = findExactlyOneBlankSubject(RDF.TYPE, vVCARD.VCard); + RepositoryResult<Statement> statements = getStatements(null, + vFOAF.topic, vcard); + + try { + while (statements.hasNext()) { + Statement statement = statements.next(); + Resource person = statement.getSubject(); + Resource blank = findExactlyOneBlankSubject(OWL.SAMEAS, person); + assertContains(blank, RDF.TYPE, vFOAF.Person); + + } + + } finally { + statements.close(); + } + } + + @Test + public void testAbbrTitleEverything() throws ExtractionException, + IOException, RepositoryException { + extractHCardAndRelated("/microformats/hcard/23-abbr-title-everything.html"); + + assertContains(vVCARD.fn, "John Doe"); + assertContains(vVCARD.nickname, "JJ"); + + assertContains(vVCARD.given_name, "Jonathan"); + assertContains(vVCARD.additional_name, "John"); + assertContains(vVCARD.family_name, "Doe-Smith"); + assertContains(vVCARD.honorific_suffix, "Medical Doctor"); + assertContains(vVCARD.title, "President"); + assertContains(vVCARD.role, "Chief"); + assertContains(vVCARD.tz, "-0700"); + assertContains(vVCARD.bday, "2006-04-04"); + assertContains(vVCARD.tel, RDFUtils.uri("tel:415.555.1234")); + assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz"); + assertContains(vVCARD.class_, "public"); + assertContains(vVCARD.note, "this is a note"); + assertContains(vVCARD.organization_name, "Intellicorp"); + assertContains(vVCARD.organization_unit, "Intelligence"); + assertContains(RDF.TYPE, vVCARD.Location); + assertContains(vVCARD.geo, (Resource) null); + assertContains(vVCARD.latitude, "37.77"); + assertContains(vVCARD.longitude, "-122.41"); + assertContains(vVCARD.post_office_box, "Box 1234"); + assertContains(vVCARD.extended_address, "Suite 100"); + assertContains(vVCARD.street_address, "123 Fake Street"); + assertContains(vVCARD.locality, "San Francisco"); + assertContains(vVCARD.region, "California"); + assertContains(vVCARD.postal_code, "12345-6789"); + assertContains(vVCARD.country_name, "United States of America"); + assertContains(vVCARD.addressType, "work"); + } + + @Test + public void testAdr() throws Exception { + extractHRevAndRelated("/microformats/hcard/22-adr.html"); + + assertStatementsSize(RDF.TYPE, vVCARD.Address, 4); + + Map<String, String[]> addresses = new HashMap<String, String[]>(4); + addresses.put("1233 Main St.", new String[] { + "United States of America", "Beverly Hills", "90210", + "California" }); + addresses.put("1232 Main St.", new String[] { + "United States of America", "Beverly Hills", "90210", + "California" }); + addresses.put("1234 Main St.", new String[] { + "United States of America", "Beverly Hills", "90210", + "California" }); + addresses.put("1231 Main St.", new String[] { + "United States of America", "Beverly Hills", "90210", + "California" }); + addresses.put("Suite 100", new String[] { "United States of America", + "Beverly Hills", "90210", "California" }); + + RepositoryResult<Statement> statements = getStatements(null, RDF.TYPE, + vVCARD.Address); + + try { + while (statements.hasNext()) { + Resource adr = statements.next().getSubject(); + RepositoryResult<Statement> innerStatements = getStatements( + adr, vVCARD.street_address, null); + try { + while (innerStatements.hasNext()) { + Value innerValue = innerStatements.next().getObject(); + assertContains(adr, vVCARD.country_name, + addresses.get(innerValue.stringValue())[0]); + assertContains(adr, vVCARD.locality, + addresses.get(innerValue.stringValue())[1]); + assertContains(adr, vVCARD.postal_code, + addresses.get(innerValue.stringValue())[2]); + assertContains(adr, vVCARD.region, + addresses.get(innerValue.stringValue())[3]); + } + + } finally { + innerStatements.close(); + } + } + + } finally { + statements.close(); + } + + assertContains(vVCARD.post_office_box, "PO Box 1234"); + assertContains(vVCARD.addressType, "home"); + } + + @Test + public void testGeoAbbr() throws ExtractionException, IOException, + RepositoryException { + extractHCardAndRelated("/microformats/hcard/25-geo-abbr.html"); + assertModelNotEmpty(); + assertContains(vVCARD.fn, "Paradise"); + assertContains(RDF.TYPE, vVCARD.Organization); + assertContains(vVCARD.organization_name, "Paradise"); + assertContains(RDF.TYPE, vVCARD.Location); + assertContains(vVCARD.geo, (Resource) null); + assertContains(vVCARD.latitude, "30.267991"); + assertContains(vVCARD.longitude, "-97.739568"); + } + + @Test + public void testAncestors() throws ExtractionException, IOException, + RepositoryException { + extractHCardAndRelated("/microformats/hcard/26-ancestors.html"); + assertModelNotEmpty(); + + assertContains(vVCARD.fn, "John Doe"); + assertNotContains(null, vVCARD.fn, + "Mister Jonathan John Doe-Smith Medical Doctor"); + assertContains(vVCARD.nickname, "JJ"); + assertContains(RDF.TYPE, vVCARD.Address); + assertContains(vVCARD.tz, "-0700"); + assertContains(vVCARD.title, "President"); + assertContains(vVCARD.role, "Chief"); + assertContains(vVCARD.organization_name, "Intellicorp"); + assertContains(vVCARD.organization_unit, "Intelligence"); + + assertContains(vVCARD.tel, RDFUtils.uri("tel:415.555.1234")); + assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz"); + assertContains(vVCARD.note, "this is a note"); + assertContains(vVCARD.class_, "public"); + + assertContains(RDF.TYPE, vVCARD.Location); + assertContains(vVCARD.geo, (Resource) null); + assertContains(null, vVCARD.latitude, "37.77"); + assertContains(null, vVCARD.longitude, "-122.41"); + + assertContains(RDF.TYPE, vVCARD.Name); + assertContains(vVCARD.additional_name, "John"); + assertContains(vVCARD.given_name, "Jonathan"); + assertContains(vVCARD.family_name, "Doe-Smith"); + assertContains(vVCARD.honorific_prefix, "Mister"); + assertContains(vVCARD.honorific_suffix, "Medical Doctor"); + + assertContains(vVCARD.post_office_box, "Box 1234"); + assertContains(vVCARD.extended_address, "Suite 100"); + assertContains(vVCARD.street_address, "123 Fake Street"); + assertContains(vVCARD.locality, "San Francisco"); + assertContains(vVCARD.region, "California"); + assertContains(vVCARD.postal_code, "12345-6789"); + assertContains(vVCARD.country_name, "United States of America"); + assertContains(vVCARD.addressType, "work"); + } + + @Test + public void testSingleton() throws Exception { + extractHCardAndRelated("/microformats/hcard/37-singleton.html"); + assertModelNotEmpty(); + assertStatementsSize(vVCARD.fn, (Value) null, 1); + assertContains(vVCARD.fn, "john doe 1"); + assertStatementsSize(RDF.TYPE, vVCARD.Name, 1); + assertStatementsSize(vVCARD.given_name, (Value) null, 1); + assertContains(vVCARD.given_name, "john"); + assertStatementsSize(vVCARD.family_name, (Value) null, 1); + assertContains(vVCARD.family_name, "doe"); + assertStatementsSize(vVCARD.sort_string, (Value) null, 1); + assertContains(vVCARD.sort_string, "d"); + assertStatementsSize(vVCARD.bday, (Value) null, 1); + assertContains(vVCARD.bday, "20060707"); + assertStatementsSize(vVCARD.rev, (Value) null, 1); + assertContains(vVCARD.rev, "20060707"); + assertStatementsSize(vVCARD.class_, (Value) null, 1); + assertContains(vVCARD.class_, "public"); + assertStatementsSize(vVCARD.tz, (Value) null, 1); + assertContains(vVCARD.tz, "+0600"); + // 2 uf, one of them outside the card + assertStatementsSize(RDF.TYPE, vVCARD.Location, 2); + // one is actually used + assertStatementsSize(vVCARD.geo, (Value) null, 2); + assertContains(vVCARD.latitude, "123.45"); + assertContains(vVCARD.longitude, "67.89"); + assertStatementsSize(vVCARD.uid, (Value) null, 1); + assertContains(vVCARD.uid, "unique-id-1"); + } + + @Test + public void test01Basic() throws Exception { + extractHRevAndRelated("/microformats/hreview/01-spec.html"); + assertModelNotEmpty(); + + assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); + // reviewer, item + assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2); + // there is one address in the item vcard + assertStatementsSize(RDF.TYPE, vVCARD.Address, 1); + + RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, + vREVIEW.Review); + + try { + while (reviews.hasNext()) { + Resource review = reviews.next().getSubject(); + assertContains(review, vREVIEW.rating, "5"); + assertContains(review, vREVIEW.title, + "Crepes on Cole is awesome"); + assertContains(review, vDCTERMS.date, "20050418T2300-0700"); + assertContains( + vREVIEW.text, + "Crepes on Cole is one of the best little \n" + + " creperies in San Francisco.\n " + + "Excellent food and service. Plenty of tables in a variety of sizes\n" + + " for parties large and small. " + + "Window seating makes for excellent\n " + + "people watching to/from the N-Judah which stops right outside.\n" + + " I've had many fun social gatherings here, as well as gotten\n" + + " plenty of work done thanks to neighborhood WiFi."); + + assertContains(null, vREVIEW.hasReview, review); + } + } finally { + reviews.close(); + } + + // generic checks that vcards are correct, improve + assertContains(vVCARD.fn, "Crepes on Cole"); + assertContains(vVCARD.fn, "Tantek"); + assertContains(vVCARD.locality, "San Francisco"); + assertContains(vVCARD.organization_name, "Crepes on Cole"); + + } + + @Test + public void test02RatedTags() throws Exception { + extractHRevAndRelated("/microformats/hreview/02-spec-2.html"); + + assertStatementsSize(vREVIEW.reviewer, (Value) null, 1); + assertStatementsSize(vREVIEW.hasReview, (Value) null, 1); + assertModelNotEmpty(); + assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); + // reviewer, item + assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2); + // there is one address in the item vcard + assertStatementsSize(RDF.TYPE, vVCARD.Address, 1); + + RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, + vREVIEW.Review); + + try { + while (reviews.hasNext()) { + Resource review = reviews.next().getSubject(); + assertContains(review, vREVIEW.rating, "18"); + assertContains(review, vREVIEW.title, "Cafe Borrone"); + assertContains(review, vDCTERMS.date, "20050428T2130-0700"); + assertContains(null, vREVIEW.hasReview, review); + assertContains(vREVIEW.type, "business"); + } + + } finally { + reviews.close(); + } + + // generic checks that vcards are correct, improve + assertContains(vVCARD.fn, "Cafe Borrone"); + assertContains(vVCARD.fn, "anonymous"); + assertContains(vVCARD.organization_name, "Cafe Borrone"); + + } + + @Test + public void test03NoHcardForItem() throws Exception { + extractHRevAndRelated("/microformats/hreview/03-spec-3.html"); + + assertModelNotEmpty(); + assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1); + assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1); + + RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, + vREVIEW.Review); + + try { + while (reviews.hasNext()) { + Resource review = reviews.next().getSubject(); + assertContains(review, vREVIEW.rating, "5"); + assertNotContains(vREVIEW.title, null); + assertContains(review, vDCTERMS.date, "200502"); + + assertContains( + vREVIEW.text, + "\"The people thought they were just being rewarded for " + + "treating others\n as they like to be treated, for " + + "obeying stop signs and curing diseases,\n for mailing " + + "letters with the address of the sender... Don't wake me,\n " + + " I plan on sleeping in...\"\n \n \"Nothing Better\"" + + " is a great track on this album, too..."); + + RepositoryResult<Statement> whatHasAReview = getStatements( + null, vREVIEW.hasReview, review); + + try { + while (whatHasAReview.hasNext()) { + Resource subject = whatHasAReview.next().getSubject(); + assertContains(subject, vVCARD.fn, + "The Postal Service: Give Up"); + assertContains( + subject, + vVCARD.url, + RDFUtils.uri("http://www.amazon.com/exec/obidos/ASIN/B000089CJI/")); + assertContains( + subject, + vVCARD.photo, + RDFUtils.uri("http://images.amazon.com/images/P/B000089CJI.01._SCTHUMBZZZ_.jpg")); + } + + } finally { + whatHasAReview.close(); + } + + } + + } finally { + reviews.close(); + } + + assertContains(vVCARD.fn, "Adam Rifkin"); + assertContains(vVCARD.url, RDFUtils.uri("http://ifindkarma.com/blog/")); + } + + @Override + protected void extract(String filename) throws ExtractionException, + IOException { + + InputStream input = new BufferedInputStream(this.getClass() + .getResourceAsStream(filename)); + + Document document = new TagSoupParser(input, baseURI.stringValue()) + .getDOM(); + HCardExtractor hCardExtractor = new HCardExtractorFactory() + .createExtractor(); + ExtractionContext hcExtractionContext = new ExtractionContext( + hCardExtractor.getDescription().getExtractorName(), baseURI); + hCardExtractor.run(ExtractionParameters.newDefault(), + hcExtractionContext, document, new ExtractionResultImpl( + hcExtractionContext, hCardExtractor, + new RepositoryWriter(getConnection()))); + XFNExtractor xfnExtractor = new XFNExtractorFactory().createExtractor(); + ExtractionContext xfnExtractionContext = new ExtractionContext( + xfnExtractor.getDescription().getExtractorName(), baseURI); + xfnExtractor.run(ExtractionParameters.newDefault(), + xfnExtractionContext, document, new ExtractionResultImpl( + xfnExtractionContext, hCardExtractor, + new RepositoryWriter(getConnection()))); + } + + private void extractHCardAndRelated(String filename) throws IOException, + ExtractionException { + + InputStream input = new BufferedInputStream(this.getClass() + .getResourceAsStream(filename)); + + Document document = new TagSoupParser(input, baseURI.stringValue()) + .getDOM(); + HCardExtractor hCardExtractor = new HCardExtractorFactory() + .createExtractor(); + ExtractionContext hCardExtractionContext = new ExtractionContext( + hCardExtractor.getDescription().getExtractorName(), baseURI); + hCardExtractor.run(ExtractionParameters.newDefault(), + hCardExtractionContext, document, new ExtractionResultImpl( + hCardExtractionContext, hCardExtractor, + new RepositoryWriter(getConnection()))); + + GeoExtractor geoExtractor = new GeoExtractorFactory().createExtractor(); + ExtractionContext geoExtractionContext = new ExtractionContext( + geoExtractor.getDescription().getExtractorName(), baseURI); + geoExtractor.run(ExtractionParameters.newDefault(), + geoExtractionContext, document, new ExtractionResultImpl( + geoExtractionContext, geoExtractor, + new RepositoryWriter(getConnection()))); + + AdrExtractor adrExtractor = new AdrExtractorFactory().createExtractor(); + ExtractionContext adrExtractionContext = new ExtractionContext( + adrExtractor.getDescription().getExtractorName(), baseURI); + adrExtractor.run(ExtractionParameters.newDefault(), + adrExtractionContext, document, new ExtractionResultImpl( + adrExtractionContext, adrExtractor, + new RepositoryWriter(getConnection()))); + + } + + private void extractHRevAndRelated(String filename) + throws ExtractionException, IOException { + extractHCardAndRelated(filename); + InputStream input = new BufferedInputStream(this.getClass() + .getResourceAsStream(filename)); + Document document = new TagSoupParser(input, baseURI.stringValue()) + .getDOM(); + HReviewExtractor hReviewExtractor = new HReviewExtractorFactory() + .createExtractor(); + ExtractionContext hreviewExtractionContext = new ExtractionContext( + hReviewExtractor.getDescription().getExtractorName(), baseURI); + hReviewExtractor.run(ExtractionParameters.newDefault(), + hreviewExtractionContext, document, new ExtractionResultImpl( + hreviewExtractionContext, hReviewExtractor, + new RepositoryWriter(getConnection()))); + } }
http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/core/src/test/java/org/apache/any23/extractor/html/SpeciesExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/SpeciesExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/SpeciesExtractorTest.java index 5d7fd14..0334bda 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/SpeciesExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/SpeciesExtractorTest.java @@ -47,7 +47,7 @@ public class SpeciesExtractorTest extends AbstractExtractorTestCase { * @throws RepositoryException */ @Test - public void testSpeciesMicroformatExtractOverTaxoBox() throws RepositoryException { + public void testSpeciesMicroformatExtractOverTaxoBox() throws Exception { assertExtract("/microformats/species/species-example-2.html"); assertModelNotEmpty(); logger.debug(dumpModelToRDFXML()); http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/core/src/test/java/org/apache/any23/extractor/html/TurtleHTMLExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/TurtleHTMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/TurtleHTMLExtractorTest.java index 3afeb82..7984458 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/TurtleHTMLExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/TurtleHTMLExtractorTest.java @@ -49,7 +49,7 @@ public class TurtleHTMLExtractorTest extends AbstractExtractorTestCase { * @throws RepositoryException */ @Test - public void testExtraction() throws IOException, ExtractionException, RepositoryException { + public void testExtraction() throws Exception { assertExtract("/html/html-turtle.html"); logger.debug( dumpModelToRDFXML() ); assertStatementsSize( null, (Value) null, 10); http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java index 759910e..e89ae7d 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java @@ -63,14 +63,6 @@ public class JSONLDExtractorTest { extract(uri, "/org/apache/any23/extractor/rdf/place-example.jsonld"); } - @Ignore("Need to verify if jsonld-java-sesame can extract from HTML") - @Test - public void testExtractFromHTMLDocument() - throws IOException, ExtractionException, TripleHandlerException { - final URI uri = RDFUtils.uri("http://host.com/embedded_json-ld.html"); - extract(uri, "/org/apache/any23/extractor/rdf/embedded_json-ld.html"); - } - public void extract(URI uri, String filePath) throws IOException, ExtractionException, TripleHandlerException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java index 7fdfd8a..6adb31c 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/AbstractRDFaExtractorTestCase.java @@ -31,143 +31,130 @@ import org.slf4j.LoggerFactory; * * @author Michele Mostarda ([email protected]) */ -public abstract class AbstractRDFaExtractorTestCase extends AbstractExtractorTestCase { +public abstract class AbstractRDFaExtractorTestCase extends + AbstractExtractorTestCase { - protected static final DCTerms vDCTERMS = DCTerms.getInstance(); - protected static final FOAF vFOAF = FOAF.getInstance(); + protected static final DCTerms vDCTERMS = DCTerms.getInstance(); + protected static final FOAF vFOAF = FOAF.getInstance(); - Logger logger = LoggerFactory.getLogger(RDFaExtractorTest.class); + Logger logger = LoggerFactory.getLogger(RDFaExtractorTest.class); - /** - * Verify the basic RDFa support. - * - * @throws org.openrdf.repository.RepositoryException - */ - @Test - public void testBasic() throws RepositoryException { - assertExtract("/html/rdfa/basic.html"); - System.out.println( dumpModelToNQuads() ); - assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en") ); - assertContains(null, vDCTERMS.title , RDFUtils.literal("The trouble with Bob", "en") ); - assertContains(null, RDFUtils.uri("http://fake.org/prop"), RDFUtils.literal("Mary", "en") ); - } + /** + * Verify the basic RDFa support. + * + * @throws org.openrdf.repository.RepositoryException + */ + @Test + public void testBasic() throws Exception { + assertExtract("/html/rdfa/basic.html"); + System.out.println(dumpModelToNQuads()); + assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en")); + assertContains(null, vDCTERMS.title, + RDFUtils.literal("The trouble with Bob", "en")); + assertContains(null, RDFUtils.uri("http://fake.org/prop"), + RDFUtils.literal("Mary", "en")); + } - /** - * This test check if the - * <a href=""http://www.w3.org/TR/2010/WD-rdfa-core-20100422/#s_curieprocessing">RDFa1.1 CURIEs</a> - * expansion is correct and backward compatible with - * <a href="http://www.w3.org/TR/rdfa-syntax/#s_curieprocessing">RDFa 1.0</a>. - * - * @throws RepositoryException - */ - @Test - public void testRDFa11CURIEs() throws RepositoryException { - assertExtract("/html/rdfa/rdfa-11-curies.html"); - assertModelNotEmpty(); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - RDFUtils.uri("http://dbpedia.org/name"), - RDFUtils.literal("Albert Einstein") - ); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - RDFUtils.uri("http://dbpedia.org/knows"), - RDFUtils.uri("http://dbpedia.org/resource/Franklin_Roosevlet") - ); - assertContains( - RDFUtils.uri("http://database.org/table/Departments"), - RDFUtils.uri("http://database.org/description"), - RDFUtils.literal("Tables listing departments") - ); - assertContains( - RDFUtils.uri("http://database.org/table/Departments"), - RDFUtils.uri("http://database.org/owner"), - RDFUtils.uri("http://database.org/people/Davide_Palmisano") - ); - assertContains( - RDFUtils.uri("http://database.org/table/Departments"), - RDFUtils.uri("http://xmlns.com/foaf/0.1/author"), - RDFUtils.uri("http://database.org/people/Davide_Palmisano") - ); - assertContains( - RDFUtils.uri("http://database.org/table/Departments"), - RDFUtils.uri("http://purl.org/dc/01/name"), - RDFUtils.literal("Departments") - ); - assertStatementsSize(null, null, null, 6); - logger.debug(dumpHumanReadableTriples()); - } + /** + * This test check if the <a + * href=""http://www.w3.org/TR/2010/WD-rdfa-core-20100422 + * /#s_curieprocessing">RDFa1.1 CURIEs</a> expansion is correct and backward + * compatible with <a + * href="http://www.w3.org/TR/rdfa-syntax/#s_curieprocessing">RDFa 1.0</a>. + * + * @throws Exception + */ + @Test + public void testRDFa11CURIEs() throws Exception { + assertExtract("/html/rdfa/rdfa-11-curies.html"); + assertModelNotEmpty(); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + RDFUtils.uri("http://dbpedia.org/name"), + RDFUtils.literal("Albert Einstein")); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + RDFUtils.uri("http://dbpedia.org/knows"), + RDFUtils.uri("http://dbpedia.org/resource/Franklin_Roosevlet")); + assertContains(RDFUtils.uri("http://database.org/table/Departments"), + RDFUtils.uri("http://database.org/description"), + RDFUtils.literal("Tables listing departments")); + assertContains(RDFUtils.uri("http://database.org/table/Departments"), + RDFUtils.uri("http://database.org/owner"), + RDFUtils.uri("http://database.org/people/Davide_Palmisano")); + assertContains(RDFUtils.uri("http://database.org/table/Departments"), + RDFUtils.uri("http://xmlns.com/foaf/0.1/author"), + RDFUtils.uri("http://database.org/people/Davide_Palmisano")); + assertContains(RDFUtils.uri("http://database.org/table/Departments"), + RDFUtils.uri("http://purl.org/dc/01/name"), + RDFUtils.literal("Departments")); + assertStatementsSize(null, null, null, 6); + logger.debug(dumpHumanReadableTriples()); + } - /** - * This test checks if the subject of a property modeled as <i>RDFa</i> in a <i>XHTML</i> document - * where the subject contains inner <i>XML</i> tags is represented as a plain <i>Literal</i> stripping all - * the inner tags. - * For details see the <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa in XHTML: Syntax and Processing</a> - * recommendation. - * - * @throws RepositoryException - */ - @Test - public void testEmptyDatatypeDeclarationWithInnerXMLTags() throws RepositoryException { - assertExtract("/html/rdfa/null-datatype-test.html"); - logger.debug(dumpModelToRDFXML()); + /** + * This test checks if the subject of a property modeled as <i>RDFa</i> in a + * <i>XHTML</i> document where the subject contains inner <i>XML</i> tags is + * represented as a plain <i>Literal</i> stripping all the inner tags. For + * details see the <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa in + * XHTML: Syntax and Processing</a> recommendation. + * + * @throws Exception + */ + @Test + public void testEmptyDatatypeDeclarationWithInnerXMLTags() throws Exception { + assertExtract("/html/rdfa/null-datatype-test.html"); + logger.debug(dumpModelToRDFXML()); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - vFOAF.name, - RDFUtils.literal("Albert Einstein", "en") - ); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + vFOAF.name, RDFUtils.literal("Albert Einstein", "en")); - } + } - /** - * This test checks if the <i>RDF</i> extraction is compliant to the - * <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa in XHTML: Syntax and Processing</a> specification against the - * <a href="http://files.openspring.net/tmp/drupal-test-frontpage.html">Drupal test page</a>. - * - * @throws org.openrdf.repository.RepositoryException - */ - @Test - public void testDrupalTestPage() throws RepositoryException { - assertExtract("/html/rdfa/drupal-test-frontpage.html"); - logger.debug(dumpModelToTurtle()); - assertContains( - RDFUtils.uri("http://bob.example.com/node/3"), - vDCTERMS.title, - RDFUtils.literal("A blog post...", "en") - ); - } + /** + * This test checks if the <i>RDF</i> extraction is compliant to the <a + * href="http://www.w3.org/TR/rdfa-syntax/">RDFa in XHTML: Syntax and + * Processing</a> specification against the <a + * href="http://files.openspring.net/tmp/drupal-test-frontpage.html">Drupal + * test page</a>. + * + * @throws org.openrdf.repository.RepositoryException + */ + @Test + public void testDrupalTestPage() throws Exception { + assertExtract("/html/rdfa/drupal-test-frontpage.html"); + logger.debug(dumpModelToTurtle()); + assertContains(RDFUtils.uri("http://bob.example.com/node/3"), + vDCTERMS.title, RDFUtils.literal("A blog post...", "en")); + } - /** - * See RDFa 1.1 Specification section 6.2 . - * - * @throws RepositoryException - */ - @Test - public void testIncompleteTripleManagement() throws RepositoryException { - assertExtract("/html/rdfa/incomplete-triples.html"); - logger.debug(dumpModelToTurtle()); + /** + * See RDFa 1.1 Specification section 6.2 . + * + * @throws Exception + */ + @Test + public void testIncompleteTripleManagement() throws Exception { + assertExtract("/html/rdfa/incomplete-triples.html"); + logger.debug(dumpModelToTurtle()); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - RDFUtils.uri("http://dbpedia.org/property/birthPlace"), - RDFUtils.uri("http://dbpedia.org/resource/Germany") - ); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Germany"), - RDFUtils.uri("http://dbpedia.org/property/conventionalLongName"), - RDFUtils.literal("Federal Republic of Germany") - ); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - RDFUtils.uri("http://dbpedia.org/property/citizenship"), - RDFUtils.uri("http://dbpedia.org/resource/Germany") - ); - assertContains( - RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), - RDFUtils.uri("http://dbpedia.org/property/citizenship"), - RDFUtils.uri("http://dbpedia.org/resource/United_States") - ); - } + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + RDFUtils.uri("http://dbpedia.org/property/birthPlace"), + RDFUtils.uri("http://dbpedia.org/resource/Germany")); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Germany"), + RDFUtils.uri("http://dbpedia.org/property/conventionalLongName"), + RDFUtils.literal("Federal Republic of Germany")); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + RDFUtils.uri("http://dbpedia.org/property/citizenship"), + RDFUtils.uri("http://dbpedia.org/resource/Germany")); + assertContains( + RDFUtils.uri("http://dbpedia.org/resource/Albert_Einstein"), + RDFUtils.uri("http://dbpedia.org/property/citizenship"), + RDFUtils.uri("http://dbpedia.org/resource/United_States")); + } } http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParserFactory.java ---------------------------------------------------------------------- diff --git a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParserFactory.java b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParserFactory.java index 9c367fe..4df3496 100644 --- a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParserFactory.java +++ b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsParserFactory.java @@ -17,7 +17,6 @@ package org.apache.any23.io.nquads; -import org.kohsuke.MetaInfServices; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFParser; import org.openrdf.rio.RDFParserFactory; @@ -27,7 +26,6 @@ import org.openrdf.rio.RDFParserFactory; * * @author Michele Mostarda ([email protected]) */ -@MetaInfServices public class NQuadsParserFactory implements RDFParserFactory { public NQuadsParserFactory() {} http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsWriterFactory.java ---------------------------------------------------------------------- diff --git a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsWriterFactory.java b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsWriterFactory.java index 79833a4..ecea7d0 100644 --- a/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsWriterFactory.java +++ b/nquads/src/main/java/org/apache/any23/io/nquads/NQuadsWriterFactory.java @@ -17,7 +17,6 @@ package org.apache.any23.io.nquads; -import org.kohsuke.MetaInfServices; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFWriter; import org.openrdf.rio.RDFWriterFactory; @@ -30,7 +29,6 @@ import java.io.Writer; * * @author Michele Mostarda ([email protected]) */ -@MetaInfServices public class NQuadsWriterFactory implements RDFWriterFactory { @Override http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java ---------------------------------------------------------------------- diff --git a/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java b/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java index 651f7fe..1f84069 100644 --- a/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java +++ b/plugins/basic-crawler/src/main/java/org/apache/any23/cli/Crawler.java @@ -28,7 +28,6 @@ import edu.uci.ics.crawler4j.parser.ParseData; import org.apache.any23.plugin.crawler.CrawlerListener; import org.apache.any23.plugin.crawler.SiteCrawler; import org.apache.any23.source.StringDocumentSource; -import org.kohsuke.MetaInfServices; import java.io.File; import java.net.URL; @@ -44,7 +43,6 @@ import static java.lang.String.format; * * @author Michele Mostarda ([email protected]) */ -@MetaInfServices( value = Tool.class ) @Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.") public class Crawler extends Rover { http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/basic-crawler/src/main/resources/META-INF/services/org.apache.any23.cli.Tool ---------------------------------------------------------------------- diff --git a/plugins/basic-crawler/src/main/resources/META-INF/services/org.apache.any23.cli.Tool b/plugins/basic-crawler/src/main/resources/META-INF/services/org.apache.any23.cli.Tool new file mode 100644 index 0000000..cd000e0 --- /dev/null +++ b/plugins/basic-crawler/src/main/resources/META-INF/services/org.apache.any23.cli.Tool @@ -0,0 +1 @@ +org.apache.any23.cli.Crawler \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java index 397f034..c6963da 100644 --- a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java +++ b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java @@ -30,7 +30,6 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.Extractor; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.vocab.SINDICE; -import org.kohsuke.MetaInfServices; import org.openrdf.model.URI; import org.openrdf.model.impl.ValueFactoryImpl; @@ -46,7 +45,6 @@ import java.util.List; * @see HTMLScraperPlugin * @author Michele Mostarda ([email protected]) */ -@MetaInfServices( value = Extractor.class ) public class HTMLScraperExtractor implements Extractor.ContentExtractor { public final static URI PAGE_CONTENT_DE_PROPERTY = http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractorFactory.java ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractorFactory.java b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractorFactory.java index ca44938..25a9992 100644 --- a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractorFactory.java +++ b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractorFactory.java @@ -9,13 +9,11 @@ import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.ExtractorFactory; import org.apache.any23.extractor.SimpleExtractorFactory; import org.apache.any23.rdf.Prefixes; -import org.kohsuke.MetaInfServices; /** * @author Peter Ansell [email protected] * */ -@MetaInfServices(ExtractorFactory.class) public class HTMLScraperExtractorFactory extends SimpleExtractorFactory<HTMLScraperExtractor> implements ExtractorFactory<HTMLScraperExtractor> { http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/html-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/plugins/html-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory new file mode 100644 index 0000000..30e75c2 --- /dev/null +++ b/plugins/html-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory @@ -0,0 +1 @@ +org.apache.any23.plugin.htmlscraper.HTMLScraperExtractorFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractor.java ---------------------------------------------------------------------- diff --git a/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractor.java b/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractor.java index 4978581..c4e5284 100644 --- a/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractor.java +++ b/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractor.java @@ -31,7 +31,6 @@ import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import org.kohsuke.MetaInfServices; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.RDF; @@ -47,7 +46,6 @@ import java.io.InputStream; * * @author Michele Mostarda ([email protected]) */ -@MetaInfServices( value = Extractor.class ) public class ExcelExtractor implements Extractor.ContentExtractor { private static final Excel excel = Excel.getInstance(); http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractorFactory.java ---------------------------------------------------------------------- diff --git a/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractorFactory.java b/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractorFactory.java index 0798dd7..2515bf0 100644 --- a/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractorFactory.java +++ b/plugins/office-scraper/src/main/java/org/apache/any23/plugin/officescraper/ExcelExtractorFactory.java @@ -23,13 +23,11 @@ import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.ExtractorFactory; import org.apache.any23.extractor.SimpleExtractorFactory; import org.apache.any23.rdf.Prefixes; -import org.kohsuke.MetaInfServices; /** * @author Peter Ansell [email protected] * */ -@MetaInfServices(ExtractorFactory.class) public class ExcelExtractorFactory extends SimpleExtractorFactory<ExcelExtractor> implements ExtractorFactory<ExcelExtractor> { http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/plugins/office-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory ---------------------------------------------------------------------- diff --git a/plugins/office-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/plugins/office-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory new file mode 100644 index 0000000..1b4576c --- /dev/null +++ b/plugins/office-scraper/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory @@ -0,0 +1 @@ +org.apache.any23.plugin.officescraper.ExcelExtractorFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/src/site/apt/any23-plugins.apt ---------------------------------------------------------------------- diff --git a/src/site/apt/any23-plugins.apt b/src/site/apt/any23-plugins.apt index bf55805..7bd297b 100644 --- a/src/site/apt/any23-plugins.apt +++ b/src/site/apt/any23-plugins.apt @@ -117,7 +117,6 @@ public class HTMLScraperPlugin implements ExtractorPlugin { An example of plugin is defined below. +-------------------------------------- -@MetaInfServices @Parameters(commandNames = { "myexec" }, commandDescription = "Prints out XXX used by Any23.") public class MyExecutableTool implements Tool { http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/test-resources/src/test/resources/html/html-embedded-jsonld-extractor-multiple.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-embedded-jsonld-extractor-multiple.html b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor-multiple.html new file mode 100644 index 0000000..3d00184 --- /dev/null +++ b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor-multiple.html @@ -0,0 +1,45 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html> + <head> + <title>Hello World!</title> + <meta name="title" content="Embedded JSONLD extractor"/> + <!-- As per spec in http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents --> + <script type="application/ld+json"> + { + "@context": "http://json-ld.org/contexts/person.jsonld", + "@id": "http://dbpedia.org/resource/Robert_Millar", + "@type": "Person", + "name": "Robert Millar", + "born": "1958-09-13T00:00:00" + } + </script> + <script type="application/ld+json"> + { + "@context": "http://json-ld.org/contexts/person.jsonld", + "@id": "http://dbpedia.org/resource/Robert_Frost", + "@type": "Person", + "name": "Robert Frost", + "born": "1874-03-26T00:00:00", + "died": "1963-01-29T00:00:00" + } + </script> + + </head> + <h1>Embedded JSONLD Extractor</h1> + <p>It extracts only the embedded JSON-LD elements. +</html> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/fd822849/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html index 09859ff..ca32e42 100644 --- a/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html +++ b/test-resources/src/test/resources/html/html-embedded-jsonld-extractor.html @@ -23,9 +23,9 @@ { "@context": "http://json-ld.org/contexts/person.jsonld", "@id": "http://dbpedia.org/resource/Robert_Millar", + "@type": "Person", "name": "Robert Millar", - "born": "1958-09-13", - "birthPlace": "http://dbpedia.org/resource/Glasgow" + "born": "1958-09-13T00:00:00" } </script> </head>
