Repository: any23 Updated Branches: refs/heads/master f3e66003a -> 19d85f2fc
ANY23-390 implement ICal, JCal, and XCal extractors Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/54a92960 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/54a92960 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/54a92960 Branch: refs/heads/master Commit: 54a92960ac2fda9510041b6886eb7259a9b1220b Parents: f3e6600 Author: Hans <[email protected]> Authored: Tue Aug 21 11:37:35 2018 -0500 Committer: Hans <[email protected]> Committed: Tue Aug 21 11:37:35 2018 -0500 ---------------------------------------------------------------------- core/pom.xml | 5 + .../calendar/BaseCalendarExtractor.java | 222 +++++++++++++++++++ .../any23/extractor/calendar/ICalExtractor.java | 41 ++++ .../calendar/ICalExtractorFactory.java | 47 ++++ .../any23/extractor/calendar/JCalExtractor.java | 40 ++++ .../calendar/JCalExtractorFactory.java | 48 ++++ .../any23/extractor/calendar/XCalExtractor.java | 41 ++++ .../calendar/XCalExtractorFactory.java | 47 ++++ .../any23/extractor/calendar/package-info.java | 21 ++ pom.xml | 5 + 10 files changed, 517 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 49a1bfc..0fb5b28 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -93,6 +93,11 @@ </dependency> <!-- END: httpcomponents --> + <dependency> + <groupId>net.sf.biweekly</groupId> + <artifactId>biweekly</artifactId> + </dependency> + <!-- BEGIN: Tika --> <dependency> <groupId>org.apache.tika</groupId> http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/BaseCalendarExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/BaseCalendarExtractor.java b/core/src/main/java/org/apache/any23/extractor/calendar/BaseCalendarExtractor.java new file mode 100644 index 0000000..74c3e10 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/BaseCalendarExtractor.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import biweekly.ICalDataType; +import biweekly.ICalVersion; +import biweekly.ICalendar; +import biweekly.component.ICalComponent; +import biweekly.io.ParseWarning; +import biweekly.io.SkipMeException; +import biweekly.io.StreamReader; +import biweekly.io.TimezoneInfo; +import biweekly.io.WriteContext; +import biweekly.io.scribe.ScribeIndex; +import biweekly.io.scribe.component.ICalComponentScribe; +import biweekly.io.scribe.property.ICalPropertyScribe; +import biweekly.property.ICalProperty; +import com.github.mangstadt.vinnie.io.VObjectPropertyValues; +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.Extractor; +import org.apache.any23.extractor.IssueReport; +import org.apache.any23.vocab.ICAL; +import org.eclipse.rdf4j.model.BNode; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.model.vocabulary.RDF; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.Locale; +import java.util.Objects; + +/** + * @author Hans Brende ([email protected]) + */ +abstract class BaseCalendarExtractor implements Extractor.ContentExtractor { + + @Override + public void setStopAtFirstError(boolean b) { + //unsupported + } + + private static final ValueFactory f = SimpleValueFactory.getInstance(); + private static final ICAL vICAL = ICAL.getInstance(); + + abstract StreamReader reader(InputStream inputStream); + + @Override + public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream inputStream, + ExtractionResult extractionResult) throws IOException, ExtractionException { + ScribeIndex index = new ScribeIndex(); + try (StreamReader reader = reader(inputStream)) { + ICalendar cal; + while ((cal = reader.readNext()) != null) { + for (ParseWarning warning : reader.getWarnings()) { + String message = warning.getMessage(); + Integer lineNumber = warning.getLineNumber(); + if (lineNumber == null) { + extractionResult.notifyIssue(IssueReport.IssueLevel.WARNING, message, -1, -1); + } else { + extractionResult.notifyIssue(IssueReport.IssueLevel.WARNING, message, lineNumber, -1); + } + } + + BNode calNode = f.createBNode(); + extractionResult.writeTriple(calNode, RDF.TYPE, vICAL.Vcalendar); + extract(index, cal.getTimezoneInfo(), calNode, cal, extractionResult); + } + } catch (Exception e) { + extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1); + } + } + + private static String toString(Throwable th) { + StringWriter writer = new StringWriter(); + try (PrintWriter pw = new PrintWriter(writer)) { + th.printStackTrace(pw); + } + String string = writer.toString(); + if (string.length() > 200) { + return string.substring(0, 197) + "..."; + } + return string; + } + + + private static String localNameOfType(String typeName) { + if (typeName.isEmpty()) { + return ""; + } + int ind = Character.charCount(typeName.codePointAt(0)); + return typeName.substring(0, ind).toUpperCase(Locale.ENGLISH) + typeName.substring(ind); + } + + private static String localNameOfProperty(String propertyName) { + String[] nameComponents = propertyName.split("-"); + StringBuilder sb = new StringBuilder(propertyName.length()); + sb.append(nameComponents[0]); + for (int i = 1, len = nameComponents.length; i < len; i++) { + sb.append(localNameOfType(nameComponents[i])); + } + return sb.toString(); + } + + private static IRI type(ICalComponentScribe<?> scribe, ExtractionResult result) { + if (scribe == null) { + return null; + } + String originalName = scribe.getComponentName(); + String name = originalName.toLowerCase(Locale.ENGLISH); + + if (name.startsWith("x-")) { + //non-standard class + return f.createIRI(ICAL.NS, "X-" + localNameOfType(name.substring(2))); + } + + name = localNameOfType(name); + + try { + return Objects.requireNonNull(vICAL.getClass(name)); + } catch (RuntimeException e) { + IRI iri = f.createIRI(ICAL.NS, name); + result.notifyIssue(IssueReport.IssueLevel.ERROR, + "class " + iri + " (" + originalName + ") not defined in " + ICAL.class.getName(), + -1, -1); + return iri; + } + } + + private static IRI predicate(ICalPropertyScribe<?> scribe, ExtractionResult result) { + if (scribe == null) { + return null; + } + String originalName = scribe.getPropertyName(ICalVersion.V2_0); + String name = originalName.toLowerCase(Locale.ENGLISH); + if (name.startsWith("x-")) { + //non-standard property + return f.createIRI(ICAL.NS, "x-" + localNameOfProperty(name.substring(2))); + } + + name = localNameOfProperty(name); + + try { + return Objects.requireNonNull(vICAL.getProperty(name)); + } catch (RuntimeException e) { + IRI iri = f.createIRI(ICAL.NS, name); + result.notifyIssue(IssueReport.IssueLevel.ERROR, + "property " + iri + " (" + originalName + ") not defined in " + ICAL.class.getName(), + -1, -1); + return iri; + } + } + + @SuppressWarnings("unchecked") + private static <T extends ICalProperty> Value value(ICalPropertyScribe<T> scribe, ICalProperty property, TimezoneInfo info) { + try { + T prop = (T)property; + String text = scribe.writeText(prop, new WriteContext(ICalVersion.V2_0, info, null)); + if (text == null) { + return null; + } + text = VObjectPropertyValues.unescape(text); + ICalDataType dataType = scribe.dataType(prop, ICalVersion.V2_0); + if (ICalDataType.URI.equals(dataType) || ICalDataType.URL.equals(dataType)) { + try { + return f.createIRI(text.trim()); + } catch (IllegalArgumentException e) { + //ignore + } + } + return f.createLiteral(text); + } catch (SkipMeException e) { + return null; + } + } + + private static void extract(ScribeIndex index, TimezoneInfo info, BNode node, ICalComponent component, ExtractionResult extractionResult) { + for (ICalProperty property : component.getProperties().values()) { + ICalPropertyScribe<?> scribe = index.getPropertyScribe(property); + IRI predicate = predicate(scribe, extractionResult); + if (predicate != null) { + Value value = value(scribe, property, info); + if (value != null) { + extractionResult.writeTriple(node, predicate, value); + } + } + } + for (ICalComponent child : component.getComponents().values()) { + BNode childNode = f.createBNode(); + extractionResult.writeTriple(node, vICAL.component, childNode); + IRI childType = type(index.getComponentScribe(child), extractionResult); + if (childType != null) { + extractionResult.writeTriple(childNode, RDF.TYPE, childType); + } + extract(index, info, childNode, child, extractionResult); + } + } + +} + http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractor.java b/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractor.java new file mode 100644 index 0000000..adb2d7d --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractor.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import biweekly.io.StreamReader; +import biweekly.io.text.ICalReader; +import org.apache.any23.extractor.ExtractorDescription; + +import java.io.InputStream; + +/** + * @author Hans Brende ([email protected]) + */ +public class ICalExtractor extends BaseCalendarExtractor { + + @Override + StreamReader reader(InputStream inputStream) { + return new ICalReader(inputStream); + } + + @Override + public ExtractorDescription getDescription() { + return ICalExtractorFactory.getDescriptionInstance(); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractorFactory.java new file mode 100644 index 0000000..8a25dcb --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/ICalExtractorFactory.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +import java.util.Collections; + +/** + * @author Hans Brende ([email protected]) + */ +public class ICalExtractorFactory extends SimpleExtractorFactory<ICalExtractor> { + + private static final String NAME = "ical"; + private static final Prefixes PREFIXES = null; + private static final ExtractorDescription descriptionInstance = new ICalExtractorFactory(); + + public ICalExtractorFactory() { + super(NAME, PREFIXES, Collections.singletonList("text/calendar"), null); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } + + @Override + public ICalExtractor createExtractor() { + return new ICalExtractor(); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractor.java b/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractor.java new file mode 100644 index 0000000..32dc73b --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractor.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import biweekly.io.StreamReader; +import biweekly.io.json.JCalReader; +import org.apache.any23.extractor.ExtractorDescription; + +import java.io.InputStream; + +/** + * @author Hans Brende ([email protected]) + */ +public class JCalExtractor extends BaseCalendarExtractor { + + @Override + StreamReader reader(InputStream inputStream) { + return new JCalReader(inputStream); + } + + @Override + public ExtractorDescription getDescription() { + return JCalExtractorFactory.getDescriptionInstance(); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractorFactory.java new file mode 100644 index 0000000..2344bab --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/JCalExtractorFactory.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +import java.util.Collections; + +/** + * @author Hans Brende ([email protected]) + */ +public class JCalExtractorFactory extends SimpleExtractorFactory<JCalExtractor> { + + private static final String NAME = "jcal"; + private static final Prefixes PREFIXES = null; + private static final ExtractorDescription descriptionInstance = new JCalExtractorFactory(); + + public JCalExtractorFactory() { + super(NAME, PREFIXES, Collections.singletonList("application/calendar+json"), null); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } + + @Override + public JCalExtractor createExtractor() { + return new JCalExtractor(); + } +} + http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractor.java b/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractor.java new file mode 100644 index 0000000..555b4f7 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractor.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import biweekly.io.StreamReader; +import biweekly.io.xml.XCalReader; +import org.apache.any23.extractor.ExtractorDescription; + +import java.io.InputStream; + +/** + * @author Hans Brende ([email protected]) + */ +public class XCalExtractor extends BaseCalendarExtractor { + + @Override + StreamReader reader(InputStream inputStream) { + return new XCalReader(inputStream); + } + + @Override + public ExtractorDescription getDescription() { + return XCalExtractorFactory.getDescriptionInstance(); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractorFactory.java new file mode 100644 index 0000000..6519b1f --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/XCalExtractorFactory.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.calendar; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +import java.util.Collections; + +/** + * @author Hans Brende ([email protected]) + */ +public class XCalExtractorFactory extends SimpleExtractorFactory<XCalExtractor> { + + private static final String NAME = "xcal"; + private static final Prefixes PREFIXES = null; + private static final ExtractorDescription descriptionInstance = new XCalExtractorFactory(); + + public XCalExtractorFactory() { + super(NAME, PREFIXES, Collections.singletonList("application/calendar+xml"), null); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } + + @Override + public XCalExtractor createExtractor() { + return new XCalExtractor(); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/core/src/main/java/org/apache/any23/extractor/calendar/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/calendar/package-info.java b/core/src/main/java/org/apache/any23/extractor/calendar/package-info.java new file mode 100644 index 0000000..50d731a --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/calendar/package-info.java @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains extractors for various calendar serialization formats. + */ +package org.apache.any23.extractor.calendar; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/54a92960/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index ce2ee5d..2aa6278 100644 --- a/pom.xml +++ b/pom.xml @@ -372,6 +372,11 @@ <artifactId>jsoup</artifactId> <version>1.11.3</version> </dependency> + <dependency> + <groupId>net.sf.biweekly</groupId> + <artifactId>biweekly</artifactId> + <version>0.6.2</version> + </dependency> <!-- BEGIN: Tika --> <dependency>
