Repository: any23 Updated Branches: refs/heads/master 60d6f6164 -> 205cfe442
ANY23-336 Hacky patch to tide us over until jsonldjava 0.11.2 release Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/205cfe44 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/205cfe44 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/205cfe44 Branch: refs/heads/master Commit: 205cfe442c427dee6caae806d8293f7bfaec5e74 Parents: 60d6f61 Author: Hans <[email protected]> Authored: Mon Apr 2 03:05:34 2018 -0500 Committer: Hans <[email protected]> Committed: Mon Apr 2 12:11:26 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/JSONLDExtractor.java | 91 ++++++++++++++++++++ .../extractor/rdf/JSONLDExtractorTest.java | 23 +++++ 2 files changed, 114 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/205cfe44/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java index 96067b8..a073a21 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java @@ -17,10 +17,27 @@ package org.apache.any23.extractor.rdf; +import com.github.jsonldjava.utils.JarCacheStorage; +import com.github.jsonldjava.utils.JsonUtils; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; +import org.apache.http.client.cache.HttpCacheEntry; +import org.apache.http.client.cache.HttpCacheStorage; +import org.apache.http.client.protocol.RequestAcceptEncoding; +import org.apache.http.client.protocol.ResponseContentEncoding; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.DefaultRedirectStrategy; +import org.apache.http.impl.client.cache.BasicHttpCacheStorage; +import org.apache.http.impl.client.cache.CacheConfig; +import org.apache.http.impl.client.cache.CachingHttpClientBuilder; import org.eclipse.rdf4j.rio.RDFParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Field; /** * Concrete implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor} @@ -29,6 +46,80 @@ import org.eclipse.rdf4j.rio.RDFParser; */ public class JSONLDExtractor extends BaseRDFExtractor { + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + //TODO: the static members of this class can be removed once jsonldjava 0.11.2 is released + //See https://issues.apache.org/jira/browse/ANY23-336 + static final boolean needsHttpClientSwap; + + static { + if (!(needsHttpClientSwap = isHttpClientSwapNeeded())) { + LOG.warn("The static members of this class are no longer needed."); + } else { + try { + Field field = JsonUtils.class.getDeclaredField("DEFAULT_HTTP_CLIENT"); + field.setAccessible(true); + field.set(null, createDefaultHttpClient()); + } catch (Throwable e) { + LOG.warn("failed to swap jsonldjava http client", e); + } + } + } + + private static boolean isHttpClientSwapNeeded() { + try { + JsonUtils.class.getDeclaredField("JSONLD_JAVA_USER_AGENT"); + return false; + } catch (Throwable th) { + return true; + } + } + + private static CloseableHttpClient createDefaultHttpClient() { + // Common CacheConfig for both the JarCacheStorage and the underlying + // BasicHttpCacheStorage + final CacheConfig cacheConfig = CacheConfig.custom().setMaxCacheEntries(500) + .setMaxObjectSize(1024 * 256).setSharedCache(false) + .setHeuristicCachingEnabled(true).setHeuristicDefaultLifetime(86400).build(); + + final CloseableHttpClient result = CachingHttpClientBuilder.create() + // allow caching + .setCacheConfig(cacheConfig) + // Wrap the local JarCacheStorage around a BasicHttpCacheStorage + .setHttpCacheStorage(new JarCacheStorage0(null, cacheConfig, + new BasicHttpCacheStorage(cacheConfig))) + // Support compressed data + // https://wayback.archive.org/web/20130901115452/http://hc.apache.org:80/httpcomponents-client-ga/tutorial/html/httpagent.html#d5e1238 + .addInterceptorFirst(new RequestAcceptEncoding()) + .addInterceptorFirst(new ResponseContentEncoding()) + .setRedirectStrategy(DefaultRedirectStrategy.INSTANCE) + // use system defaults for proxy etc. + .useSystemProperties().build(); + + return result; + } + + private static class JarCacheStorage0 extends JarCacheStorage { + + private final HttpCacheStorage delegate; + + public JarCacheStorage0(ClassLoader classLoader, CacheConfig cacheConfig, + HttpCacheStorage delegate) { + super(classLoader, cacheConfig, delegate); + this.delegate = delegate; + } + + @Override + public HttpCacheEntry getEntry(String key) throws IOException { + HttpCacheEntry entry = delegate.getEntry(key); + return entry != null ? entry : super.getEntry(key); + } + } + + + + + public JSONLDExtractor(boolean verifyDataType, boolean stopAtFirstError) { super(verifyDataType, stopAtFirstError); } http://git-wip-us.apache.org/repos/asf/any23/blob/205cfe44/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java index 1e9aa6f..fcef3e4 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java @@ -19,6 +19,7 @@ package org.apache.any23.extractor.rdf; import java.io.ByteArrayOutputStream; import java.io.IOException; +import com.github.jsonldjava.core.DocumentLoader; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionParameters; @@ -29,6 +30,7 @@ import org.apache.any23.writer.RDFXMLWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.eclipse.rdf4j.model.IRI; @@ -56,6 +58,27 @@ public class JSONLDExtractorTest { } @Test + public void testRemoteContextCaching() throws Exception { + Assert.assertTrue("The static members of " + JSONLDExtractor.class + " can now be removed!", + JSONLDExtractor.needsHttpClientSwap); + DocumentLoader documentLoader = new DocumentLoader(); + final String[] urls = {"http://schema.org/", "http://schema.org/docs/jsonldcontext.json"}; + for (String url : urls) { + long start = System.currentTimeMillis(); + for (int i = 1; i <= 10000; i++) { + documentLoader.loadDocument(url); + + long seconds = (System.currentTimeMillis() - start) / 1000; + + if (seconds > 60) { + Assert.fail(String.format("Took %s seconds to access %s %s times", seconds, url, i)); + break; + } + } + } + } + + @Test public void testExtractFromJSONLDDocument() throws IOException, ExtractionException, TripleHandlerException { final IRI uri = RDFUtils.iri("http://host.com/place-example.jsonld");
