Repository: jena Updated Branches: refs/heads/master 347d7764d -> 1c1325c56
JENA-1305: Added support for ElasticSearch V 5.2.1 Project: http://git-wip-us.apache.org/repos/asf/jena/repo Commit: http://git-wip-us.apache.org/repos/asf/jena/commit/1c1325c5 Tree: http://git-wip-us.apache.org/repos/asf/jena/tree/1c1325c5 Diff: http://git-wip-us.apache.org/repos/asf/jena/diff/1c1325c5 Branch: refs/heads/master Commit: 1c1325c5646f3fd908bf56db0480759a22dcd68c Parents: 347d776 Author: Anuj Kumar <[email protected]> Authored: Wed Mar 8 12:06:11 2017 +0100 Committer: Osma Suominen <[email protected]> Committed: Tue Mar 28 17:49:36 2017 +0300 ---------------------------------------------------------------------- jena-parent/pom.xml | 14 + jena-text/pom.xml | 93 +++- .../main/java/examples/JenaESTextExample.java | 99 ++++ .../org/apache/jena/query/text/ESSettings.java | 177 ++++++++ .../jena/query/text/TextDatasetFactory.java | 25 ++ .../org/apache/jena/query/text/TextIndexES.java | 448 +++++++++++++++++++ .../query/text/assembler/TextAssembler.java | 2 + .../text/assembler/TextIndexESAssembler.java | 114 +++++ .../jena/query/text/assembler/TextVocab.java | 8 + jena-text/src/main/resources/data-es.ttl | 46 ++ jena-text/src/main/resources/text-config-es.ttl | 64 +++ .../org/apache/jena/query/text/TS_Text.java | 14 +- .../apache/jena/query/text/it/BaseESTest.java | 111 +++++ .../jena/query/text/it/TextIndexESIT.java | 306 +++++++++++++ 14 files changed, 1511 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-parent/pom.xml ---------------------------------------------------------------------- diff --git a/jena-parent/pom.xml b/jena-parent/pom.xml index bd18208..59c69ff 100644 --- a/jena-parent/pom.xml +++ b/jena-parent/pom.xml @@ -72,6 +72,7 @@ <ver.commons-codec>1.10</ver.commons-codec> <ver.lucene>6.4.1</ver.lucene> + <ver.elasticsearch>5.2.1</ver.elasticsearch> <ver.spatial4j>0.6</ver.spatial4j> <ver.mockito>1.9.5</ver.mockito> @@ -275,6 
+276,19 @@ <version>${ver.spatial4j}</version> </dependency> + <!-- ES dependencies--> + <dependency> + <groupId>org.elasticsearch</groupId> + <artifactId>elasticsearch</artifactId> + <version>${ver.elasticsearch}</version> + </dependency> + + <dependency> + <groupId>org.elasticsearch.client</groupId> + <artifactId>transport</artifactId> + <version>${ver.elasticsearch}</version> + </dependency> + <!-- Logging --> <dependency> <groupId>org.slf4j</groupId> http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/pom.xml ---------------------------------------------------------------------- diff --git a/jena-text/pom.xml b/jena-text/pom.xml index 37010f1..03e90a6 100644 --- a/jena-text/pom.xml +++ b/jena-text/pom.xml @@ -81,6 +81,32 @@ <artifactId>lucene-queryparser</artifactId> </dependency> + <dependency> + <groupId>org.elasticsearch</groupId> + <artifactId>elasticsearch</artifactId> + </dependency> + + <dependency> + <groupId>org.elasticsearch.client</groupId> + <artifactId>transport</artifactId> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <version>2.7</version> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>2.7</version> + </dependency> + </dependencies> <build> @@ -112,11 +138,72 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> <configuration> - <includes> - <include>**/TS_*.java</include> - </includes> + <!-- Skip the default running of this plug-in (or everything is run twice...see below) --> + <skip>true</skip> </configuration> + <executions> + <execution> + <id>unit-tests</id> + <phase>test</phase> + <goals> + <goal>test</goal> + </goals> + <configuration> + <skip>false</skip> + <includes> + <include>**/TS_*.java</include> + </includes> + <excludes> + 
<exclude>**/*IT.java</exclude> + </excludes> + </configuration> + </execution> + <execution> + <id>integration-tests</id> + <phase>integration-test</phase> + <goals> + <goal>test</goal> + </goals> + <configuration> + <skip>false</skip> + <includes> + <include>**/*IT.java</include> + </includes> + </configuration> + </execution> + </executions> </plugin> + <plugin> + <groupId>com.github.alexcojocaru</groupId> + <artifactId>elasticsearch-maven-plugin</artifactId> + <!-- REPLACE THE FOLLOWING WITH THE PLUGIN VERSION YOU NEED --> + <version>5.2</version> + <configuration> + <clusterName>elasticsearch</clusterName> + <transportPort>9500</transportPort> + <httpPort>9400</httpPort> + </configuration> + <executions> + <!-- + The elasticsearch maven plugin goals are by default bound to the + pre-integration-test and post-integration-test phases + --> + <execution> + <id>start-elasticsearch</id> + <phase>pre-integration-test</phase> + <goals> + <goal>runforked</goal> + </goals> + </execution> + <execution> + <id>stop-elasticsearch</id> + <phase>post-integration-test</phase> + <goals> + <goal>stop</goal> + </goals> + </execution> + </executions> + </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/examples/JenaESTextExample.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/examples/JenaESTextExample.java b/jena-text/src/main/java/examples/JenaESTextExample.java new file mode 100644 index 0000000..3eb3042 --- /dev/null +++ b/jena-text/src/main/java/examples/JenaESTextExample.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package examples; + +import org.apache.jena.atlas.lib.StrUtils; +import org.apache.jena.query.*; +import org.apache.jena.sparql.util.QueryExecUtils; + +/** + * Simple example class to test the {@link org.apache.jena.query.text.assembler.TextIndexESAssembler} + * For this class to work properly, an elasticsearch node should be up and running, otherwise it will fail. + * You can find the details of downloading and running an ElasticSearch version here: https://www.elastic.co/downloads/past-releases/elasticsearch-5-2-1 + * Unzip the file in your favourite directory and then execute the appropriate file under the bin directory. + * It will take less than a minute. + * In order to visualize what is written in ElasticSearch, you need to download and run Kibana: https://www.elastic.co/downloads/kibana + * To run Kibana, just go to the bin directory and execute the appropriate file. + * We need to resort to this mechanism as ElasticSearch has stopped supporting embedded ElasticSearch. + * + * In addition we can't have it in the test package because ElasticSearch + * detects the thread origin and stops us from instantiating a client. 
+ */ +public class JenaESTextExample { + + public static void main(String[] args) { + + queryData(loadData(createAssembler())); + } + + + private static Dataset createAssembler() { + String assemblerFile = "text-config-es.ttl"; + Dataset ds = DatasetFactory.assemble(assemblerFile, + "http://localhost/jena_example/#text_dataset") ; + return ds; + } + + private static Dataset loadData(Dataset ds) { + JenaTextExample1.loadData(ds, "data-es.ttl"); + return ds; + } + + /** + * Query Data + * @param ds + */ + private static void queryData(Dataset ds) { +// JenaTextExample1.queryData(ds); + queryDataWithoutProperty(ds); + + + } + + public static void queryDataWithoutProperty(Dataset dataset) + { + + + String pre = StrUtils.strjoinNL + ( "PREFIX : <http://example/>" + , "PREFIX text: <http://jena.apache.org/text#>" + , "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>") ; + + String qs = StrUtils.strjoinNL + ( "SELECT * " +// , " { ?s text:query (rdfs:comment 'this' 'lang:en') ;" + , " { ?s text:query ('this' 'lang:en*') ;" +// , " { ?s text:query ('this' 'lang:en-GB') ;" +// , " { ?s text:query (rdfs:comment 'this' 'lang:en-GB') ;" +// , " { ?s text:query (rdfs:comment 'this' 'lang:*') ;" +// , " { ?s text:query (rdfs:comment 'this' 'lang:none') ;" +// , " { ?s text:query (rdfs:comment 'this') ;" +// , " { ?s text:query ('X1' 'lang:en') ;" + , " rdfs:label ?label" + , " }") ; + + dataset.begin(ReadWrite.READ) ; + try { + Query q = QueryFactory.create(pre+"\n"+qs) ; + QueryExecution qexec = QueryExecutionFactory.create(q , dataset) ; + QueryExecUtils.executeQuery(q, qexec) ; + } finally { dataset.end() ; } + + + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/ESSettings.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/ESSettings.java b/jena-text/src/main/java/org/apache/jena/query/text/ESSettings.java new file mode 
100644 index 0000000..0c5a11e --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/ESSettings.java @@ -0,0 +1,177 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.jena.query.text; + +import java.util.HashMap; +import java.util.Map; + +/** + * Settings for ElasticSearch based indexing + */ +public class ESSettings { + + /** + * Map of hosts and ports. The host could also be an IP Address + */ + private Map<String,Integer> hostToPortMapping; + + /** + * Name of the Cluster. Defaults to 'elasticsearch' + */ + private String clusterName; + + /** + * Number of shards. Defaults to '1' + */ + private Integer shards; + + /** + * Number of replicas. Defaults to '1' + */ + private Integer replicas; + + /** + * Name of the index. 
Defaults to 'jena-text' + */ + private String indexName; + + + public Map<String, Integer> getHostToPortMapping() { + return hostToPortMapping; + } + + public void setHostToPortMapping(Map<String, Integer> hostToPortMapping) { + this.hostToPortMapping = hostToPortMapping; + } + + public ESSettings.Builder builder() { + return new ESSettings.Builder(); + } + + /** + * Convenient builder class for building ESSettings + */ + public static class Builder { + + ESSettings settings; + + public Builder() { + this.settings = new ESSettings(); + this.settings.setClusterName("elasticsearch"); + this.settings.setShards(1); + this.settings.setReplicas(1); + this.settings.setHostToPortMapping(new HashMap<>()); + this.settings.setIndexName("jena-text"); + } + + + public Builder indexName(String indexName) { + if(indexName != null && !indexName.isEmpty()) { + this.settings.setIndexName(indexName); + } + return this; + } + + public Builder clusterName(String clusterName) { + if(clusterName != null && !clusterName.isEmpty()) { + this.settings.setClusterName(clusterName); + } + return this; + + } + + public Builder shards(Integer shards) { + if (shards != null) { + this.settings.setShards(shards); + } + return this; + } + + public Builder replicas(Integer replicas) { + if(replicas != null) { + this.settings.setReplicas(replicas); + } + return this; + } + + public Builder hostAndPort(String host, Integer port) { + if(host != null && port != null) { + this.settings.getHostToPortMapping().put(host, port); + } + return this; + + } + + public Builder hostAndPortMap(Map<String, Integer> hostAndPortMap) { + if(hostAndPortMap != null) { + this.settings.getHostToPortMapping().putAll(hostAndPortMap); + } + + return this; + } + + public ESSettings build() { + return this.settings; + } + + } + + public String getClusterName() { + return clusterName; + } + + public void setClusterName(String clusterName) { + this.clusterName = clusterName; + } + + public Integer getShards() { + return shards; + } 
+ + public void setShards(Integer shards) { + this.shards = shards; + } + + public Integer getReplicas() { + return replicas; + } + + public void setReplicas(Integer replicas) { + this.replicas = replicas; + } + + public String getIndexName() { + return indexName; + } + + public void setIndexName(String indexName) { + this.indexName = indexName; + } + + + @Override + public String toString() { + return "ESSettings{" + + "hostToPortMapping=" + hostToPortMapping + + ", clusterName='" + clusterName + '\'' + + ", shards=" + shards + + ", replicas=" + replicas + + ", indexName='" + indexName + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java index c365fa5..5dba5a2 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextDatasetFactory.java @@ -164,5 +164,30 @@ public class TextDatasetFactory TextIndex index = createLuceneIndex(directory, config) ; return create(base, index, true) ; } + + /** + * Create an ElasticSearch based Index and return a Dataset based on this index + * @param base the base {@link Dataset} + * @param config {@link TextIndexConfig} containing the {@link EntityDefinition} + * @param settings ElasticSearch specific settings for initializing and connecting to an ElasticSearch Cluster + * @return The config definition for the index instantiation + */ + public static Dataset createES(Dataset base, TextIndexConfig config, ESSettings settings) + { + TextIndex index = createESIndex(config, settings) ; + return create(base, index, true) ; + } + + /** + * Create an ElasticSearch based Index + * @param config {@link TextIndexConfig} 
containing the {@link EntityDefinition} + * @param settings ElasticSearch specific settings for initializing and connecting to an ElasticSearch Cluster + * @return a configured instance of TextIndexES + */ + public static TextIndex createESIndex(TextIndexConfig config, ESSettings settings) + { + return new TextIndexES(config, settings) ; + } + } http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/TextIndexES.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/TextIndexES.java b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexES.java new file mode 100644 index 0000000..b85d4ed --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/TextIndexES.java @@ -0,0 +1,448 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.query.text; + +import org.apache.commons.lang3.exception.ExceptionUtils; +import org.apache.jena.graph.Node; +import org.apache.jena.graph.NodeFactory; +import org.apache.jena.sparql.util.NodeFactoryExtra; +import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest; +import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse; +import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.action.index.IndexRequest; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.update.UpdateRequest; +import org.elasticsearch.action.update.UpdateResponse; +import org.elasticsearch.client.Client; +import org.elasticsearch.client.transport.TransportClient; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.transport.InetSocketTransportAddress; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.engine.DocumentMissingException; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.script.Script; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.transport.client.PreBuiltTransportClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.InetAddress; +import java.util.*; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; + +/** + * Elastic Search Implementation of {@link TextIndex} + * + */ +public class TextIndexES implements TextIndex { + + /** + * The definition of the Entity we are trying to Index + */ + private final EntityDefinition docDef ; + + /** + * Thread safe ElasticSearch Java Client to perform Index operations + */ + private static Client client; + + /** + * The name of the index. 
Defaults to 'jena-text' + */ + private final String indexName; + + /** + * The parameter representing the cluster name key + */ + static final String CLUSTER_NAME_PARAM = "cluster.name"; + + /** + * The parameter representing the number of shards key + */ + static final String NUM_OF_SHARDS_PARAM = "number_of_shards"; + + /** + * The parameter representing the number of replicas key + */ + static final String NUM_OF_REPLICAS_PARAM = "number_of_replicas"; + + private static final String DASH = "-"; + + private static final String UNDERSCORE = "_"; + + private static final String COLON = ":"; + + private static final String ASTERISK = "*"; + + /** + * ES Script for adding/updating the document in the index. + * The main reason to use scripts is because we want to modify the values of the fields that contains an array of values + */ + private static final String ADD_UPDATE_SCRIPT = "if((ctx._source == null) || (ctx._source.<fieldName> == null) || (ctx._source.<fieldName>.empty == true)) " + + "{ctx._source.<fieldName>=[params.fieldValue] } else {ctx._source.<fieldName>.add(params.fieldValue)}"; + + /** + * ES Script for deleting a specific value in the field for the given document in the index. 
+ * The main reason to use scripts is because we want to delete specific value of the field that contains an array of values + */ + private static final String DELETE_SCRIPT = "if((ctx._source != null) && (ctx._source.<fieldToRemove> != null) && (ctx._source.<fieldToRemove>.empty != true) " + + "&& (ctx._source.<fieldToRemove>.indexOf(params.valueToRemove) >= 0)) " + + "{ctx._source.<fieldToRemove>.remove(ctx._source.<fieldToRemove>.indexOf(params.valueToRemove))}"; + + /** + * Number of maximum results to return in case no limit is specified on the search operation + */ + static final Integer MAX_RESULTS = 10000; + + private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexES.class) ; + + /** + * Construct an instance of {@link TextIndexES} based on provided {@link TextIndexConfig} and {@link ESSettings} + * The constructor is responsible for initializing a {@link TransportClient} based on the provided configs + * and create index based on the provided {@link ESSettings} + * @param config an instance of {@link TextIndexConfig} + * @param esSettings an instance of {@link ESSettings} + */ + public TextIndexES(TextIndexConfig config, ESSettings esSettings) { + + this.indexName = esSettings.getIndexName(); + this.docDef = config.getEntDef(); + docDef.setLangField("lang"); + + try { + if(client == null) { + + LOGGER.debug("Initializing the Elastic Search Java Client with settings: " + esSettings); + Settings settings = Settings.builder() + .put(CLUSTER_NAME_PARAM, esSettings.getClusterName()).build(); + List<InetSocketTransportAddress> addresses = new ArrayList<>(); + for(String host: esSettings.getHostToPortMapping().keySet()) { + InetSocketTransportAddress addr = new InetSocketTransportAddress(InetAddress.getByName(host), esSettings.getHostToPortMapping().get(host)); + addresses.add(addr); + } + + InetSocketTransportAddress socketAddresses[] = new InetSocketTransportAddress[addresses.size()]; + client = new 
PreBuiltTransportClient(settings).addTransportAddresses(addresses.toArray(socketAddresses)); + LOGGER.debug("Successfully initialized the client"); + } + + IndicesExistsResponse exists = client.admin().indices().exists(new IndicesExistsRequest(indexName)).get(); + if(!exists.isExists()) { + Settings indexSettings = Settings.builder() + .put(NUM_OF_SHARDS_PARAM, esSettings.getShards()) + .put(NUM_OF_REPLICAS_PARAM, esSettings.getReplicas()) + .build(); + LOGGER.debug("Index with name " + indexName + " does not exist yet. Creating one with settings: " + indexSettings.toString()); + client.admin().indices().prepareCreate(indexName).setSettings(indexSettings).get(); + } + }catch (Exception e) { + throw new TextIndexException("Exception occurred while instantiating ElasticSearch Text Index", e); + } + } + + + /** + * Constructor used mainly for performing Integration tests + * @param config an instance of {@link TextIndexConfig} + * @param client an instance of {@link TransportClient}. The client should already have been initialized with an index + */ + public TextIndexES(TextIndexConfig config, Client client, String indexName) { + this.docDef = config.getEntDef(); + this.client = client; + this.indexName = indexName; + } + + /** + * We do not have any specific logic to perform before committing + */ + @Override + public void prepareCommit() { + //Do Nothing + + } + + /** + * Commit happens in the individual get/add/delete operations + */ + @Override + public void commit() { + // Do Nothing + } + + /** + * We do not do rollback + */ + @Override + public void rollback() { + //Do Nothing + + } + + /** + * We don't have resources that need to be closed explicitely + */ + @Override + public void close() { + // Do Nothing + + } + + /** + * Update an Entity. Since we are doing Upserts in add entity anyways, we simply call {@link #addEntity(Entity)} + * method that takes care of updating the Entity as well. + * @param entity the entity to update. 
+ */ + @Override + public void updateEntity(Entity entity) { + //Since Add entity also updates the indexed document in case it already exists, + // we can simply call the addEntity from here. + addEntity(entity); + } + + + /** + * Add an Entity to the ElasticSearch Index. + * The entity will be added as a new document in ES, if it does not already exists. + * If the Entity exists, then the entity will simply be updated. + * The entity will never be replaced. + * @param entity the entity to add + */ + @Override + public void addEntity(Entity entity) { + LOGGER.debug("Adding/Updating the entity in ES"); + + //The field that has a not null value in the current Entity instance. + //Required, mainly for building a script for the update command. + String fieldToAdd = null; + String fieldValueToAdd = null; + try { + XContentBuilder builder = jsonBuilder() + .startObject(); + + for(String field: docDef.fields()) { + if(entity.get(field) != null) { + if(entity.getLanguage() != null && !entity.getLanguage().isEmpty()) { + //We make sure that the field name contains all underscore and no dash (for eg. when the lang value is en-GB) + //The reason to do this is because the script fails with exception in case we have "-" in field name. + fieldToAdd = normalizeFieldName(field, entity.getLanguage()); + } else { + fieldToAdd = field; + } + + fieldValueToAdd = (String) entity.get(field); + builder = builder.field(fieldToAdd, Arrays.asList(fieldValueToAdd)); + break; + } else { + //We are making sure that the field is at-least added to the index. + //This will help us tremendously when we are appending the data later in an already indexed document. 
+ builder = builder.field(field, Collections.emptyList()); + } + } + + builder = builder.endObject(); + IndexRequest indexRequest = new IndexRequest(indexName, docDef.getEntityField(), entity.getId()) + .source(builder); + + String addUpdateScript = ADD_UPDATE_SCRIPT.replaceAll("<fieldName>", fieldToAdd); + Map<String, Object> params = new HashMap<>(); + params.put("fieldValue", fieldValueToAdd); + + UpdateRequest upReq = new UpdateRequest(indexName, docDef.getEntityField(), entity.getId()) + .script(new Script(Script.DEFAULT_SCRIPT_TYPE, Script.DEFAULT_SCRIPT_LANG, addUpdateScript, params)) + .upsert(indexRequest); + + UpdateResponse response = client.update(upReq).get(); + + LOGGER.debug("Received the following Update response : " + response + " for the following entity: " + entity); + + } catch(Exception e) { + throw new TextIndexException("Unable to Index the Entity in ElasticSearch.", e); + } + } + + /** + * Delete the value of the entity from the existing document, if any. + * The document itself will never get deleted. Only the value will get deleted. 
+ * @param entity entity whose value needs to be deleted + */ + @Override + public void deleteEntity(Entity entity) { + + String fieldToRemove = null; + String valueToRemove = null; + for(String field : docDef.fields()) { + if(entity.get(field) != null) { + fieldToRemove = field; + if(entity.getLanguage()!= null && !entity.getLanguage().isEmpty()) { + fieldToRemove = normalizeFieldName(fieldToRemove, entity.getLanguage()); + } + valueToRemove = (String)entity.get(field); + break; + } + } + + if(fieldToRemove != null && valueToRemove != null) { + + LOGGER.debug("deleting content related to entity: " + entity.getId()); + String deleteScript = DELETE_SCRIPT.replaceAll("<fieldToRemove>", fieldToRemove); + Map<String,Object> params = new HashMap<>(); + params.put("valueToRemove", valueToRemove); + + UpdateRequest updateRequest = new UpdateRequest(indexName, docDef.getEntityField(), entity.getId()) + .script(new Script(Script.DEFAULT_SCRIPT_TYPE, Script.DEFAULT_SCRIPT_LANG,deleteScript,params)); + + try { + client.update(updateRequest).get(); + }catch(Exception e) { + if( ExceptionUtils.getRootCause(e) instanceof DocumentMissingException) { + LOGGER.debug("Trying to delete values from a missing document. 
Ignoring deletion of entity: ", entity); + } else { + throw new TextIndexException("Unable to delete entity.", e); + } + } + } + } + + /** + * Get an Entity given the subject Id + * @param uri the subject Id of the entity + * @return a map of field name and field values; + */ + @Override + public Map<String, Node> get(String uri) { + + GetResponse response; + Map<String, Node> result = new HashMap<>(); + + if(uri != null) { + response = client.prepareGet(indexName, docDef.getEntityField(), uri).get(); + if(response != null && !response.isSourceEmpty()) { + String entityField = response.getId(); + Node entity = NodeFactory.createURI(entityField) ; + result.put(docDef.getEntityField(), entity); + Map<String, Object> source = response.getSource(); + for (String field: docDef.fields()) { + Object fieldResponse = source.get(field); + + if(fieldResponse == null) { + //We wont return it. + continue; + } + else if(fieldResponse instanceof List<?>) { + //We are storing the values of fields as a List always. + //If there are values stored in the list, then we return the first value, + // else we do not include the field in the returned Map of Field -> Node Mapping + List<?> responseList = (List<?>)fieldResponse; + if(responseList != null && responseList.size() > 0) { + String fieldValue = (String)responseList.get(0); + Node fieldNode = NodeFactoryExtra.createLiteralNode(fieldValue, null, null); + result.put(field, fieldNode); + } + } + } + } + } + + return result; + } + + + @Override + public List<TextHit> query(Node property, String qs, String graphURI, String lang) { + return query(property, qs, graphURI, lang, MAX_RESULTS); + } + + /** + * Query the ElasticSearch for the given Node, with the given query String and limit. 
+ * @param property the node property to make a search for + * @param qs the query string + * @param limit limit on the number of records to return + * @return List of {@link TextHit}s containing the documents that have been found + */ + @Override + public List<TextHit> query(Node property, String qs, String graphURI, String lang, int limit) { + if(property != null) { + qs = parse(property.getLocalName(), qs, lang); + } else { + qs = parse(null, qs, lang); + } + + LOGGER.debug("Querying ElasticSearch for QueryString: " + qs); + SearchResponse response = client.prepareSearch(indexName) + .setTypes(docDef.getEntityField()) + .setQuery(QueryBuilders.queryStringQuery(qs)) + // Not fetching the source because we are currently not interested + // in the actual values but only Id of the document. This will also speed up search + .setFetchSource(false) + .setFrom(0).setSize(limit) + .get(); + + List<TextHit> results = new ArrayList<>() ; + for (SearchHit hit : response.getHits()) { + + //It has been decided to return NULL literal values for now. 
+ String entityField = hit.getId(); + Node entityNode = TextQueryFuncs.stringToNode(entityField); + Float score = hit.getScore(); + TextHit textHit = new TextHit(entityNode, score, null); + results.add(textHit); + + } + return results; + } + + @Override + public EntityDefinition getDocDef() { + return docDef ; + } + + private String parse(String fieldName, String qs, String lang) { + if(fieldName != null && !fieldName.isEmpty()) { + if(lang != null && !lang.equals("none")) { + if (!ASTERISK.equals(lang)) { + fieldName = fieldName + UNDERSCORE + lang.replaceAll(DASH, UNDERSCORE); + qs = fieldName + COLON + qs; + } else { + if(!qs.contains("\\*")) { + fieldName = fieldName + ASTERISK; + qs = fieldName + COLON + qs; + } + } + + } else { + //Lang is null, but field name is not null + qs = fieldName + COLON + qs; + + } + } + //We do this to enable wild card search + return qs.replaceAll("\\*", "\\\\*"); + + } + + private String normalizeFieldName(String fieldName, String lang) { + //We know that the lang field is not null already + StringBuilder sb = new StringBuilder(fieldName); + return sb.append(UNDERSCORE).append(lang.replaceAll(DASH,UNDERSCORE)).toString(); + + + } + +} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java index a14f8c6..80b2f7e 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextAssembler.java @@ -30,12 +30,14 @@ public class TextAssembler Assembler.general.implementWith(TextVocab.entityMap, new EntityDefinitionAssembler()) ; Assembler.general.implementWith(TextVocab.textIndexLucene, new 
TextIndexLuceneAssembler()) ; + Assembler.general.implementWith(TextVocab.textIndexES, new TextIndexESAssembler()) ; Assembler.general.implementWith(TextVocab.standardAnalyzer, new StandardAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.simpleAnalyzer, new SimpleAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.keywordAnalyzer, new KeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.lowerCaseKeywordAnalyzer, new LowerCaseKeywordAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.localizedAnalyzer, new LocalizedAnalyzerAssembler()) ; Assembler.general.implementWith(TextVocab.configurableAnalyzer, new ConfigurableAnalyzerAssembler()) ; + } } http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexESAssembler.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexESAssembler.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexESAssembler.java new file mode 100644 index 0000000..f677190 --- /dev/null +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextIndexESAssembler.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.query.text.assembler; + +import org.apache.jena.assembler.Assembler; +import org.apache.jena.assembler.Mode; +import org.apache.jena.assembler.assemblers.AssemblerBase; +import org.apache.jena.query.text.*; +import org.apache.jena.rdf.model.Resource; +import org.apache.jena.sparql.util.graph.GraphUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; + +import static org.apache.jena.query.text.assembler.TextVocab.*; + +public class TextIndexESAssembler extends AssemblerBase { + + private static Logger LOGGER = LoggerFactory.getLogger(TextIndexESAssembler.class) ; + + protected static final String COMMA = ","; + protected static final String COLON = ":"; + /* + <#index> a :TextIndexES ; + text:serverList "127.0.0.1:9300,127.0.0.2:9400,127.0.0.3:9500" ; #Comma separated list of hosts:ports + text:clusterName "elasticsearch" ; + text:shards "1" ; + text:replicas "1" ; + text:entityMap <#entMap> ; + . + */ + + @SuppressWarnings("resource") + @Override + public TextIndex open(Assembler a, Resource root, Mode mode) { + try { + String listOfHostsAndPorts = GraphUtils.getAsStringValue(root, pServerList) ; + if(listOfHostsAndPorts == null || listOfHostsAndPorts.isEmpty()) { + throw new TextIndexException("Mandatory property text:serverList (containing the comma-separated list of host:port) property is not specified. " + + "An example value for the property: 127.0.0.1:9300"); + } + String[] hosts = listOfHostsAndPorts.split(COMMA); + Map<String,Integer> hostAndPortMapping = new HashMap<>(); + for(String host : hosts) { + String[] hostAndPort = host.split(COLON); + if(hostAndPort.length < 2) { + LOGGER.error("Either the host or the port value is missing.Please specify the property in host:port format. " + + "Both parts are mandatory. Ignoring this value. 
Moving to the next one."); + continue; + } + hostAndPortMapping.put(hostAndPort[0], Integer.valueOf(hostAndPort[1])); + } + + String clusterName = GraphUtils.getAsStringValue(root, pClusterName); + if(clusterName == null || clusterName.isEmpty()) { + LOGGER.warn("ClusterName property is not specified. Defaulting to 'elasticsearch'"); + clusterName = "elasticsearch"; + } + + String numberOfShards = GraphUtils.getAsStringValue(root, pShards); + if(numberOfShards == null || numberOfShards.isEmpty()) { + LOGGER.warn("shards property is not specified. Defaulting to '1'"); + numberOfShards = "1"; + } + + String replicationFactor = GraphUtils.getAsStringValue(root, pReplicas); + if(replicationFactor == null || replicationFactor.isEmpty()) { + LOGGER.warn("replicas property is not specified. Defaulting to '1'"); + replicationFactor = "1"; + } + + String indexName = GraphUtils.getAsStringValue(root, pIndexName); + if(indexName == null || indexName.isEmpty()) { + LOGGER.warn("index Name property is not specified. Defaulting to 'jena-text'"); + indexName = "jena-text"; + } + + Resource r = GraphUtils.getResourceValue(root, pEntityMap) ; + EntityDefinition docDef = (EntityDefinition)a.open(r) ; + TextIndexConfig config = new TextIndexConfig(docDef); + + //We have to create an ES specific settings class in order to pass the Index Initialization specific properties. + ESSettings settings = new ESSettings().builder() + .clusterName(clusterName) + .hostAndPortMap(hostAndPortMapping) + .shards(Integer.valueOf(numberOfShards)) + .replicas(Integer.valueOf(replicationFactor)) + .indexName(indexName) + .build(); + + return TextDatasetFactory.createESIndex(config, settings) ; + } catch (Exception e) { + throw new TextIndexException("An exception occurred while trying to open/load the Assembler configuration. 
", e); + } + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java ---------------------------------------------------------------------- diff --git a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java index 582f2ef..719d404 100644 --- a/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java +++ b/jena-text/src/main/java/org/apache/jena/query/text/assembler/TextVocab.java @@ -78,5 +78,13 @@ public class TextVocab public static final Resource lowerCaseFilter = Vocab.resource(NS, "LowerCaseFilter"); public static final Resource asciiFoldingFilter = Vocab.resource(NS, "ASCIIFoldingFilter"); + //Elasticsearch + public static final Resource textIndexES = Vocab.resource(NS, "TextIndexES") ; + public static final Property pServerList = Vocab.property(NS, "serverList"); + public static final Property pClusterName = Vocab.property(NS, "clusterName"); + public static final Property pShards = Vocab.property(NS, "shards"); + public static final Property pReplicas = Vocab.property(NS, "replicas"); + public static final Property pIndexName = Vocab.property(NS, "indexName"); + } http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/resources/data-es.ttl ---------------------------------------------------------------------- diff --git a/jena-text/src/main/resources/data-es.ttl b/jena-text/src/main/resources/data-es.ttl new file mode 100644 index 0000000..8813e86 --- /dev/null +++ b/jena-text/src/main/resources/data-es.ttl @@ -0,0 +1,46 @@ + # Licensed to the Apache Software Foundation (ASF) under one + # or more contributor license agreements. See the NOTICE file + # distributed with this work for additional information + # regarding copyright ownership. 
The ASF licenses this file + # to you under the Apache License, Version 2.0 (the + # "License"); you may not use this file except in compliance + # with the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + +@prefix : <http://example/> . +@prefix xsd: <http://www.w3.org/2001/XMLSchema#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . + +:s a :Thing ; + :p :123 ; + rdfs:label "Thing 1"@en ; + rdfs:comment "It's a complicated comment"@en ; + rdfs:comment "this is another comment"@en ; + rdfs:comment "this is en GB comment"@en-GB ; + :id 123 . + +:s1 a :Thing ; + :p :123 ; + rdfs:label "Thing 2"@en ; + :id 123 . + +:s2 a :Thing ; + :p :123 ; + rdfs:label "Whatever"@en ; + :id 123 . + + +:x1 rdfs:label "X2 word"@en . +:x1 rdfs:label "X1 another word" . +:x2 rdfs:label "X2 word" . +:x3 rdfs:label "X3 word" . +:x3 rdfs:label "X4 word" . +:x1 rdfs:label "X9 word" . \ No newline at end of file http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/main/resources/text-config-es.ttl ---------------------------------------------------------------------- diff --git a/jena-text/src/main/resources/text-config-es.ttl b/jena-text/src/main/resources/text-config-es.ttl new file mode 100644 index 0000000..7a03384 --- /dev/null +++ b/jena-text/src/main/resources/text-config-es.ttl @@ -0,0 +1,64 @@ + # Licensed to the Apache Software Foundation (ASF) under one + # or more contributor license agreements. See the NOTICE file + # distributed with this work for additional information + # regarding copyright ownership. 
The ASF licenses this file + # to you under the Apache License, Version 2.0 (the + # "License"); you may not use this file except in compliance + # with the License. You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + + ## Example of a TDB dataset and text index for ElasticSearch + +@prefix : <http://localhost/jena_example/#> . +@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> . +@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . +@prefix tdb: <http://jena.hpl.hp.com/2008/tdb#> . +@prefix ja: <http://jena.hpl.hp.com/2005/11/Assembler#> . +@prefix text: <http://jena.apache.org/text#> . + +# TDB +[] ja:loadClass "org.apache.jena.tdb.TDB" . +tdb:DatasetTDB rdfs:subClassOf ja:RDFDataset . +tdb:GraphTDB rdfs:subClassOf ja:Model . + +# Text +[] ja:loadClass "org.apache.jena.query.text.TextQuery" . +text:TextDataset rdfs:subClassOf ja:RDFDataset . +text:TextIndexES rdfs:subClassOf text:TextIndex . + +## --------------------------------------------------------------- +## This URI must be fixed - it's used to assemble the text dataset. + +:text_dataset rdf:type text:TextDataset ; + text:dataset <#dataset> ; + text:index <#indexES> ; + . + +<#dataset> rdf:type tdb:DatasetTDB ; + tdb:location "--mem--" ; + . + +<#indexES> a text:TextIndexES ; + text:serverList "127.0.0.1:9300" ; # A comma-separated list of Host:Port values of the ElasticSearch Cluster nodes. + text:clusterName "elasticsearch" ; # Name of the ElasticSearch Cluster. If not specified defaults to 'elasticsearch' + text:shards "1" ; # The number of shards for the index. 
Defaults to 1 + text:replicas "1" ; # The number of replicas for the index. Defaults to 1 + text:indexName "jena-text" ; # Name of the Index. defaults to jena-text + text:entityMap <#entMap> ; + . + +<#entMap> a text:EntityMap ; + text:entityField "uri" ; # Defines the Document Type in the ES Index + text:defaultField "text" ; ## Must be defined in the text:maps + text:map ( + # rdfs:label + [ text:field "text" ; text:predicate rdfs:label ] + [ text:field "comment" ; text:predicate rdfs:comment ] + ) . http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java ---------------------------------------------------------------------- diff --git a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java index 4cb6c21..7259b11 100644 --- a/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java +++ b/jena-text/src/test/java/org/apache/jena/query/text/TS_Text.java @@ -18,12 +18,12 @@ package org.apache.jena.query.text; -import org.apache.jena.query.text.assembler.TestEntityMapAssembler ; -import org.apache.jena.query.text.assembler.TestTextDatasetAssembler ; -import org.apache.jena.query.text.assembler.TestTextIndexLuceneAssembler ; -import org.junit.runner.RunWith ; -import org.junit.runners.Suite ; -import org.junit.runners.Suite.SuiteClasses ; +import org.apache.jena.query.text.assembler.TestEntityMapAssembler; +import org.apache.jena.query.text.assembler.TestTextDatasetAssembler; +import org.apache.jena.query.text.assembler.TestTextIndexLuceneAssembler; +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import org.junit.runners.Suite.SuiteClasses; @RunWith(Suite.class) @SuiteClasses({ @@ -50,4 +50,4 @@ import org.junit.runners.Suite.SuiteClasses ; }) public class TS_Text -{ } +{} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/test/java/org/apache/jena/query/text/it/BaseESTest.java 
---------------------------------------------------------------------- diff --git a/jena-text/src/test/java/org/apache/jena/query/text/it/BaseESTest.java b/jena-text/src/test/java/org/apache/jena/query/text/it/BaseESTest.java new file mode 100644 index 0000000..195b4b3 --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/it/BaseESTest.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jena.query.text.it; + +import org.apache.jena.query.text.EntityDefinition; +import org.apache.jena.query.text.TextIndexConfig; +import org.apache.jena.query.text.TextIndexES; +import org.apache.jena.vocabulary.RDFS; +import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest; +import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest; +import org.elasticsearch.client.transport.TransportClient; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.transport.InetSocketTransportAddress; +import org.elasticsearch.transport.client.PreBuiltTransportClient; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; + +import java.net.InetAddress; +import java.net.UnknownHostException; + +/** + * Base Class for ElasticSearch based Integration tests. + */ +public abstract class BaseESTest { + + protected static TransportClient transportClient; + + private final static String ADDRESS = "127.0.0.1"; + private final static int PORT = 9500; + private final static String CLUSTER_NAME = "elasticsearch"; + protected final static String INDEX_NAME = "jena-text"; + + protected static TextIndexES classToTest; + + static final String DOC_TYPE = "text"; + + /** + * Make sure that we have connectivity to the locally running ES node. 
+ * The ES is started during the pre-integration-test phase + */ + @BeforeClass + public static void setupTransportClient() { + + Settings settings = Settings.builder().put("cluster.name", CLUSTER_NAME).build(); + transportClient = new PreBuiltTransportClient(settings); + try { + transportClient.addTransportAddress( + new InetSocketTransportAddress(InetAddress.getByName(ADDRESS), PORT) + ); + } catch (UnknownHostException ex) { + Assert.fail("Failed to create transport client" + ex.getMessage()); + } + classToTest = new TextIndexES(config(), transportClient, INDEX_NAME); + Assert.assertNotNull("Transport client was not created successfully", transportClient); + + + } + + /** + * Make sure that we always start with a clean index. + * This will help keep the tests isolated. + * @throws Exception if creating the index or checking that it exists fails + */ + @Before + public void beforeTest() throws Exception{ + //Create Index + transportClient.admin().indices().prepareCreate(INDEX_NAME).get(); + Assert.assertTrue(transportClient.admin().indices().exists(new IndicesExistsRequest(INDEX_NAME)).get().isExists()); + + } + + /** + * Make sure that we always delete the index when completed with the test. + * This will help keep the tests isolated. + * @throws Exception if deleting the index fails + */ + @After + public void afterTest() throws Exception{ + //Delete Index + transportClient.admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).get(); + } + + /** + * Simple Config for text index + * @return a {@link TextIndexConfig} wrapping an entity definition with "label" and "comment" fields and a "lang" language field + */ + private static TextIndexConfig config() { + EntityDefinition ed = new EntityDefinition(DOC_TYPE, "label", RDFS.label); + ed.set("comment", RDFS.comment.asNode()); + ed.setLangField("lang"); + TextIndexConfig config = new TextIndexConfig(ed); + return config; + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/1c1325c5/jena-text/src/test/java/org/apache/jena/query/text/it/TextIndexESIT.java ---------------------------------------------------------------------- diff --git a/jena-text/src/test/java/org/apache/jena/query/text/it/TextIndexESIT.java 
b/jena-text/src/test/java/org/apache/jena/query/text/it/TextIndexESIT.java new file mode 100644 index 0000000..c806d5b --- /dev/null +++ b/jena-text/src/test/java/org/apache/jena/query/text/it/TextIndexESIT.java @@ -0,0 +1,306 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.jena.query.text.it; + +import org.apache.jena.graph.Node; +import org.apache.jena.query.text.Entity; +import org.apache.jena.query.text.TextHit; +import org.apache.jena.vocabulary.RDFS; +import org.elasticsearch.action.get.GetResponse; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +/** + * Integration test class for {@link org.apache.jena.query.text.TextIndexES} + */ +public class TextIndexESIT extends BaseESTest { + + @Test + public void testAddEntity() { + String labelKey = "label"; + String labelValue = "this is a sample Label"; + Assert.assertNotNull(classToTest); + Entity entityToAdd = entity("http://example/x3", labelKey, labelValue); + GetResponse response = addEntity(entityToAdd); + Assert.assertTrue(response.getSource().containsKey(labelKey)); + Assert.assertEquals(labelValue, ((List)response.getSource().get(labelKey)).get(0)); + } + + @Test + public void testDeleteEntity() { + testAddEntity(); + String labelKey = "label"; + String labelValue = "this is a sample Label"; + //Now Delete the entity + classToTest.deleteEntity(entity("http://example/x3", labelKey, labelValue)); + + //Try to find it + GetResponse response = transportClient.prepareGet(INDEX_NAME, DOC_TYPE, "http://example/x3").get(); + //It Should Exist + Assert.assertTrue(response.isExists()); + //But the field value should now be empty + Assert.assertEquals("http://example/x3", response.getId()); + Assert.assertTrue(response.getSource().containsKey(labelKey)); + Assert.assertEquals(0, ((List)response.getSource().get(labelKey)).size()); + } + + @Test + public void testDeleteWhenNoneExists() { + + GetResponse response = transportClient.prepareGet(INDEX_NAME, DOC_TYPE, "http://example/x3").get(); + Assert.assertFalse(response.isExists()); + Assert.assertNotNull(classToTest); + classToTest.deleteEntity(entity("http://example/x3", "label", "doesnt matter")); + response = 
transportClient.prepareGet(INDEX_NAME, DOC_TYPE, "http://example/x3").get(); + Assert.assertFalse(response.isExists()); + + } + + @Test + public void testQuery() { + testAddEntity(); + // This will search for value "this" only in the label field + List<TextHit> result = classToTest.query(RDFS.label.asNode(), "this", null, null, 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + + //This will search for value "this" across all the fields + result = classToTest.query(null, "this", null, null, 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + + //This will search for value "this" in the label_en field, if it exists. In this case it doesnt so we should get zero results + result = classToTest.query(RDFS.label.asNode(), "this", null, "en", 10); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + + } + + @Test + public void testQueryWhenNoneExists() { + List<TextHit> result = classToTest.query(RDFS.label.asNode(), "this",null, null, 1); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + } + + @Test + public void testGet() { + testAddEntity(); + //Now Get the same entity + Map<String, Node> response = classToTest.get("http://example/x3"); + Assert.assertNotNull(response); + Assert.assertEquals(2, response.size()); + } + + @Test + public void testGetWhenNoneExists() { + Map<String, Node> response = classToTest.get("http://example/x3"); + Assert.assertNotNull(response); + Assert.assertEquals(0, response.size()); + } + + /** + * This is an elaborate test that does the following: + * 1. Create a Document with ID: "http://example/x3" , label: Germany and lang:en + * 2. Makes sure the document is created successfully and is searchable based on the label + * 3. Next add another label to the same Entity with ID: "http://example/x3", label:Deutschland and lang:de + * 4. Makes sure that the document is searchable both with old (Germany) and new (Deutschland) values. + * 5. 
Next, it deletes the value: Germany created in step 1. + * 6. Makes sure that the document is searchable with value: Deutschland but NOT with value: Germany + * 7. Finally, delete the value: Deutschland + * 8. The document should not be searchable with value: Deutschland + * 9. The document should still exist + */ + @Test + public void testMultipleValuesinMultipleLanguages() throws InterruptedException{ + addEntity(entity("http://example/x3", "label", "Germany", "en")); + List<TextHit> result = classToTest.query(RDFS.label.asNode(), "Germany",null, "en", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + //Next add another label to the same entity + addEntity(entity("http://example/x3", "label", "Deutschland", "de")); + //Query with old value + result = classToTest.query(RDFS.label.asNode(), "Germany", null, "en", 10); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Query with new value + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, "de", 10); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Query without lang value + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, null, 10); + Assert.assertEquals(0, result.size()); + + //Query with lang value as * + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, "*", 10); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Now let's delete the Germany label + classToTest.deleteEntity(entity("http://example/x3", "label", "Germany", "en")); + + TimeUnit.SECONDS.sleep(1); + + //We should NOT be able to find the entity using Germany label anymore + result = classToTest.query(RDFS.label.asNode(), "Germany", null, null, 10); + 
Assert.assertEquals(0, result.size()); + + result = classToTest.query(RDFS.label.asNode(), "Germany", null, "en", 10); + Assert.assertEquals(0, result.size()); + + //But we should be able to find it with the Deutschland label value + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, "de", 10); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Now lets delete the Deutschland label + classToTest.deleteEntity(entity("http://example/x3", "label", "Deutschland", "de")); + + //if the Delete and query happens almost instantly, then there are chances to still get false positives + //Thus sleeping for couple of seconds to give ES time to clean up. + TimeUnit.SECONDS.sleep(1); + //We should NOT be able to find the entity using Deutschland label anymore + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, null, 10); + Assert.assertEquals(0, result.size()); + + result = classToTest.query(RDFS.label.asNode(), "Deutschland", null, "de", 10); + Assert.assertEquals(0, result.size()); + + + } + + /** + * This test tries to save the same label values in different languages and makes sure that they are saved properly + */ + @Test + public void testSameLabelInDifferentLanguages() throws InterruptedException{ + addEntity(entity("http://example/x3", "label", "Berlin", "en")); + List<TextHit> result = classToTest.query(RDFS.label.asNode(), "Berlin", null, "en", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Next add Berlin with 'de' language + addEntity(entity("http://example/x3", "label", "Berlin", "de")); + result = classToTest.query(RDFS.label.asNode(), "Berlin", null, "de", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Now let's remove Berlin for 
language 'en' + classToTest.deleteEntity(entity("http://example/x3", "label", "Berlin", "en")); + //We should still be able to find the Document + result = classToTest.query(RDFS.label.asNode(), "Berlin", null, "de", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Now Lets remove Berlin for language 'de' + classToTest.deleteEntity(entity("http://example/x3", "label", "Berlin", "de")); + + //if the Delete and query happens almost instantly, then there are chances to still get false positives + //Thus sleeping for couple of seconds to give ES time to clean up + TimeUnit.SECONDS.sleep(1); + //Now we should NOT be able to find the document + result = classToTest.query(RDFS.label.asNode(), "Berlin", null, "de", 10); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + } + + @Test + public void testLanguageTagSubCodes() { + addEntity(entity("http://example/x3", "label", "color", "en-US")); + addEntity(entity("http://example/x3", "label", "colour", "en-GB")); + + //Let's find it using color + List<TextHit> result = classToTest.query(RDFS.label.asNode(), "color", null, "en-US", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + result = classToTest.query(RDFS.label.asNode(), "color", null, "none", 10); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + + //Next Lets find it using colour + result = classToTest.query(RDFS.label.asNode(), "colour", null, "en-GB", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + result = classToTest.query(RDFS.label.asNode(), "colour", null, "none", 10); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + + //Next lets find it after specifying the 
lang parameter + result = classToTest.query(RDFS.label.asNode(), "colour",null, "en*", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + result = classToTest.query(RDFS.label.asNode(), "color",null, "en*", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //Now lets find it by specifying exact lang values + result = classToTest.query(RDFS.label.asNode(), "colour",null, "en-GB", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + result = classToTest.query(RDFS.label.asNode(), "color",null, "en-US", 10); + Assert.assertNotNull(result); + Assert.assertEquals(1, result.size()); + Assert.assertEquals("http://example/x3", result.get(0).getNode().getURI()); + + //We should NOT be able to find anything for wrong language + result = classToTest.query(RDFS.label.asNode(), "color",null, "en-GB", 10); + Assert.assertNotNull(result); + Assert.assertEquals(0, result.size()); + + + } + private Entity entity(String id, String fieldName, String fieldValue) { + return entity(id, fieldName, fieldValue, null); + } + + private Entity entity(String id, String fieldName, String fieldValue, String lang) { + Entity entity = new Entity(id, null, lang, null); + entity.put(fieldName, fieldValue); + return entity; + } + + private GetResponse addEntity(Entity entityToAdd) { + classToTest.addEntity(entityToAdd); + GetResponse response = transportClient.prepareGet(INDEX_NAME, DOC_TYPE, entityToAdd.getId()).get(); + + Assert.assertNotNull(response); + Assert.assertEquals(entityToAdd.getId(), response.getId()); + return response; + + } + +}
