Author: rwesten
Date: Wed Apr 17 10:38:48 2013
New Revision: 1468829
URL: http://svn.apache.org/r1468829
Log:
fixes STANBOL-1038; switches to shade plugin for runnable jar (STANBOL-1039);
implemented simple EntityScoreProvider (STANBOL-1040); also updated the default
configuration to use the new entity score provider and to provide an example
for filtering Entities based on the feature class
Added:
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java
(with props)
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties
(with props)
Removed:
stanbol/trunk/entityhub/indexing/geonames/src/main/assembly/
Modified:
stanbol/trunk/entityhub/indexing/geonames/pom.xml
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesIndexingSource.java
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/HierarchyProcessor.java
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/indexing.properties
Modified: stanbol/trunk/entityhub/indexing/geonames/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/pom.xml?rev=1468829&r1=1468828&r2=1468829&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/geonames/pom.xml (original)
+++ stanbol/trunk/entityhub/indexing/geonames/pom.xml Wed Apr 17 10:38:48 2013
@@ -65,28 +65,38 @@
</configuration>
</plugin>
<plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <version>2.2</version>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
<configuration>
- <descriptors>
- <descriptor>src/main/assembly/assembly.xml</descriptor>
- </descriptors>
<archive>
<manifest>
- <addClasspath>true</addClasspath>
+ <addClasspath>true</addClasspath>
<mainClass>org.apache.stanbol.entityhub.indexing.Main</mainClass>
</manifest>
</archive>
</configuration>
-<!-- <executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <configuration>
+ <artifactSet>
+ <includes>
+ <include>*</include>
+ </includes>
+ </artifactSet>
+ <transformers>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+ </transformers>
+ </configuration>
+ <executions>
<execution>
- <id>make-assembly</id>
<phase>package</phase>
<goals>
- <goal>single</goal>
+ <goal>shade</goal>
</goals>
</execution>
- </executions> -->
+ </executions>
</plugin>
</plugins>
</build>
Added:
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java?rev=1468829&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java
(added)
+++
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java
Wed Apr 17 10:38:48 2013
@@ -0,0 +1,69 @@
+package org.apache.stanbol.entityhub.indexing.geonames;
+
+import java.util.Map;
+
+import org.apache.stanbol.entityhub.indexing.core.EntityScoreProvider;
+import org.apache.stanbol.entityhub.servicesapi.model.Reference;
+import org.apache.stanbol.entityhub.servicesapi.model.Representation;
+
+public class GeonamesEntityScoreProvider implements EntityScoreProvider {
+
+ private static final String FCLASS_A =
GeonamesConstants.GEONAMES_ONTOLOGY_NS +"A";
+ private static final String FCLASS_P =
GeonamesConstants.GEONAMES_ONTOLOGY_NS +"P";
+ private static final int MAX_POPULATION = 1000000;
+ private static final double FACT = Math.log1p(1000000);
+ private static final Float DEFAULT_SCORE = Float.valueOf(0.3f);
+
+ @Override
+ public void setConfiguration(Map<String,Object> config) {
+ }
+
+ @Override
+ public boolean needsInitialisation() {
+ return false;
+ }
+
+ @Override
+ public void initialise() {
+ }
+
+ @Override
+ public void close() {
+ }
+
+ @Override
+ public boolean needsData() {
+ return true;
+ }
+
+ @Override
+ public Float process(String id) throws UnsupportedOperationException {
+ throw new UnsupportedOperationException("This implementation requries
data to process the score");
+ }
+
+ @Override
+ public Float process(Representation entity) throws
UnsupportedOperationException {
+ Reference ref =
entity.getFirstReference(GeonamesPropertyEnum.gn_featureClass.toString());
+ String fclass = ref == null ? null : ref.getReference();
+ //ref =
entity.getFirstReference(GeonamesPropertyEnum.gn_featureCode.toString());
+ //String fCode = ref == null ? null : ref.getReference();
+
+ if(FCLASS_A.equals(fclass)){
+ return Float.valueOf(1f);
+ } else if(FCLASS_P.equals(fclass)){
+ Long population =
entity.getFirst(GeonamesPropertyEnum.gn_population.toString(), Long.class);
+ if(population == null){
+ return Float.valueOf(0.2f); //min population score
+ } else {
+ long p = Math.min(MAX_POPULATION, population.longValue());
+ double fact = Math.log1p(p);
+ //Normalised the score based on the population in the range
+ // [0.2..1.0]
+ return Float.valueOf((float)((fact/FACT*0.8)+0.2));
+ }
+ } else {
+ return DEFAULT_SCORE;
+ }
+ }
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesEntityScoreProvider.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesIndexingSource.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesIndexingSource.java?rev=1468829&r1=1468828&r2=1468829&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesIndexingSource.java
(original)
+++
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/GeonamesIndexingSource.java
Wed Apr 17 10:38:48 2013
@@ -266,7 +266,7 @@ public class GeonamesIndexingSource impl
Integer geoNamesId = Integer.parseInt(id);
//create a new Doc based on the first Element (geonamesID)
Representation doc = valueFactory.createRepresentation(
- new
StringBuilder(GEONAMES_RESOURCE_NS).append(id).toString());
+ new
StringBuilder(GEONAMES_RESOURCE_NS).append(id).append('/').toString());
//add the Integer id so that we do not need to parse it from
the subject URI
doc.add(GeonamesPropertyEnum.idx_id.toString(), geoNamesId);
//add the geonames:Feature type
Modified:
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/HierarchyProcessor.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/HierarchyProcessor.java?rev=1468829&r1=1468828&r2=1468829&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/HierarchyProcessor.java
(original)
+++
stanbol/trunk/entityhub/indexing/geonames/src/main/java/org/apache/stanbol/entityhub/indexing/geonames/HierarchyProcessor.java
Wed Apr 17 10:38:48 2013
@@ -359,7 +359,7 @@ public class HierarchyProcessor implemen
//add country
if(adminIds[0] != null){
doc.add(GeonamesPropertyEnum.gn_parentCountry.toString(),
vf.createReference(
- new
StringBuilder(GeonamesConstants.GEONAMES_RESOURCE_NS).append(adminIds[0]).toString()));
+ new
StringBuilder(GeonamesConstants.GEONAMES_RESOURCE_NS).append(adminIds[0]).append('/').toString()));
parentLevel = Collections.singleton(adminIds[0]);
} else {
parentLevel = Collections.emptySet();
@@ -413,7 +413,7 @@ public class HierarchyProcessor implemen
if(id != null){
refs.add(vf.createReference(
new StringBuilder(GeonamesConstants.GEONAMES_RESOURCE_NS)
- .append(id).toString()));
+ .append(id).append('/').toString()));
}
}
return refs;
Added:
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties?rev=1468829&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties
(added)
+++
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties
Wed Apr 17 10:38:48 2013
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+field=geonames:featureClass
+
+# Only index entities whose geonames:featureClass is A or P
+values=geonames:A;geonames:P
Propchange:
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/entityTypes.properties
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/indexing.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/indexing.properties?rev=1468829&r1=1468828&r2=1468829&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/indexing.properties
(original)
+++
stanbol/trunk/entityhub/indexing/geonames/src/main/resources/indexing/config/indexing.properties
Wed Apr 17 10:38:48 2013
@@ -41,18 +41,19 @@ Synchronized=false
#the configured source is also the default. You can also configure a directory
#if you want to index from multiple dumps (e.g. only specific countries)
entityDataIterable=org.apache.stanbol.entityhub.indexing.geonames.GeonamesIndexingSource,source:geonames/allCountries.zip
-#no support for entity scores
-entityScoreProvider=org.apache.stanbol.entityhub.indexing.core.source.NoEntityScoreProvider
-
+#scores entities based on class (A -> 1.0, P -> based on population [0.2..1], rest -> 0.3)
+entityScoreProvider=org.apache.stanbol.entityhub.indexing.geonames.GeonamesEntityScoreProvider
# ------------
# EntityProcessor
# ------------
# Three processors
+# (0) index only some feature classes
# (1) alternate labels
# (2) hierarchy
# (3) field mappings
# Default Entity Processor configuration
+# (0)
org.apache.stanbol.entityhub.indexing.core.processor.FieldValueFilter,config:entityTypes;
entityProcessor=org.apache.stanbol.entityhub.indexing.geonames.AlternateLabelProcessor;org.apache.stanbol.entityhub.indexing.geonames.HierarchyProcessor;org.apache.stanbol.entityhub.indexing.core.processor.FiledMapperProcessor
# ------------