This is an automated email from the ASF dual-hosted git repository.
bchapuis pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-baremaps.git
The following commit(s) were added to refs/heads/main by this push:
new 23e45b63 Geocoder: scoring by population, parsing query text support
AND,OR operators (#712)
23e45b63 is described below
commit 23e45b638b351ac5a643cc850e6a17ccf3f928ef
Author: Perdjesk <[email protected]>
AuthorDate: Wed Jun 28 20:22:10 2023 +0200
Geocoder: scoring by population, parsing query text support AND,OR
operators (#712)
---
baremaps-core/pom.xml | 4 +
.../baremaps/geocoder/GeonamesDocumentMapper.java | 6 +-
.../baremaps/geocoder/GeonamesQueryBuilder.java | 54 +++++++++++-
.../org/apache/baremaps/iploc/IpLocMapper.java | 3 +-
.../baremaps/geocoder/GeonamesIndexTest.java | 95 ++++++++++++++++++++++
.../apache/baremaps/server/GeocoderResource.java | 49 +++++++----
pom.xml | 5 ++
7 files changed, 194 insertions(+), 22 deletions(-)
diff --git a/baremaps-core/pom.xml b/baremaps-core/pom.xml
index 8502ed1d..e9bfe79f 100644
--- a/baremaps-core/pom.xml
+++ b/baremaps-core/pom.xml
@@ -93,6 +93,10 @@
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-expressions</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
diff --git
a/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesDocumentMapper.java
b/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesDocumentMapper.java
index 871e44f9..cd6177ac 100644
---
a/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesDocumentMapper.java
+++
b/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesDocumentMapper.java
@@ -29,19 +29,21 @@ public class GeonamesDocumentMapper implements
Function<GeonamesRecord, Document
document.add(new TextField("name", record.getName(), Field.Store.YES));
document.add(new TextField("country",
IsoCountriesUtils.getCountry(record.getCountryCode()),
Field.Store.YES));
+ // countryCode is not analyzed and thus must be queried using uppercase
document.add(new StringField("countryCode", record.getCountryCode(),
Field.Store.YES));
document.add(new LatLonPoint("point", record.getLatitude(),
record.getLongitude()));
document.add(new StoredField("longitude", record.getLongitude()));
document.add(new StoredField("latitude", record.getLatitude()));
- document.add(new StoredField("asciiname", record.getAsciiname()));
+ document.add(new TextField("asciiname", record.getAsciiname(),
Field.Store.YES));
document.add(new StoredField("alternatenames",
record.getAlternatenames()));
- document.add(new StoredField("featureClass", record.getFeatureClass()));
+ document.add(new StringField("featureClass", record.getFeatureClass(),
Field.Store.YES));
document.add(new StoredField("featureCode", record.getFeatureCode()));
document.add(new StoredField("cc2", record.getCc2()));
document.add(new StoredField("admin1Code", record.getAdmin1Code()));
document.add(new StoredField("admin2Code", record.getAdmin2Code()));
document.add(new StoredField("admin3Code", record.getAdmin3Code()));
document.add(new StoredField("admin4Code", record.getAdmin4Code()));
+ document.add(new NumericDocValuesField("population",
record.getPopulation()));
document.add(new StoredField("population", record.getPopulation()));
if (record.getElevation() != null) {
document.add(new StoredField("elevation", record.getElevation()));
diff --git
a/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesQueryBuilder.java
b/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesQueryBuilder.java
index 57838f80..561cb321 100644
---
a/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesQueryBuilder.java
+++
b/baremaps-core/src/main/java/org/apache/baremaps/geocoder/GeonamesQueryBuilder.java
@@ -14,13 +14,19 @@ package org.apache.baremaps.geocoder;
+import java.text.ParseException;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.expressions.Expression;
+import org.apache.lucene.expressions.SimpleBindings;
+import org.apache.lucene.expressions.js.JavascriptCompiler;
import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.function.FunctionScoreQuery;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.simple.SimpleQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
@@ -35,6 +41,11 @@ public class GeonamesQueryBuilder {
private String countryCode = "";
+ private boolean scoringByPopulation;
+
+ private boolean andOperator;
+
+
public GeonamesQueryBuilder() {
this(GeocoderConstants.ANALYZER);
}
@@ -53,15 +64,37 @@ public class GeonamesQueryBuilder {
return this;
}
- public Query build() {
+ /**
+ * The scoring will take into account the population
+ */
+ public GeonamesQueryBuilder withScoringByPopulation() {
+ this.scoringByPopulation = true;
+ return this;
+ }
+
+ /**
+ * The queryText will be parsed with AND operator between terms instead of
OR.
+ */
+ public GeonamesQueryBuilder withAndOperator() {
+ this.andOperator = true;
+ return this;
+ }
+
+ public Query build() throws ParseException {
var builder = new BooleanQuery.Builder();
if (queryText != null) {
var queryTextEsc = QueryParser.escape(queryText);
if (!queryTextEsc.isBlank()) {
- var fieldWeights = Map.of("name", 1f, "country", 1f);
- var termsQuery = new SimpleQueryParser(analyzer,
fieldWeights).parse(queryTextEsc);
- builder.add(termsQuery, BooleanClause.Occur.SHOULD);
+ var fieldWeights = Map.of("name", 1f, "asciiname", 1f, "country", 1f,
"countryCode", 1f);
+ var parser = new SimpleQueryParser(analyzer, fieldWeights);
+ if (andOperator) {
+ // combine the parsed query terms with AND instead of the default OR
+ parser.setDefaultOperator(BooleanClause.Occur.MUST);
+ }
+ var termsQuery = parser.parse(queryTextEsc);
+ // at least one term of the queryText must be present
+ builder.add(termsQuery, BooleanClause.Occur.MUST);
}
}
@@ -73,6 +106,19 @@ public class GeonamesQueryBuilder {
}
}
+ if (scoringByPopulation) {
+ var query = builder.build();
+ // ln(1+population) to tolerate entries with population=0
+ Expression expr = JavascriptCompiler.compile("_score +
ln(1+population)");
+
+ var bindings = new SimpleBindings();
+ bindings.add("_score", DoubleValuesSource.SCORES);
+ bindings.add("population",
DoubleValuesSource.fromIntField("population"));
+
+ return new FunctionScoreQuery(
+ query,
+ expr.getDoubleValuesSource(bindings));
+ }
return builder.build();
}
}
diff --git
a/baremaps-core/src/main/java/org/apache/baremaps/iploc/IpLocMapper.java
b/baremaps-core/src/main/java/org/apache/baremaps/iploc/IpLocMapper.java
index 78caa1da..0215afa2 100644
--- a/baremaps-core/src/main/java/org/apache/baremaps/iploc/IpLocMapper.java
+++ b/baremaps-core/src/main/java/org/apache/baremaps/iploc/IpLocMapper.java
@@ -15,6 +15,7 @@ package org.apache.baremaps.iploc;
import com.google.common.net.InetAddresses;
import java.io.IOException;
+import java.text.ParseException;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Pattern;
@@ -182,7 +183,7 @@ public class IpLocMapper implements Function<NicObject,
Optional<IpLocObject>> {
* @throws IOException if an I/O error occurs
*/
private Optional<Coordinate> findLocation(String searchTerms, String
countryCode)
- throws IOException {
+ throws IOException, ParseException {
var indexSearcher = searcherManager.acquire();
var geonamesQuery =
new
GeonamesQueryBuilder().queryText(searchTerms).countryCode(countryCode).build();
diff --git
a/baremaps-core/src/test/java/org/apache/baremaps/geocoder/GeonamesIndexTest.java
b/baremaps-core/src/test/java/org/apache/baremaps/geocoder/GeonamesIndexTest.java
new file mode 100644
index 00000000..60f2126a
--- /dev/null
+++
b/baremaps-core/src/test/java/org/apache/baremaps/geocoder/GeonamesIndexTest.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except
+ * in compliance with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express
+ * or implied. See the License for the specific language governing permissions
and limitations under
+ * the License.
+ */
+
+package org.apache.baremaps.geocoder;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import org.apache.baremaps.testing.TestFiles;
+import org.apache.baremaps.utils.FileUtils;
+import org.apache.baremaps.workflow.WorkflowContext;
+import org.apache.baremaps.workflow.tasks.CreateGeonamesIndex;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.SearcherFactory;
+import org.apache.lucene.search.SearcherManager;
+import org.apache.lucene.store.MMapDirectory;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+
+public class GeonamesIndexTest {
+
+ private static Path directory;
+ private static IndexSearcher searcher;
+
+ @BeforeAll
+ public static void beforeAll() throws Exception {
+ // Init the geocoder service
+ directory = Files.createTempDirectory(Paths.get("."), "geocoder_");
+
+ // Create the geonames index
+ var data = TestFiles.resolve("geonames/LI.txt");
+ var task = new CreateGeonamesIndex(data, directory);
+ task.execute(new WorkflowContext());
+ var dir = MMapDirectory.open(directory);
+ var searcherManager = new SearcherManager(dir, new SearcherFactory());
+ searcher = searcherManager.acquire();
+ }
+
+ @AfterAll
+ public static void afterAll() throws IOException {
+ FileUtils.deleteRecursively(directory);
+ }
+
+ @Test
+ void testCreateIndex() throws Exception {
+ var geonamesQuery =
+ new
GeonamesQueryBuilder().queryText("vaduz").countryCode("LI").build();
+ var topDocs = searcher.search(geonamesQuery, 1);
+ var doc =
searcher.doc(Arrays.stream(topDocs.scoreDocs).findFirst().get().doc);
+ assertEquals("Vaduz", doc.getField("name").stringValue());
+ }
+
+ @Test
+ void testOrQuery() throws Exception {
+ var geonamesQuery =
+ new GeonamesQueryBuilder().queryText("vaduz
berlin").countryCode("LI").build();
+ var topDocs = searcher.search(geonamesQuery, 1);
+ var doc =
searcher.doc(Arrays.stream(topDocs.scoreDocs).findFirst().get().doc);
+ assertEquals("Vaduz", doc.getField("name").stringValue());
+ }
+
+ @Test
+ void testAndQueryNoHits() throws Exception {
+ var geonamesQuery =
+ new GeonamesQueryBuilder().queryText("vaduz
berlin").withAndOperator().countryCode("LI")
+ .build();
+ var topDocs = searcher.search(geonamesQuery, 1);
+ assertEquals(0, topDocs.totalHits.value);
+ }
+
+ @Test
+ void testAndQuery() throws Exception {
+ var geonamesQuery =
+ new GeonamesQueryBuilder().queryText("vaduz
liechtenstein").withAndOperator()
+ .countryCode("LI").build();
+ var topDocs = searcher.search(geonamesQuery, 1);
+ var doc =
searcher.doc(Arrays.stream(topDocs.scoreDocs).findFirst().get().doc);
+ assertEquals("Vaduz", doc.getField("name").stringValue());
+ }
+}
diff --git
a/baremaps-server/src/main/java/org/apache/baremaps/server/GeocoderResource.java
b/baremaps-server/src/main/java/org/apache/baremaps/server/GeocoderResource.java
index f359a90d..9b3543d8 100644
---
a/baremaps-server/src/main/java/org/apache/baremaps/server/GeocoderResource.java
+++
b/baremaps-server/src/main/java/org/apache/baremaps/server/GeocoderResource.java
@@ -17,13 +17,17 @@ import static
com.google.common.net.HttpHeaders.CONTENT_TYPE;
import static javax.ws.rs.core.MediaType.APPLICATION_JSON;
import java.io.IOException;
+import java.text.ParseException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.inject.Inject;
import javax.inject.Singleton;
-import javax.ws.rs.*;
+import javax.ws.rs.DefaultValue;
+import javax.ws.rs.GET;
+import javax.ws.rs.QueryParam;
+import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Response;
import org.apache.baremaps.geocoder.GeonamesQueryBuilder;
import org.apache.lucene.search.IndexSearcher;
@@ -54,28 +58,43 @@ public class GeocoderResource {
@GET
@javax.ws.rs.Path("/api/geocoder")
- public Response getIpToLocation(
+ public Response searchLocations(
@QueryParam("queryText") String queryText,
@QueryParam("countryCode") @DefaultValue("") String countryCode,
- @QueryParam("limit") @DefaultValue("10") int limit) throws IOException {
+ @QueryParam("limit") @DefaultValue("10") int limit) {
if (queryText == null) {
throw new
WebApplicationException(Response.status(Response.Status.BAD_REQUEST)
.entity("The queryText parameter is mandatory").build());
}
- var query = new
GeonamesQueryBuilder().queryText(queryText).countryCode(countryCode).build();
- var searcher = searcherManager.acquire();
try {
- var result = searcher.search(query, limit);
- var results =
- Arrays.stream(result.scoreDocs).map(scoreDoc -> asResult(searcher,
scoreDoc)).toList();
- return Response.status(200).header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
- .header(CONTENT_TYPE, APPLICATION_JSON).entity(new
GeocoderResponse(results)).build();
- } catch (IllegalArgumentException e) {
- return Response.status(400).entity(e.getMessage()).build();
+ IndexSearcher searcher = searcherManager.acquire();
+ try {
+ // Searching for locations uses the AND operator between terms so that
every term "adds up"
+ // Examples of queryText:
+ // - "paris", returns paris in france in first results (i.e because of
scoring with
+ // population)
+ // - "paris brazil", returns paris in brazil and not paris in france.
+ var query = new GeonamesQueryBuilder()
+
.queryText(queryText).countryCode(countryCode).withScoringByPopulation()
+ .withAndOperator()
+ .build();
+
+ var result = searcher.search(query, limit);
+ var results =
+ Arrays.stream(result.scoreDocs).map(scoreDoc -> asResult(searcher,
scoreDoc)).toList();
+ return
Response.status(Response.Status.OK).header(ACCESS_CONTROL_ALLOW_ORIGIN, "*")
+ .header(CONTENT_TYPE, APPLICATION_JSON).entity(new
GeocoderResponse(results)).build();
+ } catch (IllegalArgumentException e) {
+ return
Response.status(Response.Status.BAD_REQUEST).entity(e.getMessage()).build();
+ } catch (IOException | ParseException e) {
+ return
Response.status(Response.Status.INTERNAL_SERVER_ERROR).entity(e.getMessage())
+ .build();
+ } finally {
+ searcherManager.release(searcher);
+ }
} catch (IOException e) {
- return Response.status(500).entity(e.getMessage()).build();
- } finally {
- searcherManager.release(searcher);
+ return
Response.status(Response.Status.INTERNAL_SERVER_ERROR).entity(e.getMessage())
+ .build();
}
}
diff --git a/pom.xml b/pom.xml
index 53844624..73cbab07 100644
--- a/pom.xml
+++ b/pom.xml
@@ -349,6 +349,11 @@
<artifactId>lucene-core</artifactId>
<version>${version.lucene}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-expressions</artifactId>
+ <version>${version.lucene}</version>
+ </dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>