- Introduce a new save method that supports batching - Use the new batching save method to import Geonames (a lot) faster - Replace the slow-running ASCIIFoldingFilter call with a faster MappingCharFilter call. See https://issues.apache.org/jira/browse/LUCENE-7525 and https://issues.apache.org/jira/browse/SOLR-2013
Project: http://git-wip-us.apache.org/repos/asf/incubator-unomi/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-unomi/commit/ac2c9ba5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-unomi/tree/ac2c9ba5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-unomi/diff/ac2c9ba5 Branch: refs/heads/feature-UNOMI-70-ES5X Commit: ac2c9ba5c6b955048ac47afb402dc80e1213f16f Parents: 69b0403 Author: Serge Huber <[email protected]> Authored: Tue Dec 20 16:38:19 2016 +0100 Committer: Serge Huber <[email protected]> Committed: Tue Dec 20 16:38:46 2016 +0100 ---------------------------------------------------------------------- .../geonames/services/GeonamesServiceImpl.java | 26 +- .../ElasticSearchPersistenceServiceImpl.java | 11 +- .../conditions/ConditionContextHelper.java | 31 +- .../src/main/resources/mapping-FoldToASCII.txt | 3813 ++++++++++++++++++ .../persistence/spi/PersistenceService.java | 11 + 5 files changed, 3879 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java ---------------------------------------------------------------------- diff --git a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java index ebe53a0..2b0a0dc 100644 --- a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java +++ b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java @@ -112,12 +112,15 @@ public class GeonamesServiceImpl implements GeonamesService { ZipInputStream zipInputStream = new ZipInputStream(new FileInputStream(f)); ZipEntry zipEntry = zipInputStream.getNextEntry(); // used to advance to the 
first entry in the ZipInputStream + long fileSize = zipEntry.getSize(); BufferedReader reader = new BufferedReader(new InputStreamReader(zipInputStream, "UTF-8")); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); String line; logger.info("Starting to import geonames database from file {}...", f); - long lineCount = 0; + long charCount = 0; + double lastCompletionPourcentage = 0.0; + long lastCharCount = 0; long importStartTime = System.currentTimeMillis(); while ((line = reader.readLine()) != null) { String[] values = line.split("\t"); @@ -134,14 +137,25 @@ public class GeonamesServiceImpl implements GeonamesService { values[16], values[17], sdf.parse(values[18])); - persistenceService.save(geonameEntry); + persistenceService.save(geonameEntry, true); } - lineCount++; - if (lineCount % 1000 == 0) { - logger.info("{} lines imported from file {}", lineCount, f); + charCount+=line.length(); + if (fileSize > 0) { + double completionPourcentage = 100.0 * charCount / fileSize; + if (completionPourcentage - lastCompletionPourcentage > 1.0) { + int roundedPourcentage = (int) completionPourcentage; + logger.info("{}% imported from file {}", roundedPourcentage, f); + lastCompletionPourcentage = completionPourcentage; + } + } else { + if (charCount - lastCharCount > (100*1024*1024)) { + logger.info("{}MB imported from file {}", charCount / (1024*1024), f); + lastCharCount = charCount; + } } } - logger.info("{} lines from Geonames database file {} imported in {}ms", lineCount, f, System.currentTimeMillis()-importStartTime); + long totalTimeMillis = System.currentTimeMillis()-importStartTime; + logger.info("{} characters from Geonames database file {} imported in {}ms. 
Speed={}MB/s", charCount, f, totalTimeMillis, charCount / (1024*1024) / (totalTimeMillis / 1000)); } catch (Exception e) { logger.error(e.getMessage(), e); } http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java ---------------------------------------------------------------------- diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java index 49e09d0..e7ed75b 100644 --- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java +++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java @@ -739,6 +739,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService, @Override public boolean save(final Item item) { + return save(item, false); + } + + @Override + public boolean save(final Item item, final boolean useBatching) { return new InClassLoaderExecute<Boolean>() { protected Boolean execute(Object... 
args) { @@ -769,7 +774,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService, } try { - indexBuilder.execute().actionGet(); + if (bulkProcessor == null || !useBatching) { + indexBuilder.execute().actionGet(); + } else { + bulkProcessor.add(indexBuilder.request()); + } } catch (IndexNotFoundException e) { if (existingIndexNames.contains(index)) { existingIndexNames.remove(index); http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java ---------------------------------------------------------------------- diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java index 63964e1..6f3f1b4 100644 --- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java +++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java @@ -20,14 +20,18 @@ package org.apache.unomi.persistence.elasticsearch.conditions; import com.google.common.base.Function; import com.google.common.collect.Lists; import org.apache.commons.lang3.StringUtils; -import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; -import org.apache.lucene.util.ArrayUtil; +import org.apache.logging.log4j.core.util.IOUtils; +import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory; +import org.apache.lucene.analysis.util.ClasspathResourceLoader; import org.apache.unomi.api.conditions.Condition; import org.mvel2.MVEL; import org.mvel2.ParserConfiguration; import org.mvel2.ParserContext; +import java.io.IOException; +import java.io.Reader; import java.io.Serializable; +import 
java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -37,6 +41,18 @@ import java.util.concurrent.ConcurrentHashMap; public class ConditionContextHelper { private static Map<String,Serializable> mvelExpressions = new ConcurrentHashMap<>(); + private static MappingCharFilterFactory mappingCharFilterFactory; + static { + Map<String,String> args = new HashMap<>(); + args.put("mapping", "mapping-FoldToASCII.txt"); + mappingCharFilterFactory = new MappingCharFilterFactory(args); + try { + mappingCharFilterFactory.inform(new ClasspathResourceLoader(ConditionContextHelper.class.getClassLoader())); + } catch (IOException e) { + e.printStackTrace(); + } + } + public static Condition getContextualCondition(Condition condition, Map<String, Object> context) { if (context.isEmpty() || !hasContextualParameter(condition.getParameterValues())) { return condition; @@ -124,10 +140,13 @@ public class ConditionContextHelper { public static String foldToASCII(String s) { if (s != null) { s = s.toLowerCase(); - int maxSizeNeeded = 4 * s.length(); - char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, 2)]; - int length = ASCIIFoldingFilter.foldToASCII(s.toCharArray(), 0, output, 0, s.length()); - return new String(output, 0, length); + StringReader stringReader = new StringReader(s); + Reader foldedStringReader = mappingCharFilterFactory.create(stringReader); + try { + return IOUtils.toString(foldedStringReader); + } catch (IOException e) { + e.printStackTrace(); + } } return null; }
