jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/396474 )
Change subject: Fix Updater performance issue ...................................................................... Fix Updater performance issue Do not use removeAll - just assemble the collections in straightforward way. Bug: T182464 Change-Id: I02530690e877cc1b60f11dc2fc8b97622c742d68 --- M tools/src/main/java/org/wikidata/query/rdf/tool/change/RecentChangesPoller.java M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java 2 files changed, 39 insertions(+), 23 deletions(-) Approvals: jenkins-bot: Verified Gehel: Looks good to me, approved diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/change/RecentChangesPoller.java b/tools/src/main/java/org/wikidata/query/rdf/tool/change/RecentChangesPoller.java index caf3ff6..85b6208 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/change/RecentChangesPoller.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/change/RecentChangesPoller.java @@ -97,10 +97,6 @@ int batchSize, Map<Long, Boolean> seenIDs, int tailSeconds) { this.wikibase = wikibase; this.firstStartTime = firstStartTime; - // FIXME: temporary plug for T182464, remove after it's fixed - if (batchSize > 200) { - batchSize = 200; - } this.batchSize = batchSize; this.seenIDs = seenIDs; this.tailSeconds = tailSeconds; diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java index e6e891e..b048f6d 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java @@ -2,7 +2,6 @@ import static com.google.common.collect.Sets.newHashSetWithExpectedSize; import static com.google.common.io.Resources.getResource; -import static org.wikidata.query.rdf.tool.FilteredStatements.filtered; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -372,7 +371,7 @@ } /** - * Provides the SPARQL needed to synchronize the data statements. + * Provides the SPARQL needed to synchronize the data statements for a single entity. * * @param entityId id of the entity to sync * @param statements all known statements about the entity @@ -390,17 +389,13 @@ b.bind("uris.statement", uris.statement()); b.bindStatements("insertStatements", statements); - Collection<Statement> entityStatements = filtered(statements).withSubject(uris.entity() + entityId); + List<Statement> entityStatements = new ArrayList<>(); + List<Statement> statementStatements = new ArrayList<>(); + Set<Statement> aboutStatements = new HashSet<>(); + classifyStatements(statements, entityId, entityStatements, statementStatements, aboutStatements); + b.bindValues("entityStatements", entityStatements); - - Collection<Statement> statementStatements = filtered(statements).withSubjectStarts(uris.statement()); b.bindValues("statementStatements", statementStatements); - - Collection<Statement> aboutStatements = new HashSet<Statement>(statements); - aboutStatements.removeAll(entityStatements); - aboutStatements.removeAll(statementStatements); - aboutStatements.removeAll(filtered(statements).withSubjectStarts(uris.value())); - aboutStatements.removeAll(filtered(statements).withSubjectStarts(uris.reference())); b.bindValues("aboutStatements", aboutStatements); if (valueList != null && !valueList.isEmpty()) { @@ -412,6 +407,36 @@ } return b.toString(); + } + + /** + * Sort statements into a set of specialized collections, by subject. + * @param statements List of statements to process + * @param entityId + * @param entityStatements subject is entity + * @param statementStatements subject is any statement + * @param aboutStatements not entity, not statement, not value and not reference + */ + private void classifyStatements(Collection<Statement> statements, + String entityId, Collection<Statement> entityStatements, + Collection<Statement> statementStatements, + Collection<Statement> aboutStatements) { + for (Statement statement: statements) { + String s = statement.getSubject().stringValue(); + if (s.equals(uris.entity() + entityId)) { + entityStatements.add(statement); + } + if (s.startsWith(uris.statement())) { + statementStatements.add(statement); + } + if (!s.equals(uris.entity() + entityId) + && !s.startsWith(uris.statement()) + && !s.startsWith(uris.value()) + && !s.startsWith(uris.reference()) + ) { + aboutStatements.add(statement); + } + } } /** @@ -433,6 +458,8 @@ List<Statement> insertStatements = new ArrayList<>(); List<Statement> entityStatements = new ArrayList<>(); + List<Statement> statementStatements = new ArrayList<>(); + Set<Statement> aboutStatements = new HashSet<>(); Set<String> valueSet = new HashSet<>(); for (final Change change : changes) { @@ -442,7 +469,7 @@ } entityIds.add(change.entityId()); insertStatements.addAll(change.getStatements()); - entityStatements.addAll(filtered(change.getStatements()).withSubject(uris.entity() + change.entityId())); + classifyStatements(change.getStatements(), change.entityId(), entityStatements, statementStatements, aboutStatements); valueSet.addAll(change.getCleanupList()); } @@ -456,14 +483,7 @@ b.bindStatements("insertStatements", insertStatements); b.bindValues("entityStatements", entityStatements); - Collection<Statement> statementStatements = filtered(insertStatements).withSubjectStarts(uris.statement()); b.bindValues("statementStatements", statementStatements); - - Collection<Statement> aboutStatements = new HashSet<>(insertStatements); - aboutStatements.removeAll(entityStatements); - aboutStatements.removeAll(statementStatements); - aboutStatements.removeAll(filtered(insertStatements).withSubjectStarts(uris.value())); - aboutStatements.removeAll(filtered(insertStatements).withSubjectStarts(uris.reference())); b.bindValues("aboutStatements", aboutStatements); if (!valueSet.isEmpty()) { -- To view, visit https://gerrit.wikimedia.org/r/396474 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I02530690e877cc1b60f11dc2fc8b97622c742d68 Gerrit-PatchSet: 5 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits