Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/203882
Change subject: WIP: Carefully pick up from where we left off
......................................................................
WIP: Carefully pick up from where we left off
Closes T95194
Change-Id: I87616ab6cf63bee8496996e6d8d331f0d9edfe93
---
M common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
M common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
M tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
4 files changed, 81 insertions(+), 27 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/82/203882/1
diff --git a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
index 590706a..32d04e9 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
@@ -30,7 +30,7 @@
/**
* Wikibase exports dump information with this subject.
*/
- public static final Object DUMP = NAMESPACE + "Dump";
+ public static final String DUMP = NAMESPACE + "Dump";
/**
* Predicate for marking Wikibase's Rank.
diff --git a/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java b/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
index 02bada2..c8e46dd 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
@@ -10,6 +10,7 @@
public static WikibaseUris WIKIDATA = new WikibaseUris("www.wikidata.org");
public static WikibaseUris TEST_WIKIDATA = new WikibaseUris("test.wikidata.org");
+ private final String root;
private final String entityData;
private final String entity;
private final String statement;
@@ -18,7 +19,7 @@
private final String qualifier;
public WikibaseUris(String host) {
- String root = "http://" + host;
+ root = "http://" + host;
entityData = root + "/wiki/Special:EntityData/";
entity = root + "/entity/";
statement = entity + "statement/";
@@ -41,6 +42,13 @@
}
/**
+ * The root of the wikibase uris - http://www.wikidata.org for Wikidata.
+ */
+ public String root() {
+ return root;
+ }
+
+ /**
* Prefix wikibase uses for dump information about entities.
*/
public String entityData() {
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
index 012ebab..de7821d 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
@@ -21,6 +21,7 @@
import org.openrdf.model.Statement;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.wikidata.query.rdf.common.uri.WikibaseUris;
import org.wikidata.query.rdf.tool.CliUtils.BasicOptions;
import org.wikidata.query.rdf.tool.CliUtils.MungerOptions;
import org.wikidata.query.rdf.tool.CliUtils.WikibaseOptions;
@@ -28,7 +29,6 @@
import org.wikidata.query.rdf.tool.change.Change.Batch;
import org.wikidata.query.rdf.tool.change.IdChangeSource;
import org.wikidata.query.rdf.tool.change.RecentChangesPoller;
-import org.wikidata.query.rdf.common.uri.WikibaseUris;
import org.wikidata.query.rdf.tool.exception.ContainedException;
import org.wikidata.query.rdf.tool.exception.RetryableException;
import org.wikidata.query.rdf.tool.rdf.Munger;
@@ -121,8 +121,8 @@
}
}
} else {
- log.info("Checking last update time");
- Date lastUpdate = rdfRepository.fetchLastUpdate();
+ log.info("Checking where we left off");
+ Date lastUpdate = rdfRepository.fetchLeftOffTime();
long minStartTime = System.currentTimeMillis() - TimeUnit.DAYS.toMillis(30);
if (lastUpdate == null) {
startTime = minStartTime;
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
index cf47fab..6396b24 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
@@ -10,9 +10,12 @@
import java.util.GregorianCalendar;
import java.util.List;
import java.util.Locale;
+import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.xml.datatype.DatatypeConfigurationException;
+import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import org.apache.http.Consts;
@@ -39,16 +42,20 @@
import org.openrdf.query.resultio.binary.BinaryQueryResultParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.wikidata.query.rdf.common.uri.Ontology;
import org.wikidata.query.rdf.common.uri.Provenance;
import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
import org.wikidata.query.rdf.common.uri.WikibaseUris;
import org.wikidata.query.rdf.tool.exception.ContainedException;
+import org.wikidata.query.rdf.tool.exception.FatalException;
import com.google.common.base.Charsets;
import com.google.common.io.CharStreams;
public class RdfRepository {
private static final Logger log = LoggerFactory.getLogger(RdfRepository.class);
+
+ private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
private final CloseableHttpClient client =
HttpClients.custom().setMaxConnPerRoute(100).setMaxConnTotal(100)
.build();
@@ -189,31 +196,44 @@
}
/**
- * Fetch the last wikidata update time.
+ * Fetch where we left off updating the repository.
*
- * @return the date or null if there are no update times
+ * @return the date or null if we have nowhere to start from
*/
- public Date fetchLastUpdate() {
- // TODO this is very likely inefficient
- TupleQueryResult result = query("PREFIX schema: <http://schema.org/>\nSELECT (MAX(?lastUpdate) as ?maxLastUpdate)\nWHERE { ?s schema:dateModified ?lastUpdate . }");
- try {
- if (!result.hasNext()) {
- return null;
- }
- Binding maxLastUpdate = result.next().getBinding("maxLastUpdate");
- if (maxLastUpdate == null) {
- return null;
- }
- XMLGregorianCalendar xmlCalendar = ((Literal) maxLastUpdate.getValue()).calendarValue();
- /*
- * We convert rather blindly to a GregorianCalendar because we're
- * reasonably sure all the right data is present.
- */
- GregorianCalendar calendar = xmlCalendar.toGregorianCalendar();
- return calendar.getTime();
- } catch (QueryEvaluationException e) {
- throw new RuntimeException("Error evaluating query", e);
+ public Date fetchLeftOffTime() {
+ log.info("Checking for left off time from the updater");
+ StringBuilder b = SchemaDotOrg.prefix(new StringBuilder());
+ b.append("SELECT * WHERE { <").append(uris.root()).append("> schema:dateModified ?date }");
+ Date leftOffTime = dateFromQuery(b.toString());
+ if (leftOffTime != null) {
+ log.info("Found left off time from the updater");
+ return leftOffTime;
}
+ log.info("Checking for left off time from the dump");
+ b = Ontology.prefix(SchemaDotOrg.prefix(new StringBuilder()));
+ b.append("SELECT * WHERE { ontology:Dump schema:dateModified ?date }");
+ return dateFromQuery(b.toString());
+ }
+
+ /**
+ * Update where we left off so when fetchLeftOffTime is next called it
+ * returns leftOffTime so we can continue from there after the updater is
+ * restarted.
+ */
+ public void updateLeftOffTime(Date leftOffTime) {
+ log.debug("Setting last updated time to {}", leftOffTime);
+ UpdateBuilder b = new UpdateBuilder();
+ b.delete(uris.root(), "schema:dateModified", "?o");
+ b.where(uris.root(), "schema:dateModified", "?o");
+ GregorianCalendar c = new GregorianCalendar(UTC, Locale.ROOT);
+ c.setTime(leftOffTime);
+ try {
+ b.insert(uris.root(), "schema:dateModified", DatatypeFactory.newInstance().newXMLGregorianCalendar(c));
+ } catch (DatatypeConfigurationException e) {
+ throw new FatalException("Holy cow datatype configuration exception on default "
+ + "datatype factory. Seems like something really really strange.", e);
+ }
+ execute("update", UPDATE_COUNT_RESPONSE, b.toString());
}
/**
@@ -283,6 +303,32 @@
}
/**
+ * Run a query that returns just a date in the "date" binding and return its
+ * result.
+ */
+ private Date dateFromQuery(String query) {
+ TupleQueryResult result = query(query);
+ try {
+ if (!result.hasNext()) {
+ return null;
+ }
+ Binding maxLastUpdate = result.next().getBinding("date");
+ if (maxLastUpdate == null) {
+ return null;
+ }
+ XMLGregorianCalendar xmlCalendar = ((Literal) maxLastUpdate.getValue()).calendarValue();
+ /*
+ * We convert rather blindly to a GregorianCalendar because we're
+ * reasonably sure all the right data is present.
+ */
+ GregorianCalendar calendar = xmlCalendar.toGregorianCalendar();
+ return calendar.getTime();
+ } catch (QueryEvaluationException e) {
+ throw new FatalException("Error evaluating query", e);
+ }
+ }
+
+ /**
* Passed to execute to setup the accept header and parse the response. Its
* super ultra mega important to parse the response in execute because
* execute manages closing the http response object. If execute return the
--
To view, visit https://gerrit.wikimedia.org/r/203882
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I87616ab6cf63bee8496996e6d8d331f0d9edfe93
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits