Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/203882

Change subject: WIP: Carefully pick up from where we left off
......................................................................

WIP: Carefully pick up from where we left off

Closes T95194

Change-Id: I87616ab6cf63bee8496996e6d8d331f0d9edfe93
---
M common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
M common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
M tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
4 files changed, 81 insertions(+), 27 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/82/203882/1

diff --git 
a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java 
b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
index 590706a..32d04e9 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
@@ -30,7 +30,7 @@
     /**
      * Wikibase exports dump information with this subject.
      */
-    public static final Object DUMP = NAMESPACE + "Dump";
+    public static final String DUMP = NAMESPACE + "Dump";
 
     /**
      * Predicate for marking Wikibase's Rank.
diff --git 
a/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java 
b/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
index 02bada2..c8e46dd 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/WikibaseUris.java
@@ -10,6 +10,7 @@
     public static WikibaseUris WIKIDATA = new WikibaseUris("www.wikidata.org");
     public static WikibaseUris TEST_WIKIDATA = new 
WikibaseUris("test.wikidata.org");
 
+    private final String root;
     private final String entityData;
     private final String entity;
     private final String statement;
@@ -18,7 +19,7 @@
     private final String qualifier;
 
     public WikibaseUris(String host) {
-        String root = "http://" + host;
+        root = "http://" + host;
         entityData = root + "/wiki/Special:EntityData/";
         entity = root + "/entity/";
         statement = entity + "statement/";
@@ -41,6 +42,13 @@
     }
 
     /**
+     * The root of the wikibase uris - http://www.wikidata.org for Wikidata.
+     */
+    public String root() {
+        return root;
+    }
+
+    /**
      * Prefix wikibase uses for dump information about entities.
      */
     public String entityData() {
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
index 012ebab..de7821d 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
@@ -21,6 +21,7 @@
 import org.openrdf.model.Statement;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.wikidata.query.rdf.common.uri.WikibaseUris;
 import org.wikidata.query.rdf.tool.CliUtils.BasicOptions;
 import org.wikidata.query.rdf.tool.CliUtils.MungerOptions;
 import org.wikidata.query.rdf.tool.CliUtils.WikibaseOptions;
@@ -28,7 +29,6 @@
 import org.wikidata.query.rdf.tool.change.Change.Batch;
 import org.wikidata.query.rdf.tool.change.IdChangeSource;
 import org.wikidata.query.rdf.tool.change.RecentChangesPoller;
-import org.wikidata.query.rdf.common.uri.WikibaseUris;
 import org.wikidata.query.rdf.tool.exception.ContainedException;
 import org.wikidata.query.rdf.tool.exception.RetryableException;
 import org.wikidata.query.rdf.tool.rdf.Munger;
@@ -121,8 +121,8 @@
                 }
             }
         } else {
-            log.info("Checking last update time");
-            Date lastUpdate = rdfRepository.fetchLastUpdate();
+            log.info("Checking where we left off");
+            Date lastUpdate = rdfRepository.fetchLeftOffTime();
             long minStartTime = System.currentTimeMillis() - 
TimeUnit.DAYS.toMillis(30);
             if (lastUpdate == null) {
                 startTime = minStartTime;
diff --git 
a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
index cf47fab..6396b24 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java
@@ -10,9 +10,12 @@
 import java.util.GregorianCalendar;
 import java.util.List;
 import java.util.Locale;
+import java.util.TimeZone;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import javax.xml.datatype.DatatypeConfigurationException;
+import javax.xml.datatype.DatatypeFactory;
 import javax.xml.datatype.XMLGregorianCalendar;
 
 import org.apache.http.Consts;
@@ -39,16 +42,20 @@
 import org.openrdf.query.resultio.binary.BinaryQueryResultParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.wikidata.query.rdf.common.uri.Ontology;
 import org.wikidata.query.rdf.common.uri.Provenance;
 import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
 import org.wikidata.query.rdf.common.uri.WikibaseUris;
 import org.wikidata.query.rdf.tool.exception.ContainedException;
+import org.wikidata.query.rdf.tool.exception.FatalException;
 
 import com.google.common.base.Charsets;
 import com.google.common.io.CharStreams;
 
 public class RdfRepository {
     private static final Logger log = 
LoggerFactory.getLogger(RdfRepository.class);
+
+    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");
 
     private final CloseableHttpClient client = 
HttpClients.custom().setMaxConnPerRoute(100).setMaxConnTotal(100)
             .build();
@@ -189,31 +196,44 @@
     }
 
     /**
-     * Fetch the last wikidata update time.
+     * Fetch where we left off updating the repository.
      *
-     * @return the date or null if there are no update times
+     * @return the date or null if we have nowhere to start from
      */
-    public Date fetchLastUpdate() {
-        // TODO this is very likely inefficient
-        TupleQueryResult result = query("PREFIX schema: 
<http://schema.org/>\nSELECT (MAX(?lastUpdate) as ?maxLastUpdate)\nWHERE { ?s 
schema:dateModified ?lastUpdate . }");
-        try {
-            if (!result.hasNext()) {
-                return null;
-            }
-            Binding maxLastUpdate = result.next().getBinding("maxLastUpdate");
-            if (maxLastUpdate == null) {
-                return null;
-            }
-            XMLGregorianCalendar xmlCalendar = ((Literal) 
maxLastUpdate.getValue()).calendarValue();
-            /*
-             * We convert rather blindly to a GregorianCalendar because we're
-             * reasonably sure all the right data is present.
-             */
-            GregorianCalendar calendar = xmlCalendar.toGregorianCalendar();
-            return calendar.getTime();
-        } catch (QueryEvaluationException e) {
-            throw new RuntimeException("Error evaluating query", e);
+    public Date fetchLeftOffTime() {
+        log.info("Checking for left off time from the updater");
+        StringBuilder b = SchemaDotOrg.prefix(new StringBuilder());
+        b.append("SELECT * WHERE { <").append(uris.root()).append("> schema:dateModified ?date }");
+        Date leftOffTime = dateFromQuery(b.toString());
+        if (leftOffTime != null) {
+            log.info("Found left off time from the updater");
+            return leftOffTime;
         }
+        log.info("Checking for left off time from the dump");
+        b = Ontology.prefix(SchemaDotOrg.prefix(new StringBuilder()));
+        b.append("SELECT * WHERE { ontology:Dump schema:dateModified ?date }");
+        return dateFromQuery(b.toString());
+    }
+
+    /**
+     * Update where we left off so when fetchLeftOffTime is next called it
+     * returns leftOffTime so we can continue from there after the updater is
+     * restarted.
+     */
+    public void updateLeftOffTime(Date leftOffTime) {
+        log.debug("Setting last updated time to {}", leftOffTime);
+        UpdateBuilder b = new UpdateBuilder();
+        b.delete(uris.root(), "schema:dateModified", "?o");
+        b.where(uris.root(), "schema:dateModified", "?o");
+        GregorianCalendar c = new GregorianCalendar(UTC, Locale.ROOT);
+        c.setTime(leftOffTime);
+        try {
+            b.insert(uris.root(), "schema:dateModified", DatatypeFactory.newInstance().newXMLGregorianCalendar(c));
+        } catch (DatatypeConfigurationException e) {
+            throw new FatalException("Holy cow datatype configuration exception on default "
+                    + "datatype factory.  Seems like something really really strange.", e);
+        }
+        execute("update", UPDATE_COUNT_RESPONSE, b.toString());
     }
 
     /**
@@ -283,6 +303,32 @@
     }
 
     /**
+     * Run a query that returns just a date in the "date" binding and return its
+     * result.
+     */
+    private Date dateFromQuery(String query) {
+        TupleQueryResult result = query(query);
+        try {
+            if (!result.hasNext()) {
+                return null;
+            }
+            Binding maxLastUpdate = result.next().getBinding("date");
+            if (maxLastUpdate == null) {
+                return null;
+            }
+            XMLGregorianCalendar xmlCalendar = ((Literal) maxLastUpdate.getValue()).calendarValue();
+            /*
+             * We convert rather blindly to a GregorianCalendar because we're
+             * reasonably sure all the right data is present.
+             */
+            GregorianCalendar calendar = xmlCalendar.toGregorianCalendar();
+            return calendar.getTime();
+        } catch (QueryEvaluationException e) {
+            throw new FatalException("Error evaluating query", e);
+        }
+    }
+
+    /**
      * Passed to execute to setup the accept header and parse the response. Its
      * super ultra mega important to parse the response in execute because
      * execute manages closing the http response object. If execute return the

-- 
To view, visit https://gerrit.wikimedia.org/r/203882
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I87616ab6cf63bee8496996e6d8d331f0d9edfe93
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to