Christopher Johnson (WMDE) has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/248033

Change subject: adds bulk sparql query and output scripts removes total_views
......................................................................

adds bulk sparql query and output scripts
removes total_views

Change-Id: Icf4fcff57024ccc74277d042039957d655102ca2
---
D src/site_stats/total_views/README.md
D src/site_stats/total_views/create_table.sql
D src/site_stats/total_views/export.sh
D src/site_stats/total_views/generate.sh
A src/sparql/Rcron.sh
A src/sparql/bulk_sparql.R
A src/sparql/config.R
A src/sparql/rdfq.xml
8 files changed, 206 insertions(+), 31 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-wikidata-data 
refs/changes/33/248033/1

diff --git a/src/site_stats/total_views/README.md 
b/src/site_stats/total_views/README.md
deleted file mode 100644
index 0371a11..0000000
--- a/src/site_stats/total_views/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-- - -
-site stats total views
-====================
-
-See https://www.mediawiki.org/wiki/Manual:Site_stats_table
\ No newline at end of file
diff --git a/src/site_stats/total_views/create_table.sql 
b/src/site_stats/total_views/create_table.sql
deleted file mode 100644
index 796f706..0000000
--- a/src/site_stats/total_views/create_table.sql
+++ /dev/null
@@ -1,5 +0,0 @@
-CREATE TABLE IF NOT EXISTS wikidata_site_stats_total_views
-  (
-     date          DATE NOT NULL,
-     count   BIGINT(20) NOT NULL
-  );
\ No newline at end of file
diff --git a/src/site_stats/total_views/export.sh 
b/src/site_stats/total_views/export.sh
deleted file mode 100755
index ad67de6..0000000
--- a/src/site_stats/total_views/export.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-mysqldefaults="/etc/mysql/conf.d/analytics-research-client.cnf"
-mysqlhost="analytics-store.eqiad.wmnet"
-
-mysql --defaults-file=$mysqldefaults -h $mysqlhost -e "SELECT * FROM 
staging.wikidata_site_stats_total_views" > data.tsv
-cp data.tsv /a/aggregate-datasets/wikidata/site_stats_total_views.tsv
diff --git a/src/site_stats/total_views/generate.sh 
b/src/site_stats/total_views/generate.sh
deleted file mode 100755
index 30b9cdf..0000000
--- a/src/site_stats/total_views/generate.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-dateISO=`date --date=today --iso-8601=date`
-
-total_views=$(mysql 
--defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h 
analytics-store.eqiad.wmnet -A -se "select ss_total_views from site_stats" 
wikidatawiki)
-
-# Start building the SQL
-sql='INSERT INTO wikidata_site_stats_total_views (date,count) VALUES '
-sql="$sql ('$dateISO', '$total_views');"
-
-# Commit the SQL
-mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h 
analytics-store.eqiad.wmnet -A -e "$sql" staging
-
-echo "All done!"
diff --git a/src/sparql/Rcron.sh b/src/sparql/Rcron.sh
new file mode 100644
index 0000000..5e210c1
--- /dev/null
+++ b/src/sparql/Rcron.sh
@@ -0,0 +1,2 @@
+#! /bin/bash
+R CMD BATCH /srv/dashboards/shiny-server/wdm/src/scripts/bulk_sparql.R
diff --git a/src/sparql/bulk_sparql.R b/src/sparql/bulk_sparql.R
new file mode 100644
index 0000000..d9d2190
--- /dev/null
+++ b/src/sparql/bulk_sparql.R
@@ -0,0 +1,37 @@
+#Bulk Query of WDQS and write to TSV
+source("config.R")
+#TODO: create output path for analytics instance
+#output_path = "/a/aggregate-datasets/wikidata/sparql/"
+output_path = "/tmp/sparql/"
+qlist <- read_file("rdfq.xml")
+
+rdfq <- xmlParse(qlist)
+queries <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfq:select", c(rdfq = 
"http://wikiba.se/rdfq#")))
+prefixes <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfq:prefix", c(rdfq = 
"http://wikiba.se/rdfq#")))
+comments <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfs:comment", c(rdfs = 
"http://www.w3.org/2000/01/rdf-schema#")))
+
+get_sparql_result <- function(uri = wdqs_uri, prefix, query) {
+  xml_result <- readLines(curl(paste0(uri, prefix, query)))
+  doc = xmlParse(xml_result)
+  result = xmlToDataFrame(nodes = getNodeSet(doc, "//sq:literal", c(sq = 
"http://www.w3.org/2005/sparql-results#")))
+  return(result)
+}
+
+write_tsv <- function(result, filename){
+  date = Sys.Date()
+  file_uri <- paste0(output_path, filename)
+  out = data.frame(date, result)
+  write.table(out, file=file_uri, append = TRUE, sep = "\t", row.names = 
FALSE, col.names = FALSE)
+}
+
+bulk_sparql_query <- function(esc_queries) {
+  for(q in esc_queries) {
+    result <- get_sparql_result(wdqs_uri, pfx, q)
+    tsv_filename <- paste0("spql", match(q, esc_queries), ".tsv")
+    write_tsv(result, tsv_filename)
+  }
+}
+
+esc_queries <- lapply(queries$text, curl_escape)
+pfx <- paste(prefixes$text, collapse="")
+bulk_sparql_query (esc_queries)
diff --git a/src/sparql/config.R b/src/sparql/config.R
new file mode 100644
index 0000000..64af533
--- /dev/null
+++ b/src/sparql/config.R
@@ -0,0 +1,15 @@
+library(readr)
+library(curl)
+library(XML)
+
+base_uri <- "/srv/dashboards/shiny-server/wdm/"
+data_uri <- paste0(base_uri, "data/")
+sparql_data_uri <- paste0(data_uri, "sparql/")
+graphite_api_uri <- "https://graphite.wikimedia.org/render/?target="
+custom_css <- paste0(base_uri, "assets/css/custom.css")
+metrics_rdf <- paste0(base_uri, "assets/metrics.owl")
+
+source_data_uri <- "http://wdm-data.wmflabs.org/data/"
+agg_data_uri <- "http://datasets.wikimedia.org/aggregate-datasets/wikidata/"
+wdqs_uri <- "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query="
+
diff --git a/src/sparql/rdfq.xml b/src/sparql/rdfq.xml
new file mode 100644
index 0000000..96487b2
--- /dev/null
+++ b/src/sparql/rdfq.xml
@@ -0,0 +1,152 @@
+<?xml version="1.0"?>
+
+<!DOCTYPE rdf:RDF [
+    <!ENTITY owl "http://www.w3.org/2002/07/owl#" >
+    <!ENTITY xsd "http://www.w3.org/2001/XMLSchema#" >
+    <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#" >
+    <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#" >
+]>
+
+
+<RDF xmlns="http://wikiba.se/rdfq#" 
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+     xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
+  <prefixes>
+    
<prefix>PREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E</prefix>
+    
<prefix>PREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E</prefix>
+    
<prefix>PREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E</prefix>
+    
<prefix>PREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E</prefix>
+    
<prefix>PREFIX%20prov%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2Fns%2Fprov%23%3E</prefix>
+    
<prefix>PREFIX%20wdref%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Freference%2F%3E</prefix>
+  </prefixes>
+  <!-- Globe Coordinate Values -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s ?p 
wikibase:GlobecoordinateValue}</select>
+    <rdfs:comment>count Globecoordinate Value</rdfs:comment>
+  </rdf-query>
+
+  <!-- Time Values -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s a 
wikibase:TimeValue}</select>
+    <rdfs:comment>count TimeValue</rdfs:comment>
+  </rdf-query>
+
+  <!-- Quantity Values -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s a 
wikibase:QuantityValue}</select>
+    <rdfs:comment>count QuantityValue</rdfs:comment>
+  </rdf-query>
+
+  <!-- Preferred Ranks -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:rank 
wikibase:PreferredRank}</select>
+    <rdfs:comment>count PreferredRank</rdfs:comment>
+  </rdf-query>
+
+  <!-- Qualifiers -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:qualifier ?o}</select>
+    <rdfs:comment>count qualifier</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is Wikibase Item -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:WikibaseItem}</select>
+    <rdfs:comment>count WikibaseItem</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is CommonsMedia -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:CommonsMedia}</select>
+    <rdfs:comment>count Property Type = CommonsMedia</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is Monolingualtext -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:Monolingualtext}</select>
+    <rdfs:comment>count Property Type = Monolingualtext</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is Quantity -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:Quantity}</select>
+    <rdfs:comment>count Property Type = Quantity</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is String -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:String}</select>
+    <rdfs:comment>count Property Type = String</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is Time -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:Time}</select>
+    <rdfs:comment>count Property Type = Time</rdfs:comment>
+  </rdf-query>
+
+  <!-- PropertyType is URL -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
wikibase:propertyType wikibase:Url}</select>
+    <rdfs:comment>count Property Type = Url</rdfs:comment>
+  </rdf-query>
+
+  <!-- Wikimedia Categories -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P31 
wd:Q4167836}</select>
+    <rdfs:comment>count instance of Wikimedia Categories</rdfs:comment>
+  </rdf-query>
+
+  <!-- Commons Categories -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P373 
?o}</select>
+    <rdfs:comment>count has property Commons Category</rdfs:comment>
+  </rdf-query>
+
+  <!-- Country of Citizenship -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P27 
?o}</select>
+    <rdfs:comment>count has property Country of Citizenship</rdfs:comment>
+  </rdf-query>
+
+  <!-- Given Name -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P735 
?o}</select>
+    <rdfs:comment>count has property Given Name</rdfs:comment>
+  </rdf-query>
+
+  <!-- Humans -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P31 
wd:Q5}</select>
+    <rdfs:comment>count instance of Human</rdfs:comment>
+  </rdf-query>
+
+  <!-- Entities with VIAF -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P214 
?o}</select>
+    <rdfs:comment>count has property VIAF</rdfs:comment>
+  </rdf-query>
+
+  <!-- Entities with OCLC -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P243 
?o}</select>
+    <rdfs:comment>count has property OCLC</rdfs:comment>
+  </rdf-query>
+
+  <!-- Get Property Label -->
+  <rdf-query>
+    <select>SELECT distinct ?o WHERE {wd:P735 ?p ?o
+    SERVICE wikibase:label {
+      bd:serviceParam wikibase:language "en" .
+      wd:P735 rdfs:label ?o
+    }
+  }</select>
+    <rdfs:comment>get Property Label e.g P735</rdfs:comment>
+  </rdf-query>
+
+  <!-- Statements Referenced to Wikipedia 
(wdref:004ec6fbee857649acdbdbad4f97b2c8571df97b) -->
+  <rdf-query>
+    <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s 
prov:wasDerivedFrom wdref:004ec6fbee857649acdbdbad4f97b2c8571df97b}</select>
+    <rdfs:comment>Statements Referenced to Wikipedia (with GUID 
x)</rdfs:comment>
+  </rdf-query>
+</RDF>
+

-- 
To view, visit https://gerrit.wikimedia.org/r/248033
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icf4fcff57024ccc74277d042039957d655102ca2
Gerrit-PatchSet: 1
Gerrit-Project: analytics/limn-wikidata-data
Gerrit-Branch: master
Gerrit-Owner: Christopher Johnson (WMDE) <christopher.john...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to