Christopher Johnson (WMDE) has uploaded a new change for review. https://gerrit.wikimedia.org/r/248033
Change subject: adds bulk sparql query and output scripts removes total_views ...................................................................... adds bulk sparql query and output scripts removes total_views Change-Id: Icf4fcff57024ccc74277d042039957d655102ca2 --- D src/site_stats/total_views/README.md D src/site_stats/total_views/create_table.sql D src/site_stats/total_views/export.sh D src/site_stats/total_views/generate.sh A src/sparql/Rcron.sh A src/sparql/bulk_sparql.R A src/sparql/config.R A src/sparql/rdfq.xml 8 files changed, 206 insertions(+), 31 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-wikidata-data refs/changes/33/248033/1 diff --git a/src/site_stats/total_views/README.md b/src/site_stats/total_views/README.md deleted file mode 100644 index 0371a11..0000000 --- a/src/site_stats/total_views/README.md +++ /dev/null @@ -1,5 +0,0 @@ -- - - -site stats total views -==================== - -See https://www.mediawiki.org/wiki/Manual:Site_stats_table \ No newline at end of file diff --git a/src/site_stats/total_views/create_table.sql b/src/site_stats/total_views/create_table.sql deleted file mode 100644 index 796f706..0000000 --- a/src/site_stats/total_views/create_table.sql +++ /dev/null @@ -1,5 +0,0 @@ -CREATE TABLE IF NOT EXISTS wikidata_site_stats_total_views - ( - date DATE NOT NULL, - count BIGINT(20) NOT NULL - ); \ No newline at end of file diff --git a/src/site_stats/total_views/export.sh b/src/site_stats/total_views/export.sh deleted file mode 100755 index ad67de6..0000000 --- a/src/site_stats/total_views/export.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -mysqldefaults="/etc/mysql/conf.d/analytics-research-client.cnf" -mysqlhost="analytics-store.eqiad.wmnet" - -mysql --defaults-file=$mysqldefaults -h $mysqlhost -e "SELECT * FROM staging.wikidata_site_stats_total_views" > data.tsv -cp data.tsv /a/aggregate-datasets/wikidata/site_stats_total_views.tsv diff --git a/src/site_stats/total_views/generate.sh b/src/site_stats/total_views/generate.sh deleted file mode 100755 index 30b9cdf..0000000 --- a/src/site_stats/total_views/generate.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -dateISO=`date --date=today --iso-8601=date` - -total_views=$(mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -se "select ss_total_views from site_stats" wikidatawiki) - -# Start building the SQL -sql='INSERT INTO wikidata_site_stats_total_views (date,count) VALUES ' -sql="$sql ('$dateISO', '$total_views');" - -# Commit the SQL -mysql --defaults-file=/etc/mysql/conf.d/analytics-research-client.cnf -h analytics-store.eqiad.wmnet -A -e "$sql" staging - -echo "All done!" diff --git a/src/sparql/Rcron.sh b/src/sparql/Rcron.sh new file mode 100644 index 0000000..5e210c1 --- /dev/null +++ b/src/sparql/Rcron.sh @@ -0,0 +1,2 @@ +#! /bin/bash +R CMD BATCH /srv/dashboards/shiny-server/wdm/src/scripts/bulk_sparql.R diff --git a/src/sparql/bulk_sparql.R b/src/sparql/bulk_sparql.R new file mode 100644 index 0000000..d9d2190 --- /dev/null +++ b/src/sparql/bulk_sparql.R @@ -0,0 +1,37 @@ +#Bulk Query of WDQS and write to TSV +source("config.R") +#TODO: create output path for analytics instance +#output_path = "/a/aggregate-datasets/wikidata/sparql/" +output_path = "/tmp/sparql/" +qlist <- read_file("rdfq.xml") + +rdfq <- xmlParse(qlist) +queries <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfq:select", c(rdfq = "http://wikiba.se/rdfq#"))) +prefixes <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfq:prefix", c(rdfq = "http://wikiba.se/rdfq#"))) +comments <- xmlToDataFrame(nodes = getNodeSet(rdfq, "//rdfs:comment", c(rdfs = "http://www.w3.org/2000/01/rdf-schema#"))) + +get_sparql_result <- function(uri = wdqs_uri, prefix, query) { + xml_result <- readLines(curl(paste0(uri, prefix, query))) + doc = xmlParse(xml_result) + result = xmlToDataFrame(nodes = getNodeSet(doc, "//sq:literal", c(sq = "http://www.w3.org/2005/sparql-results#"))) + return(result) +} + +write_tsv <- function(result, filename){ + date = Sys.Date() + file_uri <- paste0(output_path, filename) + out = data.frame(date, result) + write.table(out, file=file_uri, append = TRUE, sep = "\t", row.names = FALSE, col.names = FALSE) +} + +bulk_sparql_query <- function(esc_queries) { + for(q in esc_queries) { + result <- get_sparql_result(wdqs_uri, pfx, q) + tsv_filename <- paste0("spql", match(q, esc_queries), ".tsv") + write_tsv(result, tsv_filename) + } +} + +esc_queries <- lapply(queries$text, curl_escape) +pfx <- paste(prefixes$text, collapse="") +bulk_sparql_query (esc_queries) diff --git a/src/sparql/config.R b/src/sparql/config.R new file mode 100644 index 0000000..64af533 --- /dev/null +++ b/src/sparql/config.R @@ -0,0 +1,15 @@ +library(readr) +library(curl) +library(XML) + +base_uri <- "/srv/dashboards/shiny-server/wdm/" +data_uri <- paste0(base_uri, "data/") +sparql_data_uri <- paste0(data_uri, "sparql/") +graphite_api_uri <- "https://graphite.wikimedia.org/render/?target=" +custom_css <- paste0(base_uri, "assets/css/custom.css") +metrics_rdf <- paste0(base_uri, "assets/metrics.owl") + +source_data_uri <- "http://wdm-data.wmflabs.org/data/" +agg_data_uri <- "http://datasets.wikimedia.org/aggregate-datasets/wikidata/" +wdqs_uri <- "https://query.wikidata.org/bigdata/namespace/wdq/sparql?query=" + diff --git a/src/sparql/rdfq.xml b/src/sparql/rdfq.xml new file mode 100644 index 0000000..96487b2 --- /dev/null +++ b/src/sparql/rdfq.xml @@ -0,0 +1,152 @@ +<?xml version="1.0"?> + +<!DOCTYPE rdf:RDF [ + <!ENTITY owl "http://www.w3.org/2002/07/owl#" > + <!ENTITY xsd "http://www.w3.org/2001/XMLSchema#" > + <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#" > + <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#" > +]> + + +<RDF xmlns="http://wikiba.se/rdfq#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"> + <prefixes> + <prefix>PREFIX%20wikibase%3A%20%3Chttp%3A%2F%2Fwikiba.se%2Fontology%23%3E</prefix> + <prefix>PREFIX%20wdt%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fprop%2Fdirect%2F%3E</prefix> + <prefix>PREFIX%20wd%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Fentity%2F%3E</prefix> + <prefix>PREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E</prefix> + <prefix>PREFIX%20prov%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2Fns%2Fprov%23%3E</prefix> + <prefix>PREFIX%20wdref%3A%20%3Chttp%3A%2F%2Fwww.wikidata.org%2Freference%2F%3E</prefix> + </prefixes> + <!-- Globe Coordinate Values --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s ?p wikibase:GlobecoordinateValue}</select> + <rdfs:comment>count Globecoordinate Value</rdfs:comment> + </rdf-query> + + <!-- Time Values --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s a wikibase:TimeValue}</select> + <rdfs:comment>count TimeValue</rdfs:comment> + </rdf-query> + + <!-- Quantity Values --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s a wikibase:QuantityValue}</select> + <rdfs:comment>count QuantityValue</rdfs:comment> + </rdf-query> + + <!-- Preferred Ranks --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:rank wikibase:PreferredRank}</select> + <rdfs:comment>count PreferredRank</rdfs:comment> + </rdf-query> + + <!-- Qualifiers --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:qualifier ?o}</select> + <rdfs:comment>count qualifier</rdfs:comment> + </rdf-query> + + <!-- PropertyType is Wikibase Item --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:WikibaseItem}</select> + <rdfs:comment>count WikibaseItem</rdfs:comment> + </rdf-query> + + <!-- PropertyType is CommonsMedia --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:CommonsMedia}</select> + <rdfs:comment>count Property Type = CommonsMedia</rdfs:comment> + </rdf-query> + + <!-- PropertyType is Monolingualtext --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:Monolingualtext}</select> + <rdfs:comment>count Property Type = Monolingualtext</rdfs:comment> + </rdf-query> + + <!-- PropertyType is Quantity --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:Quantity}</select> + <rdfs:comment>count Property Type = Quantity</rdfs:comment> + </rdf-query> + + <!-- PropertyType is String --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:String}</select> + <rdfs:comment>count Property Type = String</rdfs:comment> + </rdf-query> + + <!-- PropertyType is Time --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:Time}</select> + <rdfs:comment>count Property Type = Time</rdfs:comment> + </rdf-query> + + <!-- PropertyType is URL --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wikibase:propertyType wikibase:Url}</select> + <rdfs:comment>count Property Type = Url</rdfs:comment> + </rdf-query> + + <!-- Wikimedia Categories --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P31 wd:Q4167836}</select> + <rdfs:comment>count instance of Wikimedia Categories</rdfs:comment> + </rdf-query> + + <!-- Commons Categories --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P373 ?o}</select> + <rdfs:comment>count has property Commons Category</rdfs:comment> + </rdf-query> + + <!-- Country of Citizenship --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P27 ?o}</select> + <rdfs:comment>count has property Country of Citizenship</rdfs:comment> + </rdf-query> + + <!-- Given Name --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P735 ?o}</select> + <rdfs:comment>count has property Given Name</rdfs:comment> + </rdf-query> + + <!-- Humans --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P31 wd:Q5}</select> + <rdfs:comment>count instance of Human</rdfs:comment> + </rdf-query> + + <!-- Entities with VIAF --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P214 ?o}</select> + <rdfs:comment>count has property VIAF</rdfs:comment> + </rdf-query> + + <!-- Entities with OCLC --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s wdt:P243 ?o}</select> + <rdfs:comment>count has property OCLC</rdfs:comment> + </rdf-query> + + <!-- Get Property Label --> + <rdf-query> + <select>SELECT distinct ?o WHERE {wd:P735 ?p ?o + SERVICE wikibase:label { + bd:serviceParam wikibase:language "en" . + wd:P735 rdfs:label ?o + } + }</select> + <rdfs:comment>get Property Label e.g P735</rdfs:comment> + </rdf-query> + + <!-- Statements Referenced to Wikipedia (wdref:004ec6fbee857649acdbdbad4f97b2c8571df97b) --> + <rdf-query> + <select>SELECT (count(distinct(?s)) AS ?scount) WHERE {?s prov:wasDerivedFrom wdref:004ec6fbee857649acdbdbad4f97b2c8571df97b}</select> + <rdfs:comment>Statements Referenced to Wikipedia (with GUID x)</rdfs:comment> + </rdf-query> +</RDF> + -- To view, visit https://gerrit.wikimedia.org/r/248033 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icf4fcff57024ccc74277d042039957d655102ca2 Gerrit-PatchSet: 1 Gerrit-Project: analytics/limn-wikidata-data Gerrit-Branch: master Gerrit-Owner: Christopher Johnson (WMDE) <christopher.john...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits