GoranSMilovanovic has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/392209 )
Change subject: Production - stat1005 structure ...................................................................... Production - stat1005 structure Change-Id: I0bf98633c0ab722761a064e5eebee9fb1fb5511e --- A WDCM_Engine.R A WDCM_Ontology/WDCM_Ontology_Berlin_05032017.csv M _installProduction_analytics-wmde.R 3 files changed, 221 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/wmde/WDCM refs/changes/09/392209/1 diff --git a/WDCM_Engine.R b/WDCM_Engine.R new file mode 100644 index 0000000..39a6330 --- /dev/null +++ b/WDCM_Engine.R @@ -0,0 +1,206 @@ +### --------------------------------------------------------------------------- +### --- WDCM Engine, v. Beta 0.1 +### --- Script: WDCM_Engine.R, v. Beta 0.1 +### --- Author: Goran S. Milovanovic, Data Analyst, WMDE +### --- Developed under the contract between Goran Milovanovic PR Data Kolektiv +### --- and WMDE. +### --- Contact: [email protected] +### --------------------------------------------------------------------------- +### --- DESCRIPTION: +### --- WDCM_Engine.R unifies the previous +### --- four WDCM Engine scripts: +### --- WDCM_Collect_Items.R +### --- WDCM_Sqoop_Clients.R +### --- WDCM_Search_Clients.R +### --- WDCM_Pre-Process.R +### --- each section in WDCM_Engine.R provides additional explanation. 
+### --- NOTE: WDCM_Engine.R is exclusively the only WDCM R script +### --- that is run in production from stat1005 (currently) +### --- to produce the WDCM update +### --------------------------------------------------------------------------- +### --- LICENSE: +### --------------------------------------------------------------------------- +### --- GPL v2 +### --- This file is part of Wikidata Concepts Monitor (WDCM) +### --- +### --- WDCM is free software: you can redistribute it and/or modify +### --- it under the terms of the GNU General Public License as published by +### --- the Free Software Foundation, either version 2 of the License, or +### --- (at your option) any later version. +### --- +### --- WDCM is distributed in the hope that it will be useful, +### --- but WITHOUT ANY WARRANTY; without even the implied warranty of +### --- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### --- GNU General Public License for more details. +### --- +### --- You should have received a copy of the GNU General Public License +### --- along with WDCM. If not, see <http://www.gnu.org/licenses/>. +### --------------------------------------------------------------------------- + +### --------------------------------------------------------------------------- +### --- Script 1: WDCM_Collect_Items.R, WDCM Search Module +### --------------------------------------------------------------------------- + +### --------------------------------------------------------------------------- +### --- WDCM Search Module, v. Beta 0.1 +### --- Script: WDCM_Collect_Items.R, v. Beta 0.1 +### --- Author: Goran S. Milovanovic, Data Analyst, WMDE +### --- Developed under the contract between Goran Milovanovic PR Data Kolektiv +### --- and WMDE. 
+### --- Contact: [email protected] +### --------------------------------------------------------------------------- +### --- DESCRIPTION: +### --- WDCM_Collect_Items.R takes a list of concepts (categories) +### --- defined by a given WDCM Ontology (human input) and then +### --- contacts the SPARQL endpoint to fetch all relevant item IDs. +### --------------------------------------------------------------------------- +### --- INPUT: +### --- the WDCM_Collect_Items.R reads the WDCM Ontology file (csv) +### --- from /home/goransm/WMDE/WDCM/WDCM_DataIN/WDCM_Ontology +### --- on wikidataconcepts.wmflabs.org +### --------------------------------------------------------------------------- +### --- OUTPUT: +### --- Results are stored locally as .csv files on the wikidataconcepts Labs instance: +### --- wikidataconcepts.wmflabs.org +### --- in: /home/goransm/WMDE/WDCM/WDCM_DataOUT +### --- These output .csv files migrate to production (stat1005.eqiad.wmnet, currently): +### --- where they are then further processed by the WDCM Search Module (running: +### --- WDCM_Search_Clients.R) +### --------------------------------------------------------------------------- + +### --------------------------------------------------------------------------- +### --- GPL v2 +### --- This file is part of Wikidata Concepts Monitor (WDCM) +### --- +### --- WDCM is free software: you can redistribute it and/or modify +### --- it under the terms of the GNU General Public License as published by +### --- the Free Software Foundation, either version 2 of the License, or +### --- (at your option) any later version. +### --- +### --- WDCM is distributed in the hope that it will be useful, +### --- but WITHOUT ANY WARRANTY; without even the implied warranty of +### --- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +### --- GNU General Public License for more details. +### --- +### --- You should have received a copy of the GNU General Public License +### --- along with WDCM. 
If not, see <http://www.gnu.org/licenses/>. +### --------------------------------------------------------------------------- + + +### --- Setup + +library(httr) +library(stringr) +library(XML) +library(readr) +library(data.table) + +### ---- Read WDCM_Ontology from: /WDCM_Ontology + +wDir <- '/WDCM_Ontology' +setwd(wDir) +wdcmOntology <- read.csv("WDCM_Ontology_Berlin_05032017.csv", + header = T, + check.names = F, + stringsAsFactors = F) + +### --- Select all instances across all sub-classes of searchItems: + +# - endPoint: +endPointURL <- "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=xml&query=" + +# - track the number of items fetched: +totalN <- numeric() + +# - set output dir: +outDir <- './WDCM_CollectedItems' +setwd(outDir) + +# - track uncompleted queries: +qErrors <- character() + +for (i in 1:length(wdcmOntology$CategoryItems)) { + + searchItems <- str_trim( + strsplit(wdcmOntology$CategoryItems[i], + split = ",", fixed = T)[[1]], + "both") + + itemsOut <- list() + + for (k in 1:length(searchItems)) { + + # - Construct Query: + query <- paste0( + 'PREFIX wd: <http://www.wikidata.org/entity/> ', + 'PREFIX wdt: <http://www.wikidata.org/prop/direct/> ', + 'SELECT ?item WHERE { ?item wdt:P31/wdt:P279* wd:', + searchItems[k], + '. 
}' + ) + + # Run Query: + res <- GET(paste0(endPointURL, URLencode(query))) + + # If res$status_code == 200, store: + + if (res$status_code == 200) { + + # XML: + rc <- rawToChar(res$content) + rc <- htmlParse(rc) + + # clear: + rm(res); gc() + + # extract: + items <- xpathSApply(rc, "//uri", xmlValue) + items <- unname(sapply(items, function(x) { + strsplit(x, split = "/", fixed = T)[[1]][length(strsplit(x, split = "/", fixed = T)[[1]])] + })) + labels <- xpathSApply(rc, "//literal", xmlValue) + + # - as.data.frame: + items <- data.frame(item = items, + label = labels, + stringsAsFactors = F) + + # - to itemsOut: + itemsOut[[k]] <- items + + # - clear: + rm(items); gc() + + } else { + qErrors <- append(qErrors, searchItems[k]) + } + + } + + # - itemsOut as data.frame: + itemsOut <- rbindlist(itemsOut) + + # - keep only unique items: + w <- which(!(duplicated(itemsOut$item))) + itemsOut <- itemsOut[w] + + # store as CSV + write_csv(itemsOut, path = paste0(wdcmOntology$Category[i],"_ItemIDs.csv")) + + # total number of concepts ++: + totalN <- append(totalN, length(itemsOut$item)) + + # clear: + rm(itemsOut); gc() + + # Wait 30 secs before processing and running the next query: + Sys.sleep(30) + +} + +# - log uncompleted queries: +# - set log dir: +outDir <- './WDCM_Logs' +setwd(outDir) +write.csv(qErrors, "WDCM_CollectItems_SPARQL_Errors.csv") + diff --git a/WDCM_Ontology/WDCM_Ontology_Berlin_05032017.csv b/WDCM_Ontology/WDCM_Ontology_Berlin_05032017.csv new file mode 100644 index 0000000..5c44db0 --- /dev/null +++ b/WDCM_Ontology/WDCM_Ontology_Berlin_05032017.csv @@ -0,0 +1,15 @@ +Category,CategoryItems,WikidataDescription +Human,Q5,human +Wikimedia_Internal,"Q4167836, Q4167410, Q11266439","Wikimedia category, Wikimedia disambiguation page, wikimedia template" +Work Of Art,Q838948,work of art +Scientific Article,Q13442814,scientific article +Book,Q571,book +Geographical Object,Q618123,geographical object +Organization,"Q783794, Q5971060, Q43229","company, club, 
organization" +Architectural Structure,"Q4989906, Q41176","monument, building" +Gene,Q7187,gene +Chemical Entities,"Q11344, Q11173, Q79529","chemical element, chemical compound, chemical substance" +Astronomical Object,Q6999,astronomical object +Taxon,Q16521,taxon +Event,Q1656682,event +Thoroughfare,Q83620,thoroughfare diff --git a/_installProduction_analytics-wmde.R b/_installProduction_analytics-wmde.R index d973370..e5fee13 100644 --- a/_installProduction_analytics-wmde.R +++ b/_installProduction_analytics-wmde.R @@ -8,7 +8,6 @@ http_proxy = "http://webproxy.eqiad.wmnet:8080", https_proxy = "http://webproxy.eqiad.wmnet:8080") - # - fPath: where the scripts is run from? fPath <- as.character(commandArgs(trailingOnly = FALSE)[4]) fPath <- gsub("--file=", "", fPath, fixed = T) @@ -17,7 +16,6 @@ paste(fPath[1:length(fPath) - 1], collapse = "/"), "/", sep = "") - # - find out whether the fPath/r-library directory exists # - YES: delete it and mkdir, NO: mkdir only -- To view, visit https://gerrit.wikimedia.org/r/392209 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0bf98633c0ab722761a064e5eebee9fb1fb5511e Gerrit-PatchSet: 1 Gerrit-Project: analytics/wmde/WDCM Gerrit-Branch: master Gerrit-Owner: GoranSMilovanovic <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
