OliverKeyes has uploaded a new change for review.
https://gerrit.wikimedia.org/r/236202
Change subject: Move Hive-related code into functions in common.R
......................................................................
Move Hive-related code into functions in common.R
With the WDQS code we're now duplicating hive-related functionality
(deterministically generating WHERE clauses based on dates, running
queries) between wdqs.R and api.R. This patch moves that code into
functions within common.R for trivial reuse.
Change-Id: Idf0fea9e5a7fd9242e57e5d8d28fae483a54d765
---
M common.R
M search/api.R
M wdqs/basic_usage.R
3 files changed, 34 insertions(+), 25 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden
refs/changes/02/236202/1
diff --git a/common.R b/common.R
index 5a39578..5329801 100644
--- a/common.R
+++ b/common.R
@@ -29,3 +29,29 @@
write.table(x, file, append = FALSE, sep = "\t", row.names = FALSE)
}
}
+
+# date_clause; provided with a date it generates an appropriate set of WHERE
clauses for HDFS partitioning.
+date_clause <- function(date){
+ return(paste0(" WHERE year = ", lubridate::year(date),
+ " AND month = ", lubridate::month(date),
+ " AND day = ", lubridate::day(date), " "))
+
+}
+
+# query_hive; provided with a hive query it writes it out to file and then
calls Hive over said file, reading the results
+# and cleaning up after itself nicely when done.
+query_hive <- function(query){
+
+ # Write query out to tempfile and create tempfile for results.
+ query_dump <- tempfile()
+ cat(query, file = query_dump)
+ results_dump <- tempfile()
+
+ # Query and read in the results
+ system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ",
results_dump))
+ results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE,
header = TRUE)
+
+ # Clean up and return
+ file.remove(query_dump, results_dump)
+ return(results)
+}
\ No newline at end of file
diff --git a/search/api.R b/search/api.R
index 6164803..8050abd 100644
--- a/search/api.R
+++ b/search/api.R
@@ -10,11 +10,11 @@
if(is.null(date)){
date <- Sys.Date() - 1
}
- subquery <- paste0(" WHERE year = ", lubridate::year(date),
- " AND month = ", lubridate::month(date),
- " AND day = ", lubridate::day(date), " ")
+
+ # Date subquery
+ subquery <- date_clause(date)
- # Write query and dump to file
+ # Write query and run it
query <- paste0("ADD JAR
/srv/deployment/analytics/refinery/artifacts/refinery-hive.jar;
CREATE TEMPORARY FUNCTION search_classify AS
'org.wikimedia.analytics.refinery.hive.SearchClassifierUDF';
@@ -25,14 +25,7 @@
", subquery,
"AND webrequest_source IN('text','mobile') AND http_status =
'200'
GROUP BY year, month, day, search_classify(uri_path,
uri_query);")
- query_dump <- tempfile()
- cat(query, file = query_dump)
-
- # Query
- results_dump <- tempfile()
- system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ",
results_dump))
- results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE,
header = TRUE)
- file.remove(query_dump, results_dump)
+ results <- query_hive(query)
# Filter and reformat
results <- results[complete.cases(results),]
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index 5dc37e6..38ed2ed 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -16,11 +16,9 @@
if(is.null(date)) {
date <- Sys.Date() - 1
}
- subquery <- paste0(" WHERE year = ", lubridate::year(date),
- " AND month = ", lubridate::month(date),
- " AND day = ", lubridate::day(date), " ")
+ subquery <- date_clause(date)
- # Write query and dump to file
+ # Write query and run it
query <- paste0("USE wmf;
SELECT year, month, day, uri_path,
UPPER(http_status IN('200','304')) as success,
@@ -32,15 +30,7 @@
AND uri_path IN('/', '/bigdata/namespace/wdq/sparql')
GROUP BY year, month, day, uri_path,
UPPER(http_status IN('200','304'));")
-
- query_dump <- tempfile()
- cat(query, file = query_dump)
-
- # Query
- results_dump <- tempfile()
- system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ",
results_dump))
- results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE,
header = TRUE)
- file.remove(query_dump, results_dump)
+ results <- query_hive(query)
output <- data.frame(timestamp = as.Date(paste(results$year, results$month,
results$day, sep = "-")),
path = results$uri_path,
--
To view, visit https://gerrit.wikimedia.org/r/236202
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Idf0fea9e5a7fd9242e57e5d8d28fae483a54d765
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits