Deskana has submitted this change and it was merged.

Change subject: Move Hive-related code into functions in common.R
......................................................................


Move Hive-related code into functions in common.R

With the WDQS code we're now duplicating hive-related functionality
(deterministically generating WHERE clauses based on dates, running
queries) between wdqs.R and api.R. This patch moves that code into
functions within common.R for trivial reuse.

Change-Id: Idf0fea9e5a7fd9242e57e5d8d28fae483a54d765
---
M common.R
M search/api.R
M wdqs/basic_usage.R
3 files changed, 34 insertions(+), 25 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved
  Deskana: Verified; Looks good to me, approved



diff --git a/common.R b/common.R
index 5a39578..5329801 100644
--- a/common.R
+++ b/common.R
@@ -29,3 +29,29 @@
     write.table(x, file, append = FALSE, sep = "\t", row.names = FALSE)
   }
 }
+
+# date_clause; provided with a date, it generates an appropriate set of WHERE clauses for HDFS partitioning.
+date_clause <- function(date){
+  return(paste0(" WHERE year = ", lubridate::year(date),
+                " AND month = ", lubridate::month(date),
+                " AND day = ", lubridate::day(date), " "))
+  
+}
+
+# query_hive; provided with a Hive query, it writes it out to file and then calls Hive over said file, reading the results
+# and cleaning up after itself nicely when done.
+query_hive <- function(query){
+  
+  # Write query out to tempfile and create tempfile for results.
+  query_dump <- tempfile()
+  cat(query, file = query_dump)
+  results_dump <- tempfile()
+  
+  # Query and read in the results
+  system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ", results_dump))
+  results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, header = TRUE)
+  
+  # Clean up and return
+  file.remove(query_dump, results_dump)
+  return(results)
+}
\ No newline at end of file
diff --git a/search/api.R b/search/api.R
index 6164803..8050abd 100644
--- a/search/api.R
+++ b/search/api.R
@@ -10,11 +10,11 @@
   if(is.null(date)){
     date <- Sys.Date() - 1
   }
-  subquery <- paste0(" WHERE year = ", lubridate::year(date),
-                     " AND month = ", lubridate::month(date),
-                     " AND day = ", lubridate::day(date), " ")
+  
+  # Date subquery
+  subquery <- date_clause(date)
 
-  # Write query and dump to file
+  # Write query and run it
  query <- paste0("ADD JAR /srv/deployment/analytics/refinery/artifacts/refinery-hive.jar;
                    CREATE TEMPORARY FUNCTION search_classify AS
                   'org.wikimedia.analytics.refinery.hive.SearchClassifierUDF';
@@ -25,14 +25,7 @@
                   ", subquery,
                  "AND webrequest_source IN('text','mobile') AND http_status = '200'
                   GROUP BY year, month, day, search_classify(uri_path, uri_query);")
-  query_dump <- tempfile()
-  cat(query, file = query_dump)
-
-  # Query
-  results_dump <- tempfile()
-  system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ", results_dump))
-  results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, header = TRUE)
-  file.remove(query_dump, results_dump)
+  results <- query_hive(query)
 
   # Filter and reformat
   results <- results[complete.cases(results),]
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index 5dc37e6..38ed2ed 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -16,11 +16,9 @@
   if(is.null(date)) {
     date <- Sys.Date() - 1
   }
-  subquery <- paste0(" WHERE year = ", lubridate::year(date),
-                     " AND month = ", lubridate::month(date),
-                     " AND day = ", lubridate::day(date), " ")
+  subquery <- date_clause(date)
 
-  # Write query and dump to file
+  # Write query and run it
   query <- paste0("USE wmf;
                    SELECT year, month, day, uri_path,
                    UPPER(http_status IN('200','304')) as success,
@@ -32,15 +30,7 @@
                    AND uri_path IN('/', '/bigdata/namespace/wdq/sparql')
                    GROUP BY year, month, day, uri_path,
                    UPPER(http_status IN('200','304'));")
-                   
-  query_dump <- tempfile()
-  cat(query, file = query_dump)
-
-  # Query
-  results_dump <- tempfile()
-  system(paste0("export HADOOP_HEAPSIZE=1024 && hive -f ", query_dump, " > ", results_dump))
-  results <- read.delim(results_dump, sep = "\t", quote = "", as.is = TRUE, header = TRUE)
-  file.remove(query_dump, results_dump)
+  results <- query_hive(query)
 
  output <- data.frame(timestamp = as.Date(paste(results$year, results$month, results$day, sep = "-")),
                        path = results$uri_path,

-- 
To view, visit https://gerrit.wikimedia.org/r/236202
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Idf0fea9e5a7fd9242e57e5d8d28fae483a54d765
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>
Gerrit-Reviewer: Deskana <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to