Addshore has uploaded a new change for review.
https://gerrit.wikimedia.org/r/301657
Change subject: WIP Create WikidataSpecialEntityDataMetrics
......................................................................
WIP Create WikidataSpecialEntityDataMetrics
Bug: T141525
Change-Id: Ib1b69479456cb5e6c24d0581f566d347309cc1e9
---
A
refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
1 file changed, 157 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source
refs/changes/57/301657/1
diff --git
a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
new file mode 100644
index 0000000..993b4e3
--- /dev/null
+++
b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
@@ -0,0 +1,157 @@
+package org.wikimedia.analytics.refinery.job
+
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.{SparkConf, SparkContext}
+import org.joda.time.DateTime
+import org.wikimedia.analytics.refinery.core.GraphiteClient
+import scopt.OptionParser
+
+/**
+ * Reports metrics for the wikidata Special:EntityData page to graphite
+ *
+ * Usage with spark-submit:
+ * spark-submit \
+ * --class org.wikimedia.analytics.refinery.job.WikidataSpecialEntityDataMetrics \
+ * /path/to/refinery-job.jar \
+ * -y <year> -m <month> -d <day> \
+ * [-t <webrequest-table> -n <graphite-namespace> -g <graphite-host> -p <graphite-port>]
+ */
+object WikidataSpecialEntityDataMetrics {
+
+
+  /**
+   * Holds the job configuration filled in by the scopt command line parser.
+   *
+   * @param webrequestTable   Hive table the webrequest rows are selected from
+   * @param graphiteHost      host the Graphite client connects to
+   * @param graphitePort      port the Graphite client connects to
+   * @param graphiteNamespace prefix prepended to every metric name sent
+   * @param year              year of the day to report on
+   * @param month             month of the day to report on
+   * @param day               day of month to report on
+   */
+  case class Params(
+    webrequestTable: String = "wmf.webrequest",
+    graphiteHost: String = "localhost",
+    graphitePort: Int = 2003,
+    graphiteNamespace: String = "daily.wikidata.entitydata",
+    year: Int = 0,
+    month: Int = 0,
+    day: Int = 0
+  )
+
+  /**
+   * Command line options parser (scopt).
+   *
+   * Optional options fall back to the defaults declared on [[Params]];
+   * year, month and day are required. Month and day are range-checked here
+   * (1-12 and 1-31 respectively); the parser does not check that the
+   * combination is a real calendar date.
+   */
+  val argsParser: OptionParser[Params] = new OptionParser[Params]("Wikidata Special:EntityData Metrics") {
+    head("Wikidata Special:EntityData Metrics", "")
+    note("This job reports use of the wikidata Special:EntityData page to graphite daily")
+    help("help") text ("Prints this usage text")
+
+    opt[String]('t', "webrequest-table") optional() valueName ("<table>") action { (x, p) =>
+      p.copy(webrequestTable = x)
+    } text ("Hive webrequest table to use. Defaults to wmf.webrequest")
+
+    // The value is a host name, not a path.
+    opt[String]('g', "graphite-host") optional() valueName ("<host>") action { (x, p) =>
+      p.copy(graphiteHost = x)
+    } text ("Graphite host. Defaults to localhost")
+
+    opt[Int]('p', "graphite-port") optional() valueName ("<port>") action { (x, p) =>
+      p.copy(graphitePort = x)
+    } text ("Graphite port. Defaults to 2003")
+
+    opt[String]('n', "graphite-namespace") optional() valueName ("<graphite.namespace>") action { (x, p) =>
+      p.copy(graphiteNamespace = x)
+    } text ("graphite metric namespace/prefix. Defaults to daily.wikidata.entitydata")
+
+    opt[Int]('y', "year") required() action { (x, p) =>
+      p.copy(year = x)
+    } text ("Year as an integer")
+
+    // Logical && (short-circuit) rather than the bitwise & on booleans.
+    opt[Int]('m', "month") required() action { (x, p) =>
+      p.copy(month = x)
+    } validate { x =>
+      if (x > 0 && x <= 12) success else failure("Invalid month")
+    } text ("Month as an integer")
+
+    opt[Int]('d', "day") required() action { (x, p) =>
+      p.copy(day = x)
+    } validate { x =>
+      if (x > 0 && x <= 31) success else failure("Invalid day")
+    } text ("Day of month as an integer")
+
+  }
+
+  /**
+   * Job entry point.
+   *
+   * Parses the command line, counts one day of successful (HTTP 200) wikidata
+   * requests to /wiki/Special:EntityData/ grouped by agent type and content
+   * type, and sends the totals to Graphite under
+   * `<namespace>.format.<format>` and `<namespace>.agent_types.<agent type>`,
+   * timestamped at midnight of the requested day.
+   *
+   * Exits with status 1 when the arguments cannot be parsed.
+   *
+   * @param args command line arguments, see [[argsParser]]
+   */
+  def main(args: Array[String]): Unit = {
+    argsParser.parse(args, Params()) match {
+      case Some(params) => {
+        // Initial Spark setup
+        val conf = new SparkConf().setAppName("WikidataSpecialEntityDataMetrics-%d-%d-%d".format(
+          params.year, params.month, params.day))
+        val sc = new SparkContext(conf)
+        val hiveContext = new HiveContext(sc)
+
+        val sql = """
+  SELECT
+    COUNT(1) AS count,
+    agent_type,
+    content_type
+  FROM %s
+  WHERE year = %d
+    AND month = %d
+    AND day = %d
+    AND http_status = 200
+    AND normalized_host.project_class = 'wikidata'
+    AND uri_path rlike '^/wiki/Special:EntityData/.*$'
+  GROUP BY agent_type, content_type
+                  """.format(params.webrequestTable, params.year, params.month, params.day)
+
+        // Column order follows the SELECT list: (count, agent_type, content_type)
+        val data = hiveContext.sql(sql).collect()
+          .map(r => (r.getLong(0), r.getString(1), r.getString(2)))
+
+        // Every row feeds two counters: one keyed by the normalized response
+        // format and one keyed by the agent type. Counts stay Long, matching
+        // r.getLong above, and rows that share a key are summed.
+        val metrics: Map[String, Long] = data.toSeq
+          .filter { case (count, _, _) => count > 0 }
+          .flatMap { case (count, agentType, contentType) =>
+            Seq(
+              ("format." + normalizeFormat(contentType), count),
+              ("agent_types." + agentType, count)
+            )
+          }
+          .groupBy { case (metricName, _) => metricName }
+          .map { case (metricName, counts) => metricName -> counts.map(_._2).sum }
+
+        val graphite = new GraphiteClient(params.graphiteHost, params.graphitePort)
+        val time = new DateTime(params.year, params.month, params.day, 0, 0)
+
+        for ((metricName, count) <- metrics) {
+          val metric = "%s.%s".format(params.graphiteNamespace, metricName)
+          // Graphite expects the timestamp in seconds, DateTime gives milliseconds
+          graphite.sendOnce(metric, count, time.getMillis / 1000)
+        }
+
+      }
+      case None => sys.exit(1)
+    }
+  }
+
+  /**
+   * Maps a response content type to the short format name used in metric keys.
+   *
+   * The content type is matched by substring, so parameters such as
+   * "; charset=UTF-8" do not matter. Fragments are tried in the order listed
+   * and the first match wins.
+   *
+   * @param contentType content type of the response; may be null
+   * @return one of "rdf", "php", "nt", "n3", "json", "ttl", "html", or
+   *         "unknown" when nothing matches or the content type is null
+   */
+  def normalizeFormat(contentType: String): String = {
+    // Ordered (fragment -> format) table; keep the order, first match wins.
+    val formatsByFragment = Seq(
+      "/rdf+xml" -> "rdf",
+      "/vnd.php" -> "php",
+      "/n-triples" -> "nt",
+      "/n3" -> "n3",
+      "/json" -> "json",
+      "/turtle" -> "ttl",
+      "/html" -> "html"
+    )
+
+    // Option(...) turns a null content type into None instead of throwing.
+    Option(contentType)
+      .flatMap { ct =>
+        formatsByFragment.collectFirst { case (fragment, format) if ct.contains(fragment) => format }
+      }
+      .getOrElse("unknown")
+  }
+
+}
--
To view, visit https://gerrit.wikimedia.org/r/301657
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib1b69479456cb5e6c24d0581f566d347309cc1e9
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Addshore <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits