Addshore has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/301657

Change subject: WIP Create WikidataSpecialEntityDataMetrics
......................................................................

WIP Create WikidataSpecialEntityDataMetrics

Bug: T141525
Change-Id: Ib1b69479456cb5e6c24d0581f566d347309cc1e9
---
A refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
1 file changed, 157 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/57/301657/1

diff --git a/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
new file mode 100644
index 0000000..993b4e3
--- /dev/null
+++ b/refinery-job/src/main/scala/org/wikimedia/analytics/refinery/job/WikidataSpecialEntityDataMetrics.scala
@@ -0,0 +1,157 @@
+package org.wikimedia.analytics.refinery.job
+
+import org.apache.spark.sql.hive.HiveContext
+import org.apache.spark.{SparkConf, SparkContext}
+import org.joda.time.DateTime
+import org.wikimedia.analytics.refinery.core.GraphiteClient
+import scopt.OptionParser
+
+/**
+  * Reports metrics for the wikidata Special:EntityData page to graphite
+  *
+  * Usage with spark-submit:
+  * spark-submit \
+  * --class org.wikimedia.analytics.refinery.job.WikidataSpecialEntityDataMetrics \
+  * /path/to/refinery-job.jar \
+  * -y <year> -m <month> -d <day> \
+  * [-n <namespace> -t <table> -g <graphite-host> -p <graphite-port>]
+  */
+object WikidataSpecialEntityDataMetrics {
+
+  /**
+    * Config class for CLI argument parser using scopt
+    */
+  case class Params(webrequestTable: String = "wmf.webrequest",
+                    graphiteHost: String = "localhost",
+                    graphitePort: Int = 2003,
+                    graphiteNamespace: String = "daily.wikidata.entitydata",
+                    year: Int = 0, month: Int = 0, day: Int = 0)
+
+  /**
+    * Define the command line options parser
+    */
+  val argsParser = new OptionParser[Params]("Wikidata Special:EntityData Metrics") {
+    head("Wikidata Special:EntityData Metrics", "")
+    note("This job reports use of the wikidata Special:EntityData page to graphite daily")
+    help("help") text ("Prints this usage text")
+
+    opt[String]('t', "webrequest-table") optional() valueName ("<table>") action { (x, p) =>
+      p.copy(webrequestTable = x)
+    } text ("Hive webrequest table to use. Defaults to wmf.webrequest")
+
+    opt[String]('g', "graphite-host") optional() valueName ("<host>") action { (x, p) =>
+      p.copy(graphiteHost = x)
+    } text ("Graphite host. Defaults to localhost")
+
+    opt[Int]('p', "graphite-port") optional() valueName ("<port>") action { (x, p) =>
+      p.copy(graphitePort = x)
+    } text ("Graphite port. Defaults to 2003")
+
+    opt[String]('n', "graphite-namespace") optional() valueName ("<graphite.namespace>") action { (x, p) =>
+      p.copy(graphiteNamespace = x)
+    } text ("Graphite metric namespace/prefix. Defaults to daily.wikidata.entitydata")
+
+    opt[Int]('y', "year") required() action { (x, p) =>
+      p.copy(year = x)
+    } text ("Year as an integer")
+
+    opt[Int]('m', "month") required() action { (x, p) =>
+      p.copy(month = x)
+    } validate { x => if (x > 0 && x <= 12) success else failure("Invalid month")
+    } text ("Month as an integer")
+
+    opt[Int]('d', "day") required() action { (x, p) =>
+      p.copy(day = x)
+    } validate { x => if (x > 0 && x <= 31) success else failure("Invalid day")
+    } text ("Day of month as an integer")
+
+  }
+
+  def main(args: Array[String]): Unit = {
+    argsParser.parse(args, Params()) match {
+      case Some(params) => {
+        // Initial Spark setup
+        val conf = new SparkConf().setAppName("WikidataSpecialEntityDataMetrics-%d-%d-%d".format(
+          params.year, params.month, params.day))
+        val sc = new SparkContext(conf)
+        val hiveContext = new HiveContext(sc)
+
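+        // Count Special:EntityData requests for the given day, grouped by agent_type and content_type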
+        val sql = """
+  SELECT
+    COUNT(1) AS count,
+    agent_type,
+    content_type
+  FROM %s
+  WHERE year = %d
+    AND month = %d
+    AND day = %d
+    AND http_status = 200
+    AND normalized_host.project_class = 'wikidata'
+    AND uri_path rlike '^/wiki/Special:EntityData/.*$'
+  GROUP BY agent_type, content_type
+                  """.format(params.webrequestTable, params.year, params.month, params.day)
+
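+        // The grouped result is tiny (agent_type x content_type combinations), so collect it to the driver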
+        val data = hiveContext.sql(sql).collect().map(r => (r.getLong(0), r.getString(1), r.getString(2)))
+        val metrics = scala.collection.mutable.Map[String, Long]()
+
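+        // Aggregate counts into Graphite metric keys, one per format and one per agent type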
+        data.foreach { case (count, agentType, contentType) =>
+          if (count > 0) {
+            val format = normalizeFormat(contentType)
+
+            val formatKey = "format." + format
+            metrics.put(formatKey, metrics.getOrElse(formatKey, 0L) + count)
+
+            val agentTypeKey = "agent_types." + agentType
+            metrics.put(agentTypeKey, metrics.getOrElse(agentTypeKey, 0L) + count)
+          }
+        }
+
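+        // Send each metric to Graphite, timestamped at the start of the processed day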
+        val graphite = new GraphiteClient(params.graphiteHost, params.graphitePort)
+        val time = new DateTime(params.year, params.month, params.day, 0, 0)
+
+        for ((metricName, count) <- metrics) {
+          val metric = "%s.%s".format(params.graphiteNamespace, metricName)
+          graphite.sendOnce(metric, count, time.getMillis / 1000)
+        }
+
+      }
+      case None => sys.exit(1)
+    }
+  }
+
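+  /**
+    * Map a webrequest content_type to the short format name used in metric keys,
+    * returning "unknown" for content types not recognized here.
+    */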
+  def normalizeFormat(contentType: String): String = {
+    if (contentType contains "/rdf+xml") "rdf"
+    else if (contentType contains "/vnd.php") "php"
+    else if (contentType contains "/n-triples") "nt"
+    else if (contentType contains "/n3") "n3"
+    else if (contentType contains "/json") "json"
+    else if (contentType contains "/turtle") "ttl"
+    else if (contentType contains "/html") "html"
+    else "unknown"
+  }
+
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/301657
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib1b69479456cb5e6c24d0581f566d347309cc1e9
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Addshore <addshorew...@gmail.com>
